462 files changed, 38094 insertions, 20566 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 31fa15a2042..c3305ac3dd8 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -9,6 +9,7 @@ endif()
 # External Libraries
 
 include(cmake/external_libs.cmake)
+include(cmake/macros.cmake)
 
 # Build Flags
 # todo: this code could be refactored a bit to avoid duplication
@@ -21,7 +22,11 @@ if(WITH_CYCLES_NATIVE_ONLY)
 	add_definitions(
 		-DWITH_KERNEL_NATIVE
 	)
-	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+
+	if(NOT MSVC)
+		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+		set(CYCLES_KERNEL_FLAGS "-march=native")
+	endif()
 elseif(NOT WITH_CPU_SSE)
 	set(CXX_HAS_SSE FALSE)
 	set(CXX_HAS_AVX FALSE)
@@ -40,57 +45,113 @@ elseif(WIN32 AND MSVC)
 		set(CYCLES_AVX2_ARCH_FLAGS "/arch:SSE2")
 	endif()
 
+	# Unlike GCC/clang we still use fast math, because there is no fine
+	# grained control and the speedup we get here is too big to ignore.
+	set(CYCLES_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+
 	# there is no /arch:SSE3, but intrinsics are available anyway
 	if(CMAKE_CL_64)
-		set(CYCLES_SSE2_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
-		set(CYCLES_SSE3_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
-		set(CYCLES_SSE41_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
-		set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
-		set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+		set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
+		set(CYCLES_SSE3_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
+		set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
+		set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
+		set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
 	else()
-		set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
-		set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
-		set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
-		set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
-		set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+		set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
+		set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
+		set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
+		set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
+		set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
 	endif()
 
-	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CYCLES_KERNEL_FLAGS}")
 	set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Ox")
 	set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Ox")
 	set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /Ox")
-elseif(CMAKE_COMPILER_IS_GNUCC)
+elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
 	check_cxx_compiler_flag(-msse CXX_HAS_SSE)
 	check_cxx_compiler_flag(-mavx CXX_HAS_AVX)
 	check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2)
-	if(CXX_HAS_SSE)
-		set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse")
-		set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse")
-		set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse")
-	endif()
-	if(CXX_HAS_AVX)
-		set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse")
+
+	# Assume no signal trapping for better code generation.
+	set(CYCLES_KERNEL_FLAGS "-fno-trapping-math")
+	# Avoid overhead of setting errno for NaNs.
+	set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-math-errno")
+	# Let compiler optimize 0.0 - x without worrying about signed zeros.
+	set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-signed-zeros")
+
+	if(CMAKE_COMPILER_IS_GNUCC)
+		# Assume no signaling NaNs for better code generation.
+		set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-signaling-nans")
+		# Assume a fixed rounding mode for better constant folding.
+		set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-rounding-math")
 	endif()
-	if(CXX_HAS_AVX2)
-		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c -mfpmath=sse")
+
+	if(CXX_HAS_SSE)
+		if(CMAKE_COMPILER_IS_GNUCC)
+			set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -mfpmath=sse")
+		endif()
+
+		set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -msse -msse2")
+		set(CYCLES_SSE3_KERNEL_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS} -msse3 -mssse3")
+		set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS} -msse4.1")
+		if(CXX_HAS_AVX)
+			set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS} -mavx")
+		endif()
+		if(CXX_HAS_AVX2)
+			set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS} -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
+		endif()
 	endif()
-	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only")
-elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-	check_cxx_compiler_flag(-msse CXX_HAS_SSE)
-	check_cxx_compiler_flag(-mavx CXX_HAS_AVX)
-	check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2)
+
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CYCLES_KERNEL_FLAGS}")
+elseif(WIN32 AND CMAKE_CXX_COMPILER_ID MATCHES "Intel")
+	check_cxx_compiler_flag(/QxSSE2 CXX_HAS_SSE)
+	check_cxx_compiler_flag(/arch:AVX CXX_HAS_AVX)
+	check_cxx_compiler_flag(/QxCORE-AVX2 CXX_HAS_AVX2)
+
 	if(CXX_HAS_SSE)
-		set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2")
-		set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3")
-		set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1")
+		set(CYCLES_SSE2_KERNEL_FLAGS "/QxSSE2")
+		set(CYCLES_SSE3_KERNEL_FLAGS "/QxSSSE3")
+		set(CYCLES_SSE41_KERNEL_FLAGS "/QxSSE4.1")
+
+		if(CXX_HAS_AVX)
+			set(CYCLES_AVX_KERNEL_FLAGS "/arch:AVX")
+		endif()
+
+		if(CXX_HAS_AVX2)
+			set(CYCLES_AVX2_KERNEL_FLAGS "/QxCORE-AVX2")
+		endif()
 	endif()
-	if(CXX_HAS_AVX)
-		set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx")
+elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
+	if(APPLE)
+		# ICC does not support SSE2 flag on MacOSX
+		check_cxx_compiler_flag(-xssse3 CXX_HAS_SSE)
+	else()
+		check_cxx_compiler_flag(-xsse2 CXX_HAS_SSE)
 	endif()
-	if(CXX_HAS_AVX2)
-		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
+
+	check_cxx_compiler_flag(-xavx CXX_HAS_AVX)
+	check_cxx_compiler_flag(-xcore-avx2 CXX_HAS_AVX2)
+
+	if(CXX_HAS_SSE)
+		if(APPLE)
+			# ICC does not support SSE2 flag on MacOSX
+			set(CYCLES_SSE2_KERNEL_FLAGS "-xssse3")
+		else()
+			set(CYCLES_SSE2_KERNEL_FLAGS "-xsse2")
+		endif()
+
+		set(CYCLES_SSE3_KERNEL_FLAGS "-xssse3")
+		set(CYCLES_SSE41_KERNEL_FLAGS "-xsse4.1")
+
+		if(CXX_HAS_AVX)
+			set(CYCLES_AVX_KERNEL_FLAGS "-xavx")
+		endif()
+
+		if(CXX_HAS_AVX2)
+			set(CYCLES_AVX2_KERNEL_FLAGS "-xcore-avx2")
+		endif()
 	endif()
-	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only")
 endif()
 
 if(CXX_HAS_SSE)
@@ -154,8 +215,13 @@ if(WITH_CYCLES_OPENSUBDIV)
 	)
 endif()
 
-set(WITH_CYCLES_DEVICE_OPENCL TRUE)
-set(WITH_CYCLES_DEVICE_CUDA TRUE)
+if(WITH_CYCLES_STANDALONE)
+	set(WITH_CYCLES_DEVICE_OPENCL TRUE)
+	set(WITH_CYCLES_DEVICE_CUDA TRUE)
+	# Experimental and unfinished.
+	set(WITH_CYCLES_NETWORK FALSE)
+endif()
+# TODO(sergey): Consider removing it, only causes confusion in interface.
 set(WITH_CYCLES_DEVICE_MULTI TRUE)
 
 if(CYCLES_STANDALONE_REPOSITORY)
@@ -186,7 +252,7 @@ endif()
 # Logging capabilities using GLog library.
 if(WITH_CYCLES_LOGGING)
 	add_definitions(-DWITH_CYCLES_LOGGING)
-	add_definitions(-DGOOGLE_GLOG_DLL_DECL=)
+	add_definitions(${GLOG_DEFINES})
 	add_definitions(-DCYCLES_GFLAGS_NAMESPACE=${GFLAGS_NAMESPACE})
 	include_directories(
 		SYSTEM
@@ -200,6 +266,10 @@ if(WITH_CYCLES_DEBUG)
 	add_definitions(-DWITH_CYCLES_DEBUG)
 endif()
 
+if(NOT OPENIMAGEIO_PUGIXML_FOUND)
+	add_definitions(-DWITH_SYSTEM_PUGIXML)
+endif()
+
 include_directories(
 	SYSTEM
 	${BOOST_INCLUDE_DIR}
@@ -226,6 +296,24 @@ if(CMAKE_COMPILER_IS_GNUCXX)
 	unset(_has_no_error_unused_macros)
 endif()
 
+# nvcc only works with MSVC versions below a limit that depends on the CUDA
+# toolkit version; with a newer MSVC, build kernels through cycles_cubin_cc.
+if(MSVC AND WITH_CYCLES_CUDA_BINARIES AND (NOT WITH_CYCLES_CUBIN_COMPILER))
+	set(MAX_MSVC 1800)
+	if("${CUDA_VERSION}" VERSION_EQUAL "8.0")
+		set(MAX_MSVC 1900)
+	elseif("${CUDA_VERSION}" VERSION_EQUAL "9.0")
+		set(MAX_MSVC 1910)
+	elseif("${CUDA_VERSION}" VERSION_EQUAL "9.1")
+		set(MAX_MSVC 1911)
+	endif()
+	if(NOT MSVC_VERSION LESS ${MAX_MSVC})
+		message(STATUS "nvcc not supported for this compiler version, using cycles_cubin_cc instead.")
+		set(WITH_CYCLES_CUBIN_COMPILER ON)
+	endif()
+	unset(MAX_MSVC)
+endif()
+
 
 # Subdirectories
 
@@ -238,7 +326,7 @@ if(WITH_CYCLES_NETWORK)
 	add_definitions(-DWITH_NETWORK)
 endif()
 
-if(WITH_CYCLES_STANDALONE OR WITH_CYCLES_NETWORK)
+if(WITH_CYCLES_STANDALONE OR WITH_CYCLES_NETWORK OR WITH_CYCLES_CUBIN_COMPILER)
 	add_subdirectory(app)
 endif()
 
diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt
index 8cd499b7ca6..d1f86a5fe7d 100644
--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@@ -1,14 +1,6 @@
 
 set(INC
-	.
-	../bvh
-	../device
-	../graph
-	../kernel
-	../kernel/svm
-	../render
-	../subd
-	../util
+	..
 )
 set(INC_SYS
 )
@@ -43,18 +35,15 @@ if(WITH_CYCLES_OSL)
 	list(APPEND LIBRARIES cycles_kernel_osl)
 endif()
 
-if(CYCLES_STANDALONE_REPOSITORY)
-	if(WITH_CYCLES_LOGGING)
-		list(APPEND LIBRARIES
-			${GLOG_LIBRARIES}
-			${GFLAGS_LIBRARIES}
-		)
-	endif()
-else()
+if(NOT CYCLES_STANDALONE_REPOSITORY)
 	list(APPEND LIBRARIES bf_intern_glew_mx bf_intern_guardedalloc)
-	if(WITH_CYCLES_LOGGING)
-		list(APPEND LIBRARIES extern_glog extern_gflags)
-	endif()
+endif()
+
+if(WITH_CYCLES_LOGGING)
+	list(APPEND LIBRARIES
+		${GLOG_LIBRARIES}
+		${GFLAGS_LIBRARIES}
+	)
 endif()
 
 if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI)
@@ -131,3 +120,32 @@ if(WITH_CYCLES_NETWORK)
 	endif()
 	unset(SRC)
 endif()
+
+if(WITH_CYCLES_CUBIN_COMPILER)
+	# nvrtc is not supported on x86, so cycles_cubin_cc is not built for 32 bit
+	# Windows and precompiled CUDA binaries are disabled for such builds.
+	if(MSVC AND NOT CMAKE_CL_64)
+		message("Building with CUDA not supported on 32 bit, skipped")
+		# Plain set() stays in this directory scope; override the (assumed cached) option.
+		set(WITH_CYCLES_CUDA_BINARIES OFF CACHE BOOL "" FORCE)
+	else()
+		set(SRC
+			cycles_cubin_cc.cpp
+		)
+		set(INC
+			../../../extern/cuew/include
+		)
+		add_executable(cycles_cubin_cc ${SRC})
+		include_directories(${INC})
+		target_link_libraries(cycles_cubin_cc
+			extern_cuew
+			${OPENIMAGEIO_LIBRARIES}
+			${PLATFORM_LINKLIBS}
+		)
+		if(NOT CYCLES_STANDALONE_REPOSITORY)
+			target_link_libraries(cycles_cubin_cc bf_intern_guardedalloc)
+		endif()
+		unset(SRC)
+		unset(INC)
+	endif()
+endif()
diff --git a/intern/cycles/app/cycles_cubin_cc.cpp b/intern/cycles/app/cycles_cubin_cc.cpp
new file mode 100644
index 00000000000..3c83bf2dae3
--- /dev/null
+++ b/intern/cycles/app/cycles_cubin_cc.cpp
@@ -0,0 +1,296 @@
+/*
+ * Copyright 2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include <OpenImageIO/argparse.h>
+#include <OpenImageIO/filesystem.h>
+
+#include "cuew.h"
+
+#ifdef _MSC_VER
+# include <Windows.h>
+#endif
+
+using std::string;
+using std::vector;
+
+namespace std {
+	template<typename T>
+	std::string to_string(const T &n) {
+		std::ostringstream s;
+		s << n;
+		return s.str();
+	}
+}
+
+class CompilationSettings
+{
+public:
+	CompilationSettings()
+	: target_arch(0),
+	  bits(64),
+	  verbose(false),
+	  fast_math(false)
+	{}
+
+	string cuda_toolkit_dir;
+	string input_file;
+	string output_file;
+	string ptx_file;
+	vector<string> defines;
+	vector<string> includes;
+	int target_arch;
+	int bits;
+	bool verbose;
+	bool fast_math;
+};
+
+/* Compile settings.input_file to PTX with NVRTC and write it to a temporary
+ * file, whose path is stored in settings.ptx_file. Returns false on failure. */
+bool compile_cuda(CompilationSettings &settings)
+{
+	const char* headers[] = {"stdlib.h" , "float.h", "math.h", "stdio.h"};
+	const char* header_content[] = {"\n", "\n", "\n", "\n"};
+
+	printf("Building %s\n", settings.input_file.c_str());
+
+	string code;
+	if(!OIIO::Filesystem::read_text_file(settings.input_file, code)) {
+		fprintf(stderr, "Error: unable to read %s\n", settings.input_file.c_str());
+		return false;
+	}
+
+	vector<string> options;
+	for(size_t i = 0; i < settings.includes.size(); i++) {
+		options.push_back("-I" + settings.includes[i]);
+	}
+	for(size_t i = 0; i < settings.defines.size(); i++) {
+		options.push_back("-D" + settings.defines[i]);
+	}
+	options.push_back("-D__KERNEL_CUDA_VERSION__=" + std::to_string(cuewNvrtcVersion()));
+	options.push_back("-arch=compute_" + std::to_string(settings.target_arch));
+	options.push_back("--device-as-default-execution-space");
+	if(settings.fast_math)
+		options.push_back("--use_fast_math");
+
+	nvrtcProgram prog;
+	nvrtcResult result = nvrtcCreateProgram(&prog,
+		code.c_str(),                    // buffer
+		NULL,                            // name
+		sizeof(headers) / sizeof(void*), // numHeaders
+		header_content,                  // headers
+		headers);                        // includeNames
+	if(result != NVRTC_SUCCESS) {
+		fprintf(stderr, "Error: nvrtcCreateProgram failed (%x)\n\n", result);
+		return false;
+	}
+
+	/* Transfer options to a classic C array. */
+	vector<const char*> opts(options.size());
+	for(size_t i = 0; i < options.size(); i++) {
+		opts[i] = options[i].c_str();
+	}
+
+	result = nvrtcCompileProgram(prog, options.size(), &opts[0]);
+	if(result != NVRTC_SUCCESS) {
+		fprintf(stderr, "Error: nvrtcCompileProgram failed (%x)\n\n", result);
+		/* Only print the log when its size could be queried and is not zero. */
+		size_t log_size = 0;
+		if(nvrtcGetProgramLogSize(prog, &log_size) == NVRTC_SUCCESS && log_size > 0) {
+			vector<char> log(log_size);
+			nvrtcGetProgramLog(prog, &log[0]);
+			fprintf(stderr, "%s\n", &log[0]);
+		}
+		return false;
+	}
+
+	/* Retrieve the ptx code. */
+	size_t ptx_size;
+	result = nvrtcGetPTXSize(prog, &ptx_size);
+	if(result != NVRTC_SUCCESS) {
+		fprintf(stderr, "Error: nvrtcGetPTXSize failed (%x)\n\n", result);
+		return false;
+	}
+
+	vector<char> ptx_code(ptx_size);
+	result = nvrtcGetPTX(prog, &ptx_code[0]);
+	if(result != NVRTC_SUCCESS) {
+		fprintf(stderr, "Error: nvrtcGetPTX failed (%x)\n\n", result);
+		return false;
+	}
+
+	/* Write a file in the temp folder with the ptx code. */
+	settings.ptx_file = OIIO::Filesystem::temp_directory_path() + "/" + OIIO::Filesystem::unique_path();
+	FILE *f = fopen(settings.ptx_file.c_str(), "wb");
+	if(f == NULL) {
+		fprintf(stderr, "Error: unable to write %s\n", settings.ptx_file.c_str());
+		return false;
+	}
+	const bool written = (fwrite(&ptx_code[0], 1, ptx_size, f) == ptx_size);
+	return (fclose(f) == 0) && written;
+}
+
+/* Assemble settings.ptx_file into settings.output_file by running ptxas, then
+ * delete the intermediate PTX file. Returns false when ptxas reports failure. */
+bool link_ptxas(CompilationSettings &settings)
+{
+	string cudapath = "";
+	if(settings.cuda_toolkit_dir.size())
+		cudapath = settings.cuda_toolkit_dir + "/bin/";
+
+	string ptx = "\"" +cudapath + "ptxas\" " + settings.ptx_file +
+					" -o " + settings.output_file +
+					" --gpu-name sm_" + std::to_string(settings.target_arch) +
+					" -m" + std::to_string(settings.bits);
+
+	if(settings.verbose) {
+		ptx += " --verbose";
+		printf("%s\n", ptx.c_str());
+	}
+
+	const int pxresult = system(ptx.c_str());
+	if(pxresult) {
+		fprintf(stderr, "Error: ptxas failed (%x)\n\n", pxresult);
+	}
+
+	/* The temporary PTX file is no longer needed, whether ptxas succeeded or not. */
+	if(!OIIO::Filesystem::remove(settings.ptx_file)) {
+		fprintf(stderr, "Error: removing %s\n\n", settings.ptx_file.c_str());
+	}
+	return pxresult == 0;
+}
+
+/* Load the NVRTC library through cuew and verify that it is recent enough and
+ * that every NVRTC entry point called by compile_cuda() was resolved. */
+bool init(CompilationSettings &settings)
+{
+#ifdef _MSC_VER
+	/* Add the bin folder of the given toolkit to the DLL search path. */
+	if(settings.cuda_toolkit_dir.size()) {
+		SetDllDirectory((settings.cuda_toolkit_dir + "/bin").c_str());
+	}
+#endif
+
+	int cuewresult = cuewInit(CUEW_INIT_NVRTC);
+	if(cuewresult != CUEW_SUCCESS) {
+		fprintf(stderr, "Error: cuew init failed (0x%x)\n\n", cuewresult);
+		return false;
+	}
+
+	/* Report the NVRTC version, since that is the version being checked. */
+	const int nvrtc_version = cuewNvrtcVersion();
+	if(nvrtc_version < 80) {
+		fprintf(stderr, "Error: only cuda 8 and higher is supported, %d\n\n", nvrtc_version);
+		return false;
+	}
+
+	if(!nvrtcCreateProgram) {
+		fprintf(stderr, "Error: nvrtcCreateProgram not resolved\n");
+		return false;
+	}
+	if(!nvrtcCompileProgram) {
+		fprintf(stderr, "Error: nvrtcCompileProgram not resolved\n");
+		return false;
+	}
+	if(!nvrtcGetProgramLogSize) {
+		fprintf(stderr, "Error: nvrtcGetProgramLogSize not resolved\n");
+		return false;
+	}
+	if(!nvrtcGetProgramLog) {
+		fprintf(stderr, "Error: nvrtcGetProgramLog not resolved\n");
+		return false;
+	}
+	if(!nvrtcGetPTXSize) {
+		fprintf(stderr, "Error: nvrtcGetPTXSize not resolved\n");
+		return false;
+	}
+	if(!nvrtcGetPTX) {
+		fprintf(stderr, "Error: nvrtcGetPTX not resolved\n");
+		return false;
+	}
+
+	return true;
+}
+
+/* Fill settings from the command line. Returns false, after printing an error,
+ * when parsing fails or a required option (-i, -o, -target) is missing. */
+bool parse_parameters(int argc, const char **argv, CompilationSettings &settings)
+{
+	OIIO::ArgParse ap;
+	ap.options("Usage: cycles_cubin_cc [options]",
+		"-target %d", &settings.target_arch, "target shader model",
+		"-m %d", &settings.bits, "Cuda architecture bits",
+		"-i %s", &settings.input_file, "Input source filename",
+		"-o %s", &settings.output_file, "Output cubin filename",
+		"-I %L", &settings.includes, "Add additional includepath",
+		"-D %L", &settings.defines, "Add additional defines",
+		"-v", &settings.verbose, "Use verbose logging",
+		"--use_fast_math", &settings.fast_math, "Use fast math",
+		"-cuda-toolkit-dir %s", &settings.cuda_toolkit_dir, "path to the cuda toolkit directory",
+		NULL);
+
+	if(ap.parse(argc, argv) < 0) {
+		fprintf(stderr, "%s\n", ap.geterror().c_str());
+		ap.usage();
+		return false;
+	}
+
+	if(!settings.output_file.size()) {
+		fprintf(stderr, "Error: Output file not set(-o), required\n\n");
+		return false;
+	}
+	if(!settings.input_file.size()) {
+		fprintf(stderr, "Error: Input file not set(-i), required\n\n");
+		return false;
+	}
+	if(!settings.target_arch) {
+		fprintf(stderr, "Error: target shader model not set (-target), required\n\n");
+		return false;
+	}
+
+	return true;
+}
+
+int main(int argc, const char **argv)
+{
+	CompilationSettings settings;
+
+	if(!parse_parameters(argc, argv, settings)) {
+		fprintf(stderr, "Error: invalid parameters, exiting\n");
+		exit(EXIT_FAILURE);
+	}
+
+	if(!init(settings)) {
+		fprintf(stderr, "Error: initialization error, exiting\n");
+		exit(EXIT_FAILURE);
+	}
+
+	if(!compile_cuda(settings)) {
+		fprintf(stderr, "Error: compilation error, exiting\n");
+		exit(EXIT_FAILURE);
+	}
+
+	if(!link_ptxas(settings)) {
+		exit(EXIT_FAILURE);
+	}
+
+	return 0;
+}
diff --git a/intern/cycles/app/cycles_server.cpp b/intern/cycles/app/cycles_server.cpp
index 4ef9cd070bb..e65b9d769e4 100644
--- a/intern/cycles/app/cycles_server.cpp
+++ b/intern/cycles/app/cycles_server.cpp
@@ -16,15 +16,15 @@
 
 #include <stdio.h>
 
-#include "device.h"
-
-#include "util_args.h"
-#include "util_foreach.h"
-#include "util_path.h"
-#include "util_stats.h"
-#include "util_string.h"
-#include "util_task.h"
-#include "util_logging.h"
+#include "device/device.h"
+
+#include "util/util_args.h"
+#include "util/util_foreach.h"
+#include "util/util_path.h"
+#include "util/util_stats.h"
+#include "util/util_string.h"
+#include "util/util_task.h"
+#include "util/util_logging.h"
 
 using namespace ccl;
 
diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp
index 9816d614a7c..c682744f5fa 100644
--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@ -16,29 +16,29 @@
 
 #include <stdio.h>
 
-#include "buffers.h"
-#include "camera.h"
-#include "device.h"
-#include "scene.h"
-#include "session.h"
-#include "integrator.h"
-
-#include "util_args.h"
-#include "util_foreach.h"
-#include "util_function.h"
-#include "util_logging.h"
-#include "util_path.h"
-#include "util_progress.h"
-#include "util_string.h"
-#include "util_time.h"
-#include "util_transform.h"
-#include "util_version.h"
+#include "render/buffers.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/scene.h"
+#include "render/session.h"
+#include "render/integrator.h"
+
+#include "util/util_args.h"
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_logging.h"
+#include "util/util_path.h"
+#include "util/util_progress.h"
+#include "util/util_string.h"
+#include "util/util_time.h"
+#include "util/util_transform.h"
+#include "util/util_version.h"
 
 #ifdef WITH_CYCLES_STANDALONE_GUI
-#include "util_view.h"
+#include "util/util_view.h"
 #endif
 
-#include "cycles_xml.h"
+#include "app/cycles_xml.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -51,6 +51,7 @@ struct Options {
 	SessionParams session_params;
 	bool quiet;
 	bool show_help, interactive, pause;
+	string output_path;
 } options;
 
 static void session_print(const string& str)
@@ -86,6 +87,34 @@ static void session_print_status()
 	session_print(status);
 }
 
+/* Write 8 bit pixels to options.output_path; returns false when that fails. */
+static bool write_render(const uchar *pixels, int w, int h, int channels)
+{
+	string msg = string_printf("Writing image %s", options.output_path.c_str());
+	session_print(msg);
+
+	ImageOutput *out = ImageOutput::create(options.output_path);
+	if(!out) {
+		return false;
+	}
+
+	ImageSpec spec(w, h, channels, TypeDesc::UINT8);
+	if(!out->open(options.output_path, spec)) {
+		delete out;  /* still owned here, do not leak it */
+		return false;
+	}
+
+	/* Start at the last row with a negative y stride: this flips the image
+	 * vertically to convert between top/bottom conventions. */
+	const bool ok = out->write_image(TypeDesc::UINT8,
+		pixels + (h - 1) * w * channels,
+		AutoStride, -w * channels, AutoStride);
+
+	out->close();
+	delete out;
+	return ok;
+}
+
 static BufferParams& session_buffer_params()
 {
 	static BufferParams buffer_params;
@@ -97,27 +126,9 @@ static BufferParams& session_buffer_params()
 	return buffer_params;
 }
 
-static void session_init()
-{
-	options.session = new Session(options.session_params);
-	options.session->reset(session_buffer_params(), options.session_params.samples);
-	options.session->scene = options.scene;
-
-	if(options.session_params.background && !options.quiet)
-		options.session->progress.set_update_callback(function_bind(&session_print_status));
-#ifdef WITH_CYCLES_STANDALONE_GUI
-	else
-		options.session->progress.set_update_callback(function_bind(&view_redraw));
-#endif
-
-	options.session->start();
-
-	options.scene = NULL;
-}
-
 static void scene_init()
 {
-	options.scene = new Scene(options.scene_params, options.session_params.device);
+	options.scene = new Scene(options.scene_params, options.session->device);
 
 	/* Read XML */
 	xml_read_file(options.scene, options.filepath.c_str());
@@ -136,16 +147,32 @@ static void scene_init()
 	options.scene->camera->compute_auto_viewplane();
 }
 
+static void session_init()
+{
+	options.session_params.write_render_cb = write_render;
+	options.session = new Session(options.session_params);
+
+	if(options.session_params.background && !options.quiet)
+		options.session->progress.set_update_callback(function_bind(&session_print_status));
+#ifdef WITH_CYCLES_STANDALONE_GUI
+	else
+		options.session->progress.set_update_callback(function_bind(&view_redraw));
+#endif
+
+	/* load scene */
+	scene_init();
+	options.session->scene = options.scene;
+
+	options.session->reset(session_buffer_params(), options.session_params.samples);
+	options.session->start();
+}
+
 static void session_exit()
 {
 	if(options.session) {
 		delete options.session;
 		options.session = NULL;
 	}
-	if(options.scene) {
-		delete options.scene;
-		options.scene = NULL;
-	}
 
 	if(options.session_params.background && !options.quiet) {
 		session_print("Finished Rendering.");
@@ -367,7 +394,7 @@ static void options_parse(int argc, const char **argv)
 		"--background", &options.session_params.background, "Render in background, without user interface",
 		"--quiet", &options.quiet, "In background mode, don't print progress messages",
 		"--samples %d", &options.session_params.samples, "Number of samples to render",
-		"--output %s", &options.session_params.output_path, "File path to write output image",
+		"--output %s", &options.output_path, "File path to write output image",
 		"--threads %d", &options.session_params.threads, "CPU Rendering Threads",
 		"--width  %d", &options.width, "Window width in pixel",
 		"--height %d", &options.height, "Window height in pixel",
@@ -430,7 +457,6 @@ static void options_parse(int argc, const char **argv)
 	/* find matching device */
 	DeviceType device_type = Device::type_from_string(devicename.c_str());
 	vector<DeviceInfo>& devices = Device::available_devices();
-	DeviceInfo device_info;
 	bool device_available = false;
 
 	foreach(DeviceInfo& device, devices) {
@@ -467,9 +493,6 @@ static void options_parse(int argc, const char **argv)
 
 	/* For smoother Viewport */
 	options.session_params.start_resolution = 64;
-
-	/* load scene */
-	scene_init();
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp
index 35a30ae683f..a46955322e3 100644
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -20,31 +20,31 @@
 #include <algorithm>
 #include <iterator>
 
-#include "node_xml.h"
-
-#include "background.h"
-#include "camera.h"
-#include "film.h"
-#include "graph.h"
-#include "integrator.h"
-#include "light.h"
-#include "mesh.h"
-#include "nodes.h"
-#include "object.h"
-#include "osl.h"
-#include "shader.h"
-#include "scene.h"
-
-#include "subd_patch.h"
-#include "subd_split.h"
-
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_path.h"
-#include "util_transform.h"
-#include "util_xml.h"
-
-#include "cycles_xml.h"
+#include "graph/node_xml.h"
+
+#include "render/background.h"
+#include "render/camera.h"
+#include "render/film.h"
+#include "render/graph.h"
+#include "render/integrator.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/nodes.h"
+#include "render/object.h"
+#include "render/osl.h"
+#include "render/shader.h"
+#include "render/scene.h"
+
+#include "subd/subd_patch.h"
+#include "subd/subd_split.h"
+
+#include "util/util_foreach.h"
+#include "util/util_path.h"
+#include "util/util_projection.h"
+#include "util/util_transform.h"
+#include "util/util_xml.h"
+
+#include "app/cycles_xml.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -70,9 +70,9 @@ struct XMLReadState : public XMLReader {
 
 /* Attribute Reading */
 
-static bool xml_read_int(int *value, pugi::xml_node node, const char *name)
+static bool xml_read_int(int *value, xml_node node, const char *name)
 {
-	pugi::xml_attribute attr = node.attribute(name);
+	xml_attribute attr = node.attribute(name);
 
 	if(attr) {
 		*value = atoi(attr.value());
@@ -82,9 +82,9 @@ static bool xml_read_int(int *value, pugi::xml_node node, const char *name)
 	return false;
 }
 
-static bool xml_read_int_array(vector<int>& value, pugi::xml_node node, const char *name)
+static bool xml_read_int_array(vector<int>& value, xml_node node, const char *name)
 {
-	pugi::xml_attribute attr = node.attribute(name);
+	xml_attribute attr = node.attribute(name);
 
 	if(attr) {
 		vector<string> tokens;
@@ -99,9 +99,9 @@ static bool xml_read_int_array(vector<int>& value, pugi::xml_node node, const ch
 	return false;
 }
 
-static bool xml_read_float(float *value, pugi::xml_node node, const char *name)
+static bool xml_read_float(float *value, xml_node node, const char *name)
 {
-	pugi::xml_attribute attr = node.attribute(name);
+	xml_attribute attr = node.attribute(name);
 
 	if(attr) {
 		*value = (float)atof(attr.value());
@@ -111,9 +111,9 @@ static bool xml_read_float(float *value, pugi::xml_node node, const char *name)
 	return false;
 }
 
-static bool xml_read_float_array(vector<float>& value, pugi::xml_node node, const char *name)
+static bool xml_read_float_array(vector<float>& value, xml_node node, const char *name)
 {
-	pugi::xml_attribute attr = node.attribute(name);
+	xml_attribute attr = node.attribute(name);
 
 	if(attr) {
 		vector<string> tokens;
@@ -128,7 +128,7 @@ static bool xml_read_float_array(vector<float>& value, pugi::xml_node node, cons
 	return false;
 }
 
-static bool xml_read_float3(float3 *value, pugi::xml_node node, const char *name)
+static bool xml_read_float3(float3 *value, xml_node node, const char *name)
 {
 	vector<float> array;
 
@@ -140,7 +140,7 @@ static bool xml_read_float3(float3 *value, pugi::xml_node node, const char *name
 	return false;
 }
 
-static bool xml_read_float3_array(vector<float3>& value, pugi::xml_node node, const char *name)
+static bool xml_read_float3_array(vector<float3>& value, xml_node node, const char *name)
 {
 	vector<float> array;
 
@@ -154,7 +154,7 @@ static bool xml_read_float3_array(vector<float3>& value, pugi::xml_node node, co
 	return false;
 }
 
-static bool xml_read_float4(float4 *value, pugi::xml_node node, const char *name)
+static bool xml_read_float4(float4 *value, xml_node node, const char *name)
 {
 	vector<float> array;
 
@@ -166,9 +166,9 @@ static bool xml_read_float4(float4 *value, pugi::xml_node node, const char *name
 	return false;
 }
 
-static bool xml_read_string(string *str, pugi::xml_node node, const char *name)
+static bool xml_read_string(string *str, xml_node node, const char *name)
 {
-	pugi::xml_attribute attr = node.attribute(name);
+	xml_attribute attr = node.attribute(name);
 
 	if(attr) {
 		*str = attr.value();
@@ -178,9 +178,9 @@ static bool xml_read_string(string *str, pugi::xml_node node, const char *name)
 	return false;
 }
 
-static bool xml_equal_string(pugi::xml_node node, const char *name, const char *value)
+static bool xml_equal_string(xml_node node, const char *name, const char *value)
 {
-	pugi::xml_attribute attr = node.attribute(name);
+	xml_attribute attr = node.attribute(name);
 
 	if(attr)
 		return string_iequals(attr.value(), value);
@@ -190,7 +190,7 @@ static bool xml_equal_string(pugi::xml_node node, const char *name, const char *
 
 /* Camera */
 
-static void xml_read_camera(XMLReadState& state, pugi::xml_node node)
+static void xml_read_camera(XMLReadState& state, xml_node node)
 {
 	Camera *cam = state.scene->camera;
 
@@ -205,12 +205,12 @@ static void xml_read_camera(XMLReadState& state, pugi::xml_node node)
 	cam->matrix = state.tfm;
 
 	cam->need_update = true;
-	cam->update();
+	cam->update(state.scene);
 }
 
 /* Shader */
 
-static void xml_read_shader_graph(XMLReadState& state, Shader *shader, pugi::xml_node graph_node)
+static void xml_read_shader_graph(XMLReadState& state, Shader *shader, xml_node graph_node)
 {
 	xml_read_node(state, shader, graph_node);
 
@@ -220,7 +220,7 @@ static void xml_read_shader_graph(XMLReadState& state, Shader *shader, pugi::xml
 	XMLReader graph_reader;
 	graph_reader.node_map[ustring("output")] = graph->output();
 
-	for(pugi::xml_node node = graph_node.first_child(); node; node = node.next_sibling()) {
+	for(xml_node node = graph_node.first_child(); node; node = node.next_sibling()) {
 		ustring node_name(node.name());
 
 		if(node_name == "connect") {
@@ -349,7 +349,7 @@ static void xml_read_shader_graph(XMLReadState& state, Shader *shader, pugi::xml
 	shader->tag_update(state.scene);
 }
 
-static void xml_read_shader(XMLReadState& state, pugi::xml_node node)
+static void xml_read_shader(XMLReadState& state, xml_node node)
 {
 	Shader *shader = new Shader();
 	xml_read_shader_graph(state, shader, node);
@@ -358,7 +358,7 @@ static void xml_read_shader(XMLReadState& state, pugi::xml_node node)
 
 /* Background */
 
-static void xml_read_background(XMLReadState& state, pugi::xml_node node)
+static void xml_read_background(XMLReadState& state, xml_node node)
 {
 	/* Background Settings */
 	xml_read_node(state, state.scene->background, node);
@@ -385,7 +385,7 @@ static Mesh *xml_add_mesh(Scene *scene, const Transform& tfm)
 	return mesh;
 }
 
-static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node)
+static void xml_read_mesh(const XMLReadState& state, xml_node node)
 {
 	/* add mesh */
 	Mesh *mesh = xml_add_mesh(state.scene, state.tfm);
@@ -516,7 +516,7 @@ static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node)
 		xml_read_float(&sdparams.dicing_rate, node, "dicing_rate");
 		sdparams.dicing_rate = std::max(0.1f, sdparams.dicing_rate);
 
-		state.scene->camera->update();
+		state.scene->camera->update(state.scene);
 		sdparams.camera = state.scene->camera;
 		sdparams.objecttoworld = state.tfm;
 	}
@@ -531,7 +531,7 @@ static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node)
 
 /* Light */
 
-static void xml_read_light(XMLReadState& state, pugi::xml_node node)
+static void xml_read_light(XMLReadState& state, xml_node node)
 {
 	Light *light = new Light();
 
@@ -543,12 +543,14 @@ static void xml_read_light(XMLReadState& state, pugi::xml_node node)
 
 /* Transform */
 
-static void xml_read_transform(pugi::xml_node node, Transform& tfm)
+static void xml_read_transform(xml_node node, Transform& tfm)
 {
 	if(node.attribute("matrix")) {
 		vector<float> matrix;
-		if(xml_read_float_array(matrix, node, "matrix") && matrix.size() == 16)
-			tfm = tfm * transform_transpose((*(Transform*)&matrix[0]));
+		if(xml_read_float_array(matrix, node, "matrix") && matrix.size() == 16) {
+			ProjectionTransform projection = *(ProjectionTransform*)&matrix[0];
+			tfm = tfm * projection_to_transform(projection_transpose(projection));
+		}
 	}
 
 	if(node.attribute("translate")) {
@@ -572,7 +574,7 @@ static void xml_read_transform(pugi::xml_node node, Transform& tfm)
 
 /* State */
 
-static void xml_read_state(XMLReadState& state, pugi::xml_node node)
+static void xml_read_state(XMLReadState& state, xml_node node)
 {
 	/* read shader */
 	string shadername;
@@ -605,9 +607,9 @@ static void xml_read_state(XMLReadState& state, pugi::xml_node node)
 
 static void xml_read_include(XMLReadState& state, const string& src);
 
-static void xml_read_scene(XMLReadState& state, pugi::xml_node scene_node)
+static void xml_read_scene(XMLReadState& state, xml_node scene_node)
 {
-	for(pugi::xml_node node = scene_node.first_child(); node; node = node.next_sibling()) {
+	for(xml_node node = scene_node.first_child(); node; node = node.next_sibling()) {
 		if(string_iequals(node.name(), "film")) {
 			xml_read_node(state, state.scene->film, node);
 		}
@@ -657,8 +659,8 @@ static void xml_read_scene(XMLReadState& state, pugi::xml_node scene_node)
 static void xml_read_include(XMLReadState& state, const string& src)
 {
 	/* open XML document */
-	pugi::xml_document doc;
-	pugi::xml_parse_result parse_result;
+	xml_document doc;
+	xml_parse_result parse_result;
 
 	string path = path_join(state.base, src);
 	parse_result = doc.load_file(path.c_str());
@@ -667,7 +669,7 @@ static void xml_read_include(XMLReadState& state, const string& src)
 		XMLReadState substate = state;
 		substate.base = path_dirname(path);
 
-		pugi::xml_node cycles = doc.child("cycles");
+		xml_node cycles = doc.child("cycles");
 		xml_read_scene(substate, cycles);
 	}
 	else {
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index b57502b3b14..ae4977aaed0 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -1,12 +1,6 @@
 
 set(INC
-	../graph
-	../render
-	../device
-	../kernel
-	../kernel/svm
-	../util
-	../subd
+	..
 	../../glew-mx
 	../../guardedalloc
 	../../mikktspace
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py
index 235d19e91e8..a2d6262fb20 100644
--- a/intern/cycles/blender/addon/__init__.py
+++ b/intern/cycles/blender/addon/__init__.py
@@ -102,12 +102,21 @@ class CyclesRender(bpy.types.RenderEngine):
         else:
             self.report({'ERROR'}, "OSL support disabled in this build.")
 
+    def update_render_passes(self, scene, srl):
+        engine.register_passes(self, scene, srl)
+
 
 def engine_exit():
     engine.exit()
 
 
+classes = (
+    CyclesRender,
+)
+
+
 def register():
+    from bpy.utils import register_class
     from . import ui
     from . import properties
     from . import presets
@@ -122,12 +131,15 @@ def register():
     properties.register()
     ui.register()
     presets.register()
-    bpy.utils.register_module(__name__)
+
+    for cls in classes:
+        register_class(cls)
 
     bpy.app.handlers.version_update.append(version_update.do_versions)
 
 
 def unregister():
+    from bpy.utils import unregister_class
     from . import ui
     from . import properties
     from . import presets
@@ -138,4 +150,6 @@ def unregister():
     ui.unregister()
     properties.unregister()
     presets.unregister()
-    bpy.utils.unregister_module(__name__)
+
+    for cls in classes:
+        unregister_class(cls)
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index c8c9ef58c52..1f97eff9bd0 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -50,6 +50,24 @@ def _workaround_buggy_drivers():
             _cycles.opencl_disable()
 
 
+def _configure_argument_parser():
+    import argparse
+    parser = argparse.ArgumentParser(description="Cycles Addon argument parser")
+    parser.add_argument("--cycles-resumable-num-chunks",
+                        help="Number of chunks to split sample range into",
+                        default=None)
+    parser.add_argument("--cycles-resumable-current-chunk",
+                        help="Current chunk of samples range to render",
+                        default=None)
+    parser.add_argument("--cycles-resumable-start-chunk",
+                        help="Start chunk to render",
+                        default=None)
+    parser.add_argument("--cycles-resumable-end-chunk",
+                        help="End chunk to render",
+                        default=None)
+    return parser
+
+
 def _parse_command_line():
     import sys
 
@@ -57,25 +75,22 @@ def _parse_command_line():
     if "--" not in argv:
         return
 
-    argv = argv[argv.index("--") + 1:]
-
-    num_resumable_chunks = None
-    current_resumable_chunk = None
-
-    # TODO(sergey): Add some nice error prints if argument is not used properly.
-    idx = 0
-    while idx < len(argv) - 1:
-        arg = argv[idx]
-        if arg == '--cycles-resumable-num-chunks':
-            num_resumable_chunks = int(argv[idx + 1])
-        elif arg == '--cycles-resumable-current-chunk':
-            current_resumable_chunk = int(argv[idx + 1])
-        idx += 1
+    parser = _configure_argument_parser()
+    args, unknown = parser.parse_known_args(argv[argv.index("--") + 1:])
 
-    if num_resumable_chunks is not None and current_resumable_chunk is not None:
-        import _cycles
-        _cycles.set_resumable_chunks(num_resumable_chunks,
-                                     current_resumable_chunk)
+    if args.cycles_resumable_num_chunks is not None:
+        if args.cycles_resumable_current_chunk is not None:
+            import _cycles
+            _cycles.set_resumable_chunk(
+                    int(args.cycles_resumable_num_chunks),
+                    int(args.cycles_resumable_current_chunk))
+        elif args.cycles_resumable_start_chunk is not None and \
+             args.cycles_resumable_end_chunk:
+            import _cycles
+            _cycles.set_resumable_chunk_range(
+                    int(args.cycles_resumable_num_chunks),
+                    int(args.cycles_resumable_start_chunk),
+                    int(args.cycles_resumable_end_chunk))
 
 
 def init():
@@ -190,3 +205,52 @@ def with_network():
 def system_info():
     import _cycles
     return _cycles.system_info()
+
+def register_passes(engine, scene, srl):
+    engine.register_pass(scene, srl, "Combined", 4, "RGBA", 'COLOR')
+
+    if srl.use_pass_z:                     engine.register_pass(scene, srl, "Depth",         1, "Z",    'VALUE')
+    if srl.use_pass_mist:                  engine.register_pass(scene, srl, "Mist",          1, "Z",    'VALUE')
+    if srl.use_pass_normal:                engine.register_pass(scene, srl, "Normal",        3, "XYZ",  'VECTOR')
+    if srl.use_pass_vector:                engine.register_pass(scene, srl, "Vector",        4, "XYZW", 'VECTOR')
+    if srl.use_pass_uv:                    engine.register_pass(scene, srl, "UV",            3, "UVA",  'VECTOR')
+    if srl.use_pass_object_index:          engine.register_pass(scene, srl, "IndexOB",       1, "X",    'VALUE')
+    if srl.use_pass_material_index:        engine.register_pass(scene, srl, "IndexMA",       1, "X",    'VALUE')
+    if srl.use_pass_shadow:                engine.register_pass(scene, srl, "Shadow",        3, "RGB",  'COLOR')
+    if srl.use_pass_ambient_occlusion:     engine.register_pass(scene, srl, "AO",            3, "RGB",  'COLOR')
+    if srl.use_pass_diffuse_direct:        engine.register_pass(scene, srl, "DiffDir",       3, "RGB",  'COLOR')
+    if srl.use_pass_diffuse_indirect:      engine.register_pass(scene, srl, "DiffInd",       3, "RGB",  'COLOR')
+    if srl.use_pass_diffuse_color:         engine.register_pass(scene, srl, "DiffCol",       3, "RGB",  'COLOR')
+    if srl.use_pass_glossy_direct:         engine.register_pass(scene, srl, "GlossDir",      3, "RGB",  'COLOR')
+    if srl.use_pass_glossy_indirect:       engine.register_pass(scene, srl, "GlossInd",      3, "RGB",  'COLOR')
+    if srl.use_pass_glossy_color:          engine.register_pass(scene, srl, "GlossCol",      3, "RGB",  'COLOR')
+    if srl.use_pass_transmission_direct:   engine.register_pass(scene, srl, "TransDir",      3, "RGB",  'COLOR')
+    if srl.use_pass_transmission_indirect: engine.register_pass(scene, srl, "TransInd",      3, "RGB",  'COLOR')
+    if srl.use_pass_transmission_color:    engine.register_pass(scene, srl, "TransCol",      3, "RGB",  'COLOR')
+    if srl.use_pass_subsurface_direct:     engine.register_pass(scene, srl, "SubsurfaceDir", 3, "RGB",  'COLOR')
+    if srl.use_pass_subsurface_indirect:   engine.register_pass(scene, srl, "SubsurfaceInd", 3, "RGB",  'COLOR')
+    if srl.use_pass_subsurface_color:      engine.register_pass(scene, srl, "SubsurfaceCol", 3, "RGB",  'COLOR')
+    if srl.use_pass_emit:                  engine.register_pass(scene, srl, "Emit",          3, "RGB",  'COLOR')
+    if srl.use_pass_environment:           engine.register_pass(scene, srl, "Env",           3, "RGB",  'COLOR')
+
+    crl = srl.cycles
+    if crl.pass_debug_render_time:             engine.register_pass(scene, srl, "Debug Render Time",             1, "X",   'VALUE')
+    if crl.pass_debug_bvh_traversed_nodes:     engine.register_pass(scene, srl, "Debug BVH Traversed Nodes",     1, "X",   'VALUE')
+    if crl.pass_debug_bvh_traversed_instances: engine.register_pass(scene, srl, "Debug BVH Traversed Instances", 1, "X",   'VALUE')
+    if crl.pass_debug_bvh_intersections:       engine.register_pass(scene, srl, "Debug BVH Intersections",       1, "X",   'VALUE')
+    if crl.pass_debug_ray_bounces:             engine.register_pass(scene, srl, "Debug Ray Bounces",             1, "X",   'VALUE')
+    if crl.use_pass_volume_direct:             engine.register_pass(scene, srl, "VolumeDir",                     3, "RGB", 'COLOR')
+    if crl.use_pass_volume_indirect:           engine.register_pass(scene, srl, "VolumeInd",                     3, "RGB", 'COLOR')
+
+    cscene = scene.cycles
+    if crl.use_denoising and crl.denoising_store_passes and not cscene.use_progressive_refine:
+        engine.register_pass(scene, srl, "Denoising Normal",          3, "XYZ", 'VECTOR')
+        engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR')
+        engine.register_pass(scene, srl, "Denoising Albedo",          3, "RGB", 'COLOR')
+        engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR')
+        engine.register_pass(scene, srl, "Denoising Depth",           1, "Z",   'VALUE')
+        engine.register_pass(scene, srl, "Denoising Depth Variance",  1, "Z",   'VALUE')
+        engine.register_pass(scene, srl, "Denoising Shadow A",        3, "XYV", 'VECTOR')
+        engine.register_pass(scene, srl, "Denoising Shadow B",        3, "XYV", 'VECTOR')
+        engine.register_pass(scene, srl, "Denoising Image",           3, "RGB", 'COLOR')
+        engine.register_pass(scene, srl, "Denoising Image Variance",  3, "RGB", 'COLOR')
diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py
index f97b51b629d..17efb00abdb 100644
--- a/intern/cycles/blender/addon/presets.py
+++ b/intern/cycles/blender/addon/presets.py
@@ -32,14 +32,11 @@ class AddPresetIntegrator(AddPresetBase, Operator):
 
     preset_values = [
         "cycles.max_bounces",
-        "cycles.min_bounces",
         "cycles.diffuse_bounces",
         "cycles.glossy_bounces",
         "cycles.transmission_bounces",
         "cycles.volume_bounces",
-        "cycles.transparent_min_bounces",
         "cycles.transparent_max_bounces",
-        "cycles.use_transparent_shadows",
         "cycles.caustics_reflective",
         "cycles.caustics_refractive",
         "cycles.blur_glossy"
@@ -82,12 +79,23 @@ class AddPresetSampling(AddPresetBase, Operator):
     preset_subdir = "cycles/sampling"
 
 
+classes = (
+    AddPresetIntegrator,
+    AddPresetSampling,
+)
+
+
 def register():
-    pass
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
 
 
 def unregister():
-    pass
+    from bpy.utils import unregister_class
+    for cls in classes:
+        unregister_class(cls)
+
 
 if __name__ == "__main__":
     register()
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 5c51f9afc28..8dbd80f3747 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -42,9 +42,14 @@ enum_feature_set = (
     )
 
 enum_displacement_methods = (
-    ('BUMP', "Bump", "Bump mapping to simulate the appearance of displacement"),
-    ('TRUE', "True", "Use true displacement only, requires fine subdivision"),
-    ('BOTH', "Both", "Combination of displacement and bump mapping"),
+    ('BUMP', "Bump Only", "Bump mapping to simulate the appearance of displacement"),
+    ('DISPLACEMENT', "Displacement Only", "Use true displacement of surface only, requires fine subdivision"),
+    ('BOTH', "Displacement and Bump", "Combination of true displacement and bump mapping for finer detail"),
+    )
+
+enum_bvh_layouts = (
+    ('BVH2', "BVH2", "", 1),
+    ('BVH4', "BVH4", "", 2),
     )
 
 enum_bvh_types = (
@@ -205,13 +210,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 name="AA Samples",
                 description="Number of antialiasing samples to render for each pixel",
                 min=1, max=2097151,
-                default=4,
+                default=128,
                 )
         cls.preview_aa_samples = IntProperty(
                 name="AA Samples",
                 description="Number of antialiasing samples to render in the viewport, unlimited if 0",
                 min=0, max=2097151,
-                default=4,
+                default=32,
                 )
         cls.diffuse_samples = IntProperty(
                 name="Diffuse Samples",
@@ -308,17 +313,9 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 description="Adaptively blur glossy shaders after blurry bounces, "
                             "to reduce noise at the cost of accuracy",
                 min=0.0, max=10.0,
-                default=0.0,
+                default=1.0,
                 )
 
-        cls.min_bounces = IntProperty(
-                name="Min Bounces",
-                description="Minimum number of bounces, setting this lower "
-                            "than the maximum enables probabilistic path "
-                            "termination (faster but noisier)",
-                min=0, max=1024,
-                default=3,
-                )
         cls.max_bounces = IntProperty(
                 name="Max Bounces",
                 description="Total maximum number of bounces",
@@ -351,26 +348,12 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 default=0,
                 )
 
-        cls.transparent_min_bounces = IntProperty(
-                name="Transparent Min Bounces",
-                description="Minimum number of transparent bounces, setting "
-                            "this lower than the maximum enables "
-                            "probabilistic path termination (faster but "
-                            "noisier)",
-                min=0, max=1024,
-                default=8,
-                )
         cls.transparent_max_bounces = IntProperty(
                 name="Transparent Max Bounces",
                 description="Maximum number of transparent bounces",
                 min=0, max=1024,
                 default=8,
                 )
-        cls.use_transparent_shadows = BoolProperty(
-                name="Transparent Shadows",
-                description="Use transparency of surfaces for rendering shadows",
-                default=True,
-                )
 
         cls.volume_step_size = FloatProperty(
                 name="Step Size",
@@ -410,6 +393,23 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 default=12,
                 )
 
+        cls.dicing_camera = PointerProperty(
+                name="Dicing Camera",
+                description="Camera to use as reference point when subdividing geometry, useful to avoid crawling "
+                            "artifacts in animations when the scene camera is moving",
+                type=bpy.types.Object,
+                poll=lambda self, obj: obj.type == 'CAMERA',
+                )
+        cls.offscreen_dicing_scale = FloatProperty(
+                name="Offscreen Dicing Scale",
+                description="Multiplier for dicing rate of geometry outside of the camera view. The dicing rate "
+                            "of objects is gradually increased the further they are outside the camera view. "
+                            "Lower values provide higher quality reflections and shadows for off screen objects, "
+                            "while higher values use less memory",
+                min=1.0, soft_max=25.0,
+                default=4.0,
+                )
+
         cls.film_exposure = FloatProperty(
                 name="Exposure",
                 description="Image brightness scale",
@@ -418,9 +418,20 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 )
         cls.film_transparent = BoolProperty(
                 name="Transparent",
-                description="World background is transparent with premultiplied alpha",
+                description="World background is transparent, for compositing the render over another background",
                 default=False,
                 )
+        cls.film_transparent_glass = BoolProperty(
+                name="Transparent Glass",
+                description="Render transmissive surfaces as transparent, for compositing glass over another background",
+                default=False,
+                )
+        cls.film_transparent_roughness = FloatProperty(
+                name="Transparent Roughness Threshold",
+                description="For transparent transmission, keep surfaces with roughness above the threshold opaque",
+                min=0.0, max=1.0,
+                default=0.1,
+                )
 
         # Really annoyingly, we have to keep it around for a few releases,
         # otherwise forward compatibility breaks in really bad manner: CRASH!
@@ -475,7 +486,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                             "higher values will be scaled down to avoid too "
                             "much noise and slow convergence at the cost of accuracy",
                 min=0.0, max=1e8,
-                default=0.0,
+                default=10.0,
                 )
 
         cls.debug_tile_size = IntProperty(
@@ -560,6 +571,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 ('SHADOW', "Shadow", ""),
                 ('NORMAL', "Normal", ""),
                 ('UV', "UV", ""),
+                ('ROUGHNESS', "Roughness", ""),
                 ('EMIT', "Emit", ""),
                 ('ENVIRONMENT', "Environment", ""),
                 ('DIFFUSE', "Diffuse", ""),
@@ -664,9 +676,15 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         cls.debug_use_cpu_sse41 = BoolProperty(name="SSE41", default=True)
         cls.debug_use_cpu_sse3 = BoolProperty(name="SSE3", default=True)
         cls.debug_use_cpu_sse2 = BoolProperty(name="SSE2", default=True)
-        cls.debug_use_qbvh = BoolProperty(name="QBVH", default=True)
+        cls.debug_bvh_layout = EnumProperty(
+                name="BVH Layout",
+                items=enum_bvh_layouts,
+                default='BVH4',
+                )
+        cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False)
 
         cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False)
+        cls.debug_use_cuda_split_kernel = BoolProperty(name="Split Kernel", default=False)
 
         cls.debug_opencl_kernel_type = EnumProperty(
             name="OpenCL Kernel Type",
@@ -693,8 +711,17 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
             update=devices_update_callback
             )
 
+        cls.debug_opencl_kernel_single_program = BoolProperty(
+            name="Single Program",
+            default=True,
+            update=devices_update_callback,
+            )
+
         cls.debug_use_opencl_debug = BoolProperty(name="Debug OpenCL", default=False)
 
+        cls.debug_opencl_mem_limit = IntProperty(name="Memory limit", default=0,
+            description="Artificial limit on OpenCL memory usage in MB (0 to disable limit)")
+
     @classmethod
     def unregister(cls):
         del bpy.types.Scene.cycles
@@ -852,7 +879,7 @@ class CyclesMaterialSettings(bpy.types.PropertyGroup):
                 name="Displacement Method",
                 description="Method to use for the displacement",
                 items=enum_displacement_methods,
-                default='BUMP',
+                default='DISPLACEMENT',
                 )
 
     @classmethod
@@ -1062,7 +1089,7 @@ class CyclesObjectSettings(bpy.types.PropertyGroup):
 
         cls.motion_steps = IntProperty(
                 name="Motion Steps",
-                description="Control accuracy of deformation motion blur, more steps gives more memory usage (actual number of steps is 2^(steps - 1))",
+                description="Control accuracy of motion blur, more steps gives more memory usage (actual number of steps is 2^(steps - 1))",
                 min=1, soft_max=8,
                 default=1,
                 )
@@ -1092,6 +1119,21 @@ class CyclesObjectSettings(bpy.types.PropertyGroup):
                 default=1.0,
                 )
 
+        cls.is_shadow_catcher = BoolProperty(
+                name="Shadow Catcher",
+                description="Only render shadows on this object, for compositing renders into real footage",
+                default=False,
+                )
+
+        cls.is_holdout = BoolProperty(
+                name="Holdout",
+                description="Render objects as a holdout or matte, creating a "
+                            "hole in the image with zero alpha, to fill out in "
+                            "compositing with real footage or another render",
+                default=False,
+                )
+
+
     @classmethod
     def unregister(cls):
         del bpy.types.Object.cycles
@@ -1156,6 +1198,143 @@ class CyclesCurveRenderSettings(bpy.types.PropertyGroup):
     def unregister(cls):
         del bpy.types.Scene.cycles_curves
 
+def update_render_passes(self, context):
+    scene = context.scene
+    rd = scene.render
+    rl = rd.layers.active
+    rl.update_render_passes()
+
+class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
+    @classmethod
+    def register(cls):
+        bpy.types.SceneRenderLayer.cycles = PointerProperty(
+                name="Cycles SceneRenderLayer Settings",
+                description="Cycles SceneRenderLayer Settings",
+                type=cls,
+                )
+        cls.pass_debug_bvh_traversed_nodes = BoolProperty(
+                name="Debug BVH Traversed Nodes",
+                description="Store Debug BVH Traversed Nodes pass",
+                default=False,
+                update=update_render_passes,
+                )
+        cls.pass_debug_bvh_traversed_instances = BoolProperty(
+                name="Debug BVH Traversed Instances",
+                description="Store Debug BVH Traversed Instances pass",
+                default=False,
+                update=update_render_passes,
+                )
+        cls.pass_debug_bvh_intersections = BoolProperty(
+                name="Debug BVH Intersections",
+                description="Store Debug BVH Intersections",
+                default=False,
+                update=update_render_passes,
+                )
+        cls.pass_debug_ray_bounces = BoolProperty(
+                name="Debug Ray Bounces",
+                description="Store Debug Ray Bounces pass",
+                default=False,
+                update=update_render_passes,
+                )
+        cls.pass_debug_render_time = BoolProperty(
+                name="Debug Render Time",
+                description="Render time in milliseconds per sample and pixel",
+                default=False,
+                update=update_render_passes,
+                )
+        cls.use_pass_volume_direct = BoolProperty(
+                name="Volume Direct",
+                description="Deliver direct volumetric scattering pass",
+                default=False,
+                update=update_render_passes,
+                )
+        cls.use_pass_volume_indirect = BoolProperty(
+                name="Volume Indirect",
+                description="Deliver indirect volumetric scattering pass",
+                default=False,
+                update=update_render_passes,
+                )
+
+        cls.use_denoising = BoolProperty(
+                name="Use Denoising",
+                description="Denoise the rendered image",
+                default=False,
+                update=update_render_passes,
+                )
+        cls.denoising_diffuse_direct = BoolProperty(
+                name="Diffuse Direct",
+                description="Denoise the direct diffuse lighting",
+                default=True,
+                )
+        cls.denoising_diffuse_indirect = BoolProperty(
+                name="Diffuse Indirect",
+                description="Denoise the indirect diffuse lighting",
+                default=True,
+                )
+        cls.denoising_glossy_direct = BoolProperty(
+                name="Glossy Direct",
+                description="Denoise the direct glossy lighting",
+                default=True,
+                )
+        cls.denoising_glossy_indirect = BoolProperty(
+                name="Glossy Indirect",
+                description="Denoise the indirect glossy lighting",
+                default=True,
+                )
+        cls.denoising_transmission_direct = BoolProperty(
+                name="Transmission Direct",
+                description="Denoise the direct transmission lighting",
+                default=True,
+                )
+        cls.denoising_transmission_indirect = BoolProperty(
+                name="Transmission Indirect",
+                description="Denoise the indirect transmission lighting",
+                default=True,
+                )
+        cls.denoising_subsurface_direct = BoolProperty(
+                name="Subsurface Direct",
+                description="Denoise the direct subsurface lighting",
+                default=True,
+                )
+        cls.denoising_subsurface_indirect = BoolProperty(
+                name="Subsurface Indirect",
+                description="Denoise the indirect subsurface lighting",
+                default=True,
+                )
+        cls.denoising_strength = FloatProperty(
+                name="Denoising Strength",
+                description="Controls neighbor pixel weighting for the denoising filter (lower values preserve more detail, but aren't as smooth)",
+                min=0.0, max=1.0,
+                default=0.5,
+                )
+        cls.denoising_feature_strength = FloatProperty(
+                name="Denoising Feature Strength",
+                description="Controls removal of noisy image feature passes (lower values preserve more detail, but aren't as smooth)",
+                min=0.0, max=1.0,
+                default=0.5,
+                )
+        cls.denoising_radius = IntProperty(
+                name="Denoising Radius",
+                description="Size of the image area that's used to denoise a pixel (higher values are smoother, but might lose detail and are slower)",
+                min=1, max=25,
+                default=8,
+        )
+        cls.denoising_relative_pca = BoolProperty(
+                name="Relative filter",
+                description="When removing pixels that don't carry information, use a relative threshold instead of an absolute one (can help to reduce artifacts, but might cause detail loss around edges)",
+                default=False,
+        )
+        cls.denoising_store_passes = BoolProperty(
+                name="Store denoising passes",
+                description="Store the denoising feature passes and the noisy image",
+                default=False,
+                update=update_render_passes,
+        )
+
+    @classmethod
+    def unregister(cls):
+        del bpy.types.SceneRenderLayer.cycles
+
 
 class CyclesCurveSettings(bpy.types.PropertyGroup):
     @classmethod
@@ -1230,35 +1409,61 @@ class CyclesPreferences(bpy.types.AddonPreferences):
 
     devices = bpy.props.CollectionProperty(type=CyclesDeviceSettings)
 
-    def get_devices(self):
-        import _cycles
-        # Layout of the device tuples: (Name, Type, Persistent ID)
-        device_list = _cycles.available_devices()
+    def find_existing_device_entry(self, device):
+        """Return the entry in self.devices matching the tuple's ID and type, or None."""
+        for device_entry in self.devices:
+            if device_entry.id == device[2] and device_entry.type == device[1]:
+                return device_entry
+        return None
 
-        cuda_devices = []
-        opencl_devices = []
+
+    def update_device_entries(self, device_list):
+        """Create missing entries for CUDA/OpenCL/CPU devices and refresh changed names."""
         for device in device_list:
-            if not device[1] in {'CUDA', 'OPENCL'}:
+            if device[1] not in {'CUDA', 'OPENCL', 'CPU'}:
                 continue
-
-            entry = None
             # Try to find existing Device entry
-            for dev in self.devices:
-                if dev.id == device[2] and dev.type == device[1]:
-                    entry = dev
-                    break
-            # Create new entry if no existing one was found
+            entry = self.find_existing_device_entry(device)
             if not entry:
+                # Create new entry if no existing one was found
                 entry = self.devices.add()
                 entry.id   = device[2]
                 entry.name = device[0]
                 entry.type = device[1]
+                entry.use  = entry.type != 'CPU'
+            elif entry.name != device[0]:
+                # Update name in case it changed
+                entry.name = device[0]
+
 
-            # Sort entries into lists
+    def get_devices(self):
+        """Return (cuda_devices, opencl_devices); CPU entries are appended to both lists."""
+        import _cycles
+        # Layout of the device tuples: (Name, Type, Persistent ID)
+        device_list = _cycles.available_devices()
+        # Make sure device entries are up to date and not referenced before
+        # we know we don't add new devices. This way we guarantee to not
+        # hold pointers to a resized array.
+        self.update_device_entries(device_list)
+        # Sort entries into lists
+        cuda_devices = []
+        opencl_devices = []
+        cpu_devices = []
+        for device in device_list:
+            entry = self.find_existing_device_entry(device)
+            if entry is None:
+                # update_device_entries() skips other device types, so no
+                # entry exists for them and there is nothing to sort.
+                continue
             if entry.type == 'CUDA':
                 cuda_devices.append(entry)
             elif entry.type == 'OPENCL':
                 opencl_devices.append(entry)
+            elif entry.type == 'CPU':
+                cpu_devices.append(entry)
+        # Extend all GPU devices with CPU.
+        cuda_devices.extend(cpu_devices)
+        opencl_devices.extend(cpu_devices)
         return cuda_devices, opencl_devices
 
 
@@ -1287,14 +1485,14 @@ class CyclesPreferences(bpy.types.AddonPreferences):
         row = layout.row()
 
         if self.compute_device_type == 'CUDA' and cuda_devices:
-            col = row.column(align=True)
+            box = row.box()
             for device in cuda_devices:
-                col.prop(device, "use", text=device.name, toggle=True)
+                box.prop(device, "use", text=device.name)
 
         if self.compute_device_type == 'OPENCL' and opencl_devices:
-            col = row.column(align=True)
+            box = row.box()
             for device in opencl_devices:
-                col.prop(device, "use", text=device.name, toggle=True)
+                box.prop(device, "use", text=device.name)
 
 
     def draw(self, context):
@@ -1314,6 +1512,7 @@ def register():
     bpy.utils.register_class(CyclesCurveSettings)
     bpy.utils.register_class(CyclesDeviceSettings)
     bpy.utils.register_class(CyclesPreferences)
+    bpy.utils.register_class(CyclesRenderLayerSettings)
 
 
 def unregister():
@@ -1329,3 +1528,4 @@ def unregister():
     bpy.utils.unregister_class(CyclesCurveSettings)
     bpy.utils.unregister_class(CyclesDeviceSettings)
     bpy.utils.unregister_class(CyclesPreferences)
+    bpy.utils.unregister_class(CyclesRenderLayerSettings)
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 44af5f7efed..707f8756f6f 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -78,7 +78,7 @@ def use_cuda(context):
 def use_branched_path(context):
     cscene = context.scene.cycles
 
-    return (cscene.progressive == 'BRANCHED_PATH' and not use_opencl(context))
+    return (cscene.progressive == 'BRANCHED_PATH')
 
 
 def use_sample_all_lights(context):
@@ -86,12 +86,10 @@ def use_sample_all_lights(context):
 
     return cscene.sample_all_lights_direct or cscene.sample_all_lights_indirect
 
-def show_device_selection(context):
-    type = get_device_type(context)
-    if type == 'NETWORK':
+def show_device_active(context):
+    cscene = context.scene.cycles
+    if cscene.device != 'GPU':
         return True
-    if not type in {'CUDA', 'OPENCL'}:
-        return False
     return context.user_preferences.addons[__package__].preferences.has_active_device()
 
 
@@ -141,7 +139,7 @@ def draw_samples_info(layout, context):
                       (ao * aa, ml * aa, sss * aa, vol * aa))
 
 
-class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel):
     bl_label = "Sampling"
     bl_options = {'DEFAULT_CLOSED'}
 
@@ -158,7 +156,6 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
 
         row = layout.row()
         sub = row.row()
-        sub.active = get_device_type(context) != 'OPENCL' or use_cpu(context)
         sub.prop(cscene, "progressive", text="")
         row.prop(cscene, "use_square_samples")
 
@@ -186,9 +183,6 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
             sub.label(text="AA Samples:")
             sub.prop(cscene, "aa_samples", text="Render")
             sub.prop(cscene, "preview_aa_samples", text="Preview")
-            sub.separator()
-            sub.prop(cscene, "sample_all_lights_direct")
-            sub.prop(cscene, "sample_all_lights_indirect")
 
             col = split.column()
             sub = col.column(align=True)
@@ -205,8 +199,11 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
             sub.prop(cscene, "subsurface_samples", text="Subsurface")
             sub.prop(cscene, "volume_samples", text="Volume")
 
-        if not (use_opencl(context) and cscene.feature_set != 'EXPERIMENTAL'):
-            layout.row().prop(cscene, "sampling_pattern", text="Pattern")
+            col = layout.column(align=True)
+            col.prop(cscene, "sample_all_lights_direct")
+            col.prop(cscene, "sample_all_lights_indirect")
+
+        layout.row().prop(cscene, "sampling_pattern", text="Pattern")
 
         for rl in scene.render.layers:
             if rl.samples > 0:
@@ -217,7 +214,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
         draw_samples_info(layout, context)
 
 
-class CyclesRender_PT_geometry(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_geometry(CyclesButtonsPanel, Panel):
     bl_label = "Geometry"
     bl_options = {'DEFAULT_CLOSED'}
 
@@ -228,31 +225,32 @@ class CyclesRender_PT_geometry(CyclesButtonsPanel, Panel):
         cscene = scene.cycles
         ccscene = scene.cycles_curves
 
+        row = layout.row()
+        row.label("Volume Sampling:")
+        row = layout.row()
+        row.prop(cscene, "volume_step_size")
+        row.prop(cscene, "volume_max_steps")
+
+        layout.separator()
+
         if cscene.feature_set == 'EXPERIMENTAL':
+            layout.label("Subdivision Rate:")
             split = layout.split()
 
             col = split.column()
-
             sub = col.column(align=True)
-            sub.label("Volume Sampling:")
-            sub.prop(cscene, "volume_step_size")
-            sub.prop(cscene, "volume_max_steps")
+            sub.prop(cscene, "dicing_rate", text="Render")
+            sub.prop(cscene, "preview_dicing_rate", text="Preview")
 
             col = split.column()
+            col.prop(cscene, "offscreen_dicing_scale", text="Offscreen Scale")
+            col.prop(cscene, "max_subdivisions")
 
-            sub = col.column(align=True)
-            sub.label("Subdivision Rate:")
-            sub.prop(cscene, "dicing_rate", text="Render")
-            sub.prop(cscene, "preview_dicing_rate", text="Preview")
-            sub.separator()
-            sub.prop(cscene, "max_subdivisions")
-        else:
-            row = layout.row()
-            row.label("Volume Sampling:")
-            row = layout.row()
-            row.prop(cscene, "volume_step_size")
-            row.prop(cscene, "volume_max_steps")
+            layout.prop(cscene, "dicing_camera")
+
+            layout.separator()
 
+        layout.label("Hair:")
         layout.prop(ccscene, "use_curves", text="Use Hair")
         col = layout.column()
         col.active = ccscene.use_curves
@@ -270,10 +268,10 @@ class CyclesRender_PT_geometry(CyclesButtonsPanel, Panel):
 
         row = col.row()
         row.prop(ccscene, "minimum_width", text="Min Pixels")
-        row.prop(ccscene, "maximum_width", text="Max Ext.")
+        row.prop(ccscene, "maximum_width", text="Max Extension")
 
 
-class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_light_paths(CyclesButtonsPanel, Panel):
     bl_label = "Light Paths"
     bl_options = {'DEFAULT_CLOSED'}
 
@@ -295,8 +293,6 @@ class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel):
         sub = col.column(align=True)
         sub.label("Transparency:")
         sub.prop(cscene, "transparent_max_bounces", text="Max")
-        sub.prop(cscene, "transparent_min_bounces", text="Min")
-        sub.prop(cscene, "use_transparent_shadows", text="Shadows")
 
         col.separator()
 
@@ -309,7 +305,6 @@ class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel):
         sub = col.column(align=True)
         sub.label(text="Bounces:")
         sub.prop(cscene, "max_bounces", text="Max")
-        sub.prop(cscene, "min_bounces", text="Min")
 
         sub = col.column(align=True)
         sub.prop(cscene, "diffuse_bounces", text="Diffuse")
@@ -318,7 +313,7 @@ class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel):
         sub.prop(cscene, "volume_bounces", text="Volume")
 
 
-class CyclesRender_PT_motion_blur(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_motion_blur(CyclesButtonsPanel, Panel):
     bl_label = "Motion Blur"
     bl_options = {'DEFAULT_CLOSED'}
 
@@ -359,7 +354,7 @@ class CyclesRender_PT_motion_blur(CyclesButtonsPanel, Panel):
         row.prop(cscene, "rolling_shutter_duration")
 
 
-class CyclesRender_PT_film(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_film(CyclesButtonsPanel, Panel):
     bl_label = "Film"
 
     def draw(self, context):
@@ -372,16 +367,23 @@ class CyclesRender_PT_film(CyclesButtonsPanel, Panel):
 
         col = split.column()
         col.prop(cscene, "film_exposure")
-        col.prop(cscene, "film_transparent")
-
-        col = split.column()
+        col.separator()
         sub = col.column(align=True)
         sub.prop(cscene, "pixel_filter_type", text="")
         if cscene.pixel_filter_type != 'BOX':
             sub.prop(cscene, "filter_width", text="Width")
 
+        col = split.column()
+        col.prop(cscene, "film_transparent")
+        sub = col.row()
+        sub.prop(cscene, "film_transparent_glass", text="Transparent Glass")
+        sub.active = cscene.film_transparent
+        sub = col.row()
+        sub.prop(cscene, "film_transparent_roughness", text="Roughness Threshold")
+        sub.active = cscene.film_transparent and cscene.film_transparent_glass
+
 
-class CyclesRender_PT_performance(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_performance(CyclesButtonsPanel, Panel):
     bl_label = "Performance"
     bl_options = {'DEFAULT_CLOSED'}
 
@@ -402,6 +404,8 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel):
         sub.enabled = rd.threads_mode == 'FIXED'
         sub.prop(rd, "threads")
 
+        col.separator()
+
         sub = col.column(align=True)
         sub.label(text="Tiles:")
         sub.prop(cscene, "tile_order", text="")
@@ -409,21 +413,17 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel):
         sub.prop(rd, "tile_x", text="X")
         sub.prop(rd, "tile_y", text="Y")
 
-        sub.prop(cscene, "use_progressive_refine")
-
-        subsub = sub.column(align=True)
-        subsub.prop(rd, "use_save_buffers")
-
-        col = split.column(align=True)
-
-        col.label(text="Viewport:")
-        col.prop(cscene, "debug_bvh_type", text="")
-        col.separator()
-        col.prop(cscene, "preview_start_resolution")
+        subsub = sub.column()
+        subsub.active = not rd.use_save_buffers
+        for rl in rd.layers:
+            if rl.cycles.use_denoising:
+                subsub.active = False
+        subsub.prop(cscene, "use_progressive_refine")
 
-        col.separator()
+        col = split.column()
 
         col.label(text="Final Render:")
+        col.prop(rd, "use_save_buffers")
         col.prop(rd, "use_persistent_data", text="Persistent Images")
 
         col.separator()
@@ -436,13 +436,20 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel):
         row.active = not cscene.debug_use_spatial_splits
         row.prop(cscene, "debug_bvh_time_steps")
 
+        col = layout.column()
+        col.label(text="Viewport Resolution:")
+        split = col.split()
+        split.prop(rd, "preview_pixel_size", text="")
+        split.prop(cscene, "preview_start_resolution")
+
 
-class CyclesRender_PT_layer_options(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_layer_options(CyclesButtonsPanel, Panel):
     bl_label = "Layer"
     bl_context = "render_layer"
 
     def draw(self, context):
         layout = self.layout
+        with_freestyle = bpy.app.build_options.freestyle
 
         scene = context.scene
         rd = scene.render
@@ -471,19 +478,26 @@ class CyclesRender_PT_layer_options(CyclesButtonsPanel, Panel):
         col.prop(rl, "use_ao", "Use AO")
         col.prop(rl, "use_solid", "Use Surfaces")
         col.prop(rl, "use_strand", "Use Hair")
+        if with_freestyle:
+            row = col.row()
+            row.prop(rl, "use_freestyle", "Use Freestyle")
+            row.active = rd.use_freestyle
 
 
-class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_layer_passes(CyclesButtonsPanel, Panel):
     bl_label = "Passes"
     bl_context = "render_layer"
     bl_options = {'DEFAULT_CLOSED'}
 
     def draw(self, context):
+        import _cycles
+
         layout = self.layout
 
         scene = context.scene
         rd = scene.render
         rl = rd.layers.active
+        crl = rl.cycles
 
         split = layout.split()
 
@@ -525,16 +539,31 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel):
         row.prop(rl, "use_pass_subsurface_direct", text="Direct", toggle=True)
         row.prop(rl, "use_pass_subsurface_indirect", text="Indirect", toggle=True)
         row.prop(rl, "use_pass_subsurface_color", text="Color", toggle=True)
+        col.label(text="Volume:")
+        row = col.row(align=True)
+        row.prop(crl, "use_pass_volume_direct", text="Direct", toggle=True)
+        row.prop(crl, "use_pass_volume_indirect", text="Indirect", toggle=True)
 
         col.separator()
         col.prop(rl, "use_pass_emit", text="Emission")
         col.prop(rl, "use_pass_environment")
 
-        if hasattr(rd, "debug_pass_type"):
-            layout.prop(rd, "debug_pass_type")
+        if context.scene.cycles.feature_set == 'EXPERIMENTAL':
+            col.separator()
+            sub = col.column()
+            sub.active = crl.use_denoising
+            sub.prop(crl, "denoising_store_passes", text="Denoising")
+
+        col = layout.column()
+        col.prop(crl, "pass_debug_render_time")
+        if _cycles.with_cycles_debug:
+            col.prop(crl, "pass_debug_bvh_traversed_nodes")
+            col.prop(crl, "pass_debug_bvh_traversed_instances")
+            col.prop(crl, "pass_debug_bvh_intersections")
+            col.prop(crl, "pass_debug_ray_bounces")
 
 
-class CyclesRender_PT_views(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_views(CyclesButtonsPanel, Panel):
     bl_label = "Views"
     bl_context = "render_layer"
     bl_options = {'DEFAULT_CLOSED'}
@@ -577,7 +606,73 @@ class CyclesRender_PT_views(CyclesButtonsPanel, Panel):
             row.prop(rv, "camera_suffix", text="")
 
 
-class Cycles_PT_post_processing(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel):
+    """Render-layer panel for the active layer's Cycles denoising properties."""
+    bl_label = "Denoising"
+    bl_context = "render_layer"
+    bl_options = {'DEFAULT_CLOSED'}
+
+    def draw_header(self, context):
+        """Put the active layer's use_denoising property in the panel header."""
+        rd = context.scene.render
+        rl = rd.layers.active
+        crl = rl.cycles
+        layout = self.layout
+
+        layout.prop(crl, "use_denoising", text="")
+
+    def draw(self, context):
+        """Lay out the filter settings followed by the per-pass Direct/Indirect toggles."""
+        layout = self.layout
+
+        scene = context.scene
+        rd = scene.render
+        rl = rd.layers.active
+        crl = rl.cycles
+
+        # Mark the whole panel body inactive while use_denoising is off.
+        layout.active = crl.use_denoising
+
+        split = layout.split()
+
+        col = split.column()
+        sub = col.column(align=True)
+        sub.prop(crl, "denoising_radius", text="Radius")
+        sub.prop(crl, "denoising_strength", slider=True, text="Strength")
+
+        col = split.column()
+        sub = col.column(align=True)
+        sub.prop(crl, "denoising_feature_strength", slider=True, text="Feature Strength")
+        sub.prop(crl, "denoising_relative_pca")
+
+        layout.separator()
+
+        row = layout.row()
+        row.label(text="Diffuse:")
+        sub = row.row(align=True)
+        sub.prop(crl, "denoising_diffuse_direct", text="Direct", toggle=True)
+        sub.prop(crl, "denoising_diffuse_indirect", text="Indirect", toggle=True)
+
+        row = layout.row()
+        row.label(text="Glossy:")
+        sub = row.row(align=True)
+        sub.prop(crl, "denoising_glossy_direct", text="Direct", toggle=True)
+        sub.prop(crl, "denoising_glossy_indirect", text="Indirect", toggle=True)
+
+        row = layout.row()
+        row.label(text="Transmission:")
+        sub = row.row(align=True)
+        sub.prop(crl, "denoising_transmission_direct", text="Direct", toggle=True)
+        sub.prop(crl, "denoising_transmission_indirect", text="Indirect", toggle=True)
+
+        row = layout.row()
+        row.label(text="Subsurface:")
+        sub = row.row(align=True)
+        sub.prop(crl, "denoising_subsurface_direct", text="Direct", toggle=True)
+        sub.prop(crl, "denoising_subsurface_indirect", text="Indirect", toggle=True)
+
+
+class CYCLES_PT_post_processing(CyclesButtonsPanel, Panel):
     bl_label = "Post Processing"
     bl_options = {'DEFAULT_CLOSED'}
 
@@ -596,7 +689,7 @@ class Cycles_PT_post_processing(CyclesButtonsPanel, Panel):
         col.prop(rd, "dither_intensity", text="Dither", slider=True)
 
 
-class CyclesCamera_PT_dof(CyclesButtonsPanel, Panel):
+class CYCLES_CAMERA_PT_dof(CyclesButtonsPanel, Panel):
     bl_label = "Depth of Field"
     bl_context = "data"
 
@@ -647,7 +740,7 @@ class CyclesCamera_PT_dof(CyclesButtonsPanel, Panel):
         sub.prop(ccam, "aperture_ratio", text="Ratio")
 
 
-class Cycles_PT_context_material(CyclesButtonsPanel, Panel):
+class CYCLES_PT_context_material(CyclesButtonsPanel, Panel):
     bl_label = ""
     bl_context = "material"
     bl_options = {'HIDE_HEADER'}
@@ -707,7 +800,7 @@ class Cycles_PT_context_material(CyclesButtonsPanel, Panel):
             split.separator()
 
 
-class CyclesObject_PT_motion_blur(CyclesButtonsPanel, Panel):
+class CYCLES_OBJECT_PT_motion_blur(CyclesButtonsPanel, Panel):
     bl_label = "Motion Blur"
     bl_context = "object"
     bl_options = {'DEFAULT_CLOSED'}
@@ -716,7 +809,7 @@ class CyclesObject_PT_motion_blur(CyclesButtonsPanel, Panel):
     def poll(cls, context):
         ob = context.object
         if CyclesButtonsPanel.poll(context) and ob:
-            if ob.type in {'MESH', 'CURVE', 'CURVE', 'SURFACE', 'FONT', 'META'}:
+            if ob.type in {'MESH', 'CURVE', 'CURVE', 'SURFACE', 'FONT', 'META', 'CAMERA'}:
                 return True
             if ob.dupli_type == 'GROUP' and ob.dupli_group:
                 return True
@@ -748,14 +841,12 @@ class CyclesObject_PT_motion_blur(CyclesButtonsPanel, Panel):
         layout.active = (rd.use_motion_blur and cob.use_motion_blur)
 
         row = layout.row()
-        row.prop(cob, "use_deform_motion", text="Deformation")
-
-        sub = row.row()
-        sub.active = cob.use_deform_motion
-        sub.prop(cob, "motion_steps", text="Steps")
+        if ob.type != 'CAMERA':
+            row.prop(cob, "use_deform_motion", text="Deformation")
+        row.prop(cob, "motion_steps", text="Steps")
 
 
-class CyclesObject_PT_cycles_settings(CyclesButtonsPanel, Panel):
+class CYCLES_OBJECT_PT_cycles_settings(CyclesButtonsPanel, Panel):
     bl_label = "Cycles Settings"
     bl_context = "object"
     bl_options = {'DEFAULT_CLOSED'}
@@ -788,6 +879,10 @@ class CyclesObject_PT_cycles_settings(CyclesButtonsPanel, Panel):
         if ob.type != 'LAMP':
             flow.prop(visibility, "shadow")
 
+        row = layout.row()
+        row.prop(cob, "is_shadow_catcher")
+        row.prop(cob, "is_holdout")
+
         col = layout.column()
         col.label(text="Performance:")
         row = col.row()
@@ -862,7 +957,7 @@ def panel_node_draw(layout, id_data, output_type, input_name):
     return True
 
 
-class CyclesLamp_PT_preview(CyclesButtonsPanel, Panel):
+class CYCLES_LAMP_PT_preview(CyclesButtonsPanel, Panel):
     bl_label = "Preview"
     bl_context = "data"
     bl_options = {'DEFAULT_CLOSED'}
@@ -878,7 +973,7 @@ class CyclesLamp_PT_preview(CyclesButtonsPanel, Panel):
         self.layout.template_preview(context.lamp)
 
 
-class CyclesLamp_PT_lamp(CyclesButtonsPanel, Panel):
+class CYCLES_LAMP_PT_lamp(CyclesButtonsPanel, Panel):
     bl_label = "Lamp"
     bl_context = "data"
 
@@ -932,7 +1027,7 @@ class CyclesLamp_PT_lamp(CyclesButtonsPanel, Panel):
             layout.label(text="Not supported, interpreted as sun lamp")
 
 
-class CyclesLamp_PT_nodes(CyclesButtonsPanel, Panel):
+class CYCLES_LAMP_PT_nodes(CyclesButtonsPanel, Panel):
     bl_label = "Nodes"
     bl_context = "data"
 
@@ -950,7 +1045,7 @@ class CyclesLamp_PT_nodes(CyclesButtonsPanel, Panel):
             layout.prop(lamp, "color")
 
 
-class CyclesLamp_PT_spot(CyclesButtonsPanel, Panel):
+class CYCLES_LAMP_PT_spot(CyclesButtonsPanel, Panel):
     bl_label = "Spot Shape"
     bl_context = "data"
 
@@ -975,7 +1070,7 @@ class CyclesLamp_PT_spot(CyclesButtonsPanel, Panel):
         col.prop(lamp, "show_cone")
 
 
-class CyclesWorld_PT_preview(CyclesButtonsPanel, Panel):
+class CYCLES_WORLD_PT_preview(CyclesButtonsPanel, Panel):
     bl_label = "Preview"
     bl_context = "world"
     bl_options = {'DEFAULT_CLOSED'}
@@ -988,7 +1083,7 @@ class CyclesWorld_PT_preview(CyclesButtonsPanel, Panel):
         self.layout.template_preview(context.world)
 
 
-class CyclesWorld_PT_surface(CyclesButtonsPanel, Panel):
+class CYCLES_WORLD_PT_surface(CyclesButtonsPanel, Panel):
     bl_label = "Surface"
     bl_context = "world"
 
@@ -1005,7 +1100,7 @@ class CyclesWorld_PT_surface(CyclesButtonsPanel, Panel):
             layout.prop(world, "horizon_color", text="Color")
 
 
-class CyclesWorld_PT_volume(CyclesButtonsPanel, Panel):
+class CYCLES_WORLD_PT_volume(CyclesButtonsPanel, Panel):
     bl_label = "Volume"
     bl_context = "world"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1022,7 +1117,7 @@ class CyclesWorld_PT_volume(CyclesButtonsPanel, Panel):
         panel_node_draw(layout, world, 'OUTPUT_WORLD', 'Volume')
 
 
-class CyclesWorld_PT_ambient_occlusion(CyclesButtonsPanel, Panel):
+class CYCLES_WORLD_PT_ambient_occlusion(CyclesButtonsPanel, Panel):
     bl_label = "Ambient Occlusion"
     bl_context = "world"
 
@@ -1047,7 +1142,7 @@ class CyclesWorld_PT_ambient_occlusion(CyclesButtonsPanel, Panel):
         row.prop(light, "distance", text="Distance")
 
 
-class CyclesWorld_PT_mist(CyclesButtonsPanel, Panel):
+class CYCLES_WORLD_PT_mist(CyclesButtonsPanel, Panel):
     bl_label = "Mist Pass"
     bl_context = "world"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1074,7 +1169,7 @@ class CyclesWorld_PT_mist(CyclesButtonsPanel, Panel):
         layout.prop(world.mist_settings, "falloff")
 
 
-class CyclesWorld_PT_ray_visibility(CyclesButtonsPanel, Panel):
+class CYCLES_WORLD_PT_ray_visibility(CyclesButtonsPanel, Panel):
     bl_label = "Ray Visibility"
     bl_context = "world"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1098,7 +1193,7 @@ class CyclesWorld_PT_ray_visibility(CyclesButtonsPanel, Panel):
         flow.prop(visibility, "scatter")
 
 
-class CyclesWorld_PT_settings(CyclesButtonsPanel, Panel):
+class CYCLES_WORLD_PT_settings(CyclesButtonsPanel, Panel):
     bl_label = "Settings"
     bl_context = "world"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1135,11 +1230,11 @@ class CyclesWorld_PT_settings(CyclesButtonsPanel, Panel):
         sub = col.column()
         sub.active = use_cpu(context)
         sub.prop(cworld, "volume_sampling", text="")
-        sub.prop(cworld, "volume_interpolation", text="")
+        col.prop(cworld, "volume_interpolation", text="")
         col.prop(cworld, "homogeneous_volume", text="Homogeneous")
 
 
-class CyclesMaterial_PT_preview(CyclesButtonsPanel, Panel):
+class CYCLES_MATERIAL_PT_preview(CyclesButtonsPanel, Panel):
     bl_label = "Preview"
     bl_context = "material"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1152,7 +1247,7 @@ class CyclesMaterial_PT_preview(CyclesButtonsPanel, Panel):
         self.layout.template_preview(context.material)
 
 
-class CyclesMaterial_PT_surface(CyclesButtonsPanel, Panel):
+class CYCLES_MATERIAL_PT_surface(CyclesButtonsPanel, Panel):
     bl_label = "Surface"
     bl_context = "material"
 
@@ -1168,7 +1263,7 @@ class CyclesMaterial_PT_surface(CyclesButtonsPanel, Panel):
             layout.prop(mat, "diffuse_color")
 
 
-class CyclesMaterial_PT_volume(CyclesButtonsPanel, Panel):
+class CYCLES_MATERIAL_PT_volume(CyclesButtonsPanel, Panel):
     bl_label = "Volume"
     bl_context = "material"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1187,7 +1282,7 @@ class CyclesMaterial_PT_volume(CyclesButtonsPanel, Panel):
         panel_node_draw(layout, mat, 'OUTPUT_MATERIAL', 'Volume')
 
 
-class CyclesMaterial_PT_displacement(CyclesButtonsPanel, Panel):
+class CYCLES_MATERIAL_PT_displacement(CyclesButtonsPanel, Panel):
     bl_label = "Displacement"
     bl_context = "material"
 
@@ -1203,10 +1298,9 @@ class CyclesMaterial_PT_displacement(CyclesButtonsPanel, Panel):
         panel_node_draw(layout, mat, 'OUTPUT_MATERIAL', 'Displacement')
 
 
-class CyclesMaterial_PT_settings(CyclesButtonsPanel, Panel):
+class CYCLES_MATERIAL_PT_settings(CyclesButtonsPanel, Panel):
     bl_label = "Settings"
     bl_context = "material"
-    bl_options = {'DEFAULT_CLOSED'}
 
     @classmethod
     def poll(cls, context):
@@ -1224,41 +1318,53 @@ class CyclesMaterial_PT_settings(CyclesButtonsPanel, Panel):
         col.prop(cmat, "sample_as_light", text="Multiple Importance")
         col.prop(cmat, "use_transparent_shadow")
 
-        if context.scene.cycles.feature_set == 'EXPERIMENTAL':
-            col.separator()
-            col.label(text="Displacement:")
-            col.prop(cmat, "displacement_method", text="")
+        col.separator()
+        col.label(text="Geometry:")
+        col.prop(cmat, "displacement_method", text="")
 
         col = split.column()
         col.label(text="Volume:")
         sub = col.column()
         sub.active = use_cpu(context)
         sub.prop(cmat, "volume_sampling", text="")
-        sub.prop(cmat, "volume_interpolation", text="")
+        col.prop(cmat, "volume_interpolation", text="")
         col.prop(cmat, "homogeneous_volume", text="Homogeneous")
 
-        layout.separator()
+        col.separator()
+        col.prop(mat, "pass_index")
+
+
+class CYCLES_MATERIAL_PT_viewport(CyclesButtonsPanel, Panel):
+    bl_label = "Viewport"
+    bl_context = "material"
+    bl_options = {'DEFAULT_CLOSED'}
+
+    @classmethod
+    def poll(cls, context):
+        return context.material and CyclesButtonsPanel.poll(context)
+
+    def draw(self, context):
+        mat = context.material
+
+        layout = self.layout
         split = layout.split()
 
         col = split.column(align=True)
-        col.label("Viewport Color:")
+        col.label("Color:")
         col.prop(mat, "diffuse_color", text="")
         col.prop(mat, "alpha")
 
         col.separator()
-        col.label("Viewport Alpha:")
+        col.label("Alpha:")
         col.prop(mat.game_settings, "alpha_blend", text="")
 
         col = split.column(align=True)
-        col.label("Viewport Specular:")
+        col.label("Specular:")
         col.prop(mat, "specular_color", text="")
         col.prop(mat, "specular_hardness", text="Hardness")
 
-        col.separator()
-        col.prop(mat, "pass_index")
 
-
-class CyclesTexture_PT_context(CyclesButtonsPanel, Panel):
+class CYCLES_TEXTURE_PT_context(CyclesButtonsPanel, Panel):
     bl_label = ""
     bl_context = "texture"
     bl_options = {'HIDE_HEADER'}
@@ -1299,7 +1405,7 @@ class CyclesTexture_PT_context(CyclesButtonsPanel, Panel):
                 split.prop(tex, "type", text="")
 
 
-class CyclesTexture_PT_node(CyclesButtonsPanel, Panel):
+class CYCLES_TEXTURE_PT_node(CyclesButtonsPanel, Panel):
     bl_label = "Node"
     bl_context = "texture"
 
@@ -1316,7 +1422,7 @@ class CyclesTexture_PT_node(CyclesButtonsPanel, Panel):
         layout.template_node_view(ntree, node, None)
 
 
-class CyclesTexture_PT_mapping(CyclesButtonsPanel, Panel):
+class CYCLES_TEXTURE_PT_mapping(CyclesButtonsPanel, Panel):
     bl_label = "Mapping"
     bl_context = "texture"
 
@@ -1349,7 +1455,7 @@ class CyclesTexture_PT_mapping(CyclesButtonsPanel, Panel):
         row.prop(mapping, "mapping_z", text="")
 
 
-class CyclesTexture_PT_colors(CyclesButtonsPanel, Panel):
+class CYCLES_TEXTURE_PT_colors(CyclesButtonsPanel, Panel):
     bl_label = "Color"
     bl_context = "texture"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1388,7 +1494,7 @@ class CyclesTexture_PT_colors(CyclesButtonsPanel, Panel):
             layout.template_color_ramp(mapping, "color_ramp", expand=True)
 
 
-class CyclesParticle_PT_textures(CyclesButtonsPanel, Panel):
+class CYCLES_PARTICLE_PT_textures(CyclesButtonsPanel, Panel):
     bl_label = "Textures"
     bl_context = "particle"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1419,7 +1525,7 @@ class CyclesParticle_PT_textures(CyclesButtonsPanel, Panel):
             layout.template_ID(slot, "texture", new="texture.new")
 
 
-class CyclesRender_PT_bake(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_bake(CyclesButtonsPanel, Panel):
     bl_label = "Bake"
     bl_context = "render"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1492,7 +1598,7 @@ class CyclesRender_PT_bake(CyclesButtonsPanel, Panel):
             sub.prop(cbk, "cage_extrusion", text="Ray Distance")
 
 
-class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel):
     bl_label = "Debug"
     bl_context = "render"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1517,20 +1623,33 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
         row.prop(cscene, "debug_use_cpu_sse41", toggle=True)
         row.prop(cscene, "debug_use_cpu_avx", toggle=True)
         row.prop(cscene, "debug_use_cpu_avx2", toggle=True)
-        col.prop(cscene, "debug_use_qbvh")
+        col.prop(cscene, "debug_bvh_layout")
+        col.prop(cscene, "debug_use_cpu_split_kernel")
+
+        col.separator()
 
         col = layout.column()
         col.label('CUDA Flags:')
         col.prop(cscene, "debug_use_cuda_adaptive_compile")
+        col.prop(cscene, "debug_use_cuda_split_kernel")
+
+        col.separator()
 
         col = layout.column()
         col.label('OpenCL Flags:')
         col.prop(cscene, "debug_opencl_kernel_type", text="Kernel")
         col.prop(cscene, "debug_opencl_device_type", text="Device")
+        col.prop(cscene, "debug_opencl_kernel_single_program", text="Single Program")
         col.prop(cscene, "debug_use_opencl_debug", text="Debug")
+        col.prop(cscene, "debug_opencl_mem_limit")
+
+        col.separator()
+
+        col = layout.column()
+        col.prop(cscene, "debug_bvh_type")
 
 
-class CyclesParticle_PT_CurveSettings(CyclesButtonsPanel, Panel):
+class CYCLES_PARTICLE_PT_curve_settings(CyclesButtonsPanel, Panel):
     bl_label = "Cycles Hair Settings"
     bl_context = "particle"
 
@@ -1561,7 +1680,7 @@ class CyclesParticle_PT_CurveSettings(CyclesButtonsPanel, Panel):
         row.prop(cpsys, "use_closetip", text="Close tip")
 
 
-class CyclesScene_PT_simplify(CyclesButtonsPanel, Panel):
+class CYCLES_SCENE_PT_simplify(CyclesButtonsPanel, Panel):
     bl_label = "Simplify"
     bl_context = "scene"
     COMPAT_ENGINES = {'CYCLES'}
@@ -1630,10 +1749,10 @@ def draw_device(self, context):
 
         layout.prop(cscene, "feature_set")
 
-        split = layout.split(percentage=1/3)
+        split = layout.split(percentage=1 / 3)
         split.label("Device:")
         row = split.row()
-        row.active = show_device_selection(context)
+        row.active = show_device_active(context)
         row.prop(cscene, "device", text="")
 
         if engine.with_osl() and use_cpu(context):
@@ -1712,17 +1831,77 @@ def get_panels():
 
     return panels
 
+
+classes = (
+    CYCLES_MT_sampling_presets,
+    CYCLES_MT_integrator_presets,
+    CYCLES_RENDER_PT_sampling,
+    CYCLES_RENDER_PT_geometry,
+    CYCLES_RENDER_PT_light_paths,
+    CYCLES_RENDER_PT_motion_blur,
+    CYCLES_RENDER_PT_film,
+    CYCLES_RENDER_PT_performance,
+    CYCLES_RENDER_PT_layer_options,
+    CYCLES_RENDER_PT_layer_passes,
+    CYCLES_RENDER_PT_views,
+    CYCLES_RENDER_PT_denoising,
+    CYCLES_PT_post_processing,
+    CYCLES_CAMERA_PT_dof,
+    CYCLES_PT_context_material,
+    CYCLES_OBJECT_PT_motion_blur,
+    CYCLES_OBJECT_PT_cycles_settings,
+    CYCLES_OT_use_shading_nodes,
+    CYCLES_LAMP_PT_preview,
+    CYCLES_LAMP_PT_lamp,
+    CYCLES_LAMP_PT_nodes,
+    CYCLES_LAMP_PT_spot,
+    CYCLES_WORLD_PT_preview,
+    CYCLES_WORLD_PT_surface,
+    CYCLES_WORLD_PT_volume,
+    CYCLES_WORLD_PT_ambient_occlusion,
+    CYCLES_WORLD_PT_mist,
+    CYCLES_WORLD_PT_ray_visibility,
+    CYCLES_WORLD_PT_settings,
+    CYCLES_MATERIAL_PT_preview,
+    CYCLES_MATERIAL_PT_surface,
+    CYCLES_MATERIAL_PT_volume,
+    CYCLES_MATERIAL_PT_displacement,
+    CYCLES_MATERIAL_PT_settings,
+    CYCLES_MATERIAL_PT_viewport,
+    CYCLES_TEXTURE_PT_context,
+    CYCLES_TEXTURE_PT_node,
+    CYCLES_TEXTURE_PT_mapping,
+    CYCLES_TEXTURE_PT_colors,
+    CYCLES_PARTICLE_PT_textures,
+    CYCLES_RENDER_PT_bake,
+    CYCLES_RENDER_PT_debug,
+    CYCLES_PARTICLE_PT_curve_settings,
+    CYCLES_SCENE_PT_simplify,
+)
+
+
 def register():
+    from bpy.utils import register_class
+
     bpy.types.RENDER_PT_render.append(draw_device)
     bpy.types.VIEW3D_HT_header.append(draw_pause)
 
     for panel in get_panels():
         panel.COMPAT_ENGINES.add('CYCLES')
 
+    for cls in classes:
+        register_class(cls)
+
+
 def unregister():
+    from bpy.utils import unregister_class
+
     bpy.types.RENDER_PT_render.remove(draw_device)
     bpy.types.VIEW3D_HT_header.remove(draw_pause)
 
     for panel in get_panels():
         if 'CYCLES' in panel.COMPAT_ENGINES:
             panel.COMPAT_ENGINES.remove('CYCLES')
+
+    for cls in classes:
+        unregister_class(cls)
diff --git a/intern/cycles/blender/addon/version_update.py b/intern/cycles/blender/addon/version_update.py
index b2a745500a1..292f0a1fa90 100644
--- a/intern/cycles/blender/addon/version_update.py
+++ b/intern/cycles/blender/addon/version_update.py
@@ -17,6 +17,7 @@
 # <pep8 compliant>
 
 import bpy
+import math
 
 from bpy.app.handlers import persistent
 
@@ -89,6 +90,106 @@ def foreach_cycles_node(callback):
                                     traversed)
 
 
+def displacement_node_insert(material, nodetree, traversed):
+    """Route links into a Material Output 'Displacement' socket through a new Displacement node."""
+    # Skip trees already handled and group nodes that have no node tree assigned.
+    if nodetree is None or nodetree in traversed:
+        return
+    traversed.add(nodetree)
+
+    # Recurse into nested node groups.
+    for node in nodetree.nodes:
+        if node.bl_idname == 'ShaderNodeGroup':
+            displacement_node_insert(material, node.node_tree, traversed)
+
+    # Gather links to replace; links coming from a Displacement node are left alone.
+    displacement_links = []
+    for link in nodetree.links:
+        if (link.to_node.bl_idname == 'ShaderNodeOutputMaterial' and
+                link.from_node.bl_idname != 'ShaderNodeDisplacement' and
+                link.to_socket.identifier == 'Displacement'):
+            displacement_links.append(link)
+
+    # Replace links with displacement node
+    for link in displacement_links:
+        from_node, from_socket = link.from_node, link.from_socket
+        to_node, to_socket = link.to_node, link.to_socket
+        nodetree.links.remove(link)
+
+        # Place the new node halfway between the two nodes it connects.
+        node = nodetree.nodes.new(type='ShaderNodeDisplacement')
+        node.location[0] = 0.5 * (from_node.location[0] + to_node.location[0])
+        node.location[1] = 0.5 * (from_node.location[1] + to_node.location[1])
+        node.inputs['Scale'].default_value = 0.1
+        node.inputs['Midlevel'].default_value = 0.0
+        nodetree.links.new(from_socket, node.inputs['Height'])
+        nodetree.links.new(node.outputs['Displacement'], to_socket)
+
+def displacement_nodes_insert():
+    """Run displacement_node_insert() on every material passing check_is_new_shading_material()."""
+    visited = set()
+    for mat in filter(check_is_new_shading_material, bpy.data.materials):
+        displacement_node_insert(mat, mat.node_tree, visited)
+
+def displacement_principled_nodes(node):
+    """Force non-'WORLD' displacement space to 'OBJECT' and non-'RANDOM_WALK' subsurface method to 'BURLEY'."""
+    kind = node.bl_idname
+    if kind == 'ShaderNodeDisplacement' and node.space != 'WORLD':
+        node.space = 'OBJECT'
+    if kind == 'ShaderNodeBsdfPrincipled' and node.subsurface_method != 'RANDOM_WALK':
+        node.subsurface_method = 'BURLEY'
+
+def square_roughness_node_insert(material, nodetree, traversed):
+    """Take the square root of 'Roughness' inputs (defaults and links) on the listed BSDF node types."""
+    if nodetree is None or nodetree in traversed:
+        return
+    traversed.add(nodetree)
+
+    roughness_node_types = {
+        'ShaderNodeBsdfAnisotropic',
+        'ShaderNodeBsdfGlass',
+        'ShaderNodeBsdfGlossy',
+        'ShaderNodeBsdfRefraction'}
+
+    # Update default values
+    for node in nodetree.nodes:
+        if node.bl_idname == 'ShaderNodeGroup':
+            square_roughness_node_insert(material, node.node_tree, traversed)
+        elif node.bl_idname in roughness_node_types:
+            roughness_input = node.inputs['Roughness']
+            # max() keeps math.sqrt() from raising on a negative stored value.
+            roughness_input.default_value = math.sqrt(max(roughness_input.default_value, 0.0))
+
+    # Gather roughness links to replace
+    roughness_links = []
+    for link in nodetree.links:
+        if (link.to_node.bl_idname in roughness_node_types and
+                link.to_socket.identifier == 'Roughness'):
+            roughness_links.append(link)
+
+    # Replace links with sqrt node
+    for link in roughness_links:
+        from_node, from_socket = link.from_node, link.from_socket
+        to_node, to_socket = link.to_node, link.to_socket
+        nodetree.links.remove(link)
+
+        # A Math node in POWER mode with exponent 0.5 acts as the square root; placed between both nodes.
+        node = nodetree.nodes.new(type='ShaderNodeMath')
+        node.operation = 'POWER'
+        node.location[0] = 0.5 * (from_node.location[0] + to_node.location[0])
+        node.location[1] = 0.5 * (from_node.location[1] + to_node.location[1])
+
+        nodetree.links.new(from_socket, node.inputs[0])
+        node.inputs[1].default_value = 0.5
+        nodetree.links.new(node.outputs['Value'], to_socket)
+
+def square_roughness_nodes_insert():
+    """Run square_roughness_node_insert() on every material passing check_is_new_shading_material()."""
+    visited = set()
+    for mat in filter(check_is_new_shading_material, bpy.data.materials):
+        square_roughness_node_insert(mat, mat.node_tree, visited)
+
+
 def mapping_node_order_flip(node):
     """
     Flip euler order of mapping shader node
@@ -302,3 +403,31 @@ def do_versions(self):
             cscene = scene.cycles
             if not cscene.is_property_set("light_sampling_threshold"):
                 cscene.light_sampling_threshold = 0.0
+
+    if bpy.data.version <= (2, 79, 0):
+        for scene in bpy.data.scenes:
+            cscene = scene.cycles
+            # Default changes
+            if not cscene.is_property_set("aa_samples"):
+                cscene.aa_samples = 4
+            if not cscene.is_property_set("preview_aa_samples"):
+                cscene.preview_aa_samples = 4
+            if not cscene.is_property_set("blur_glossy"):
+                cscene.blur_glossy = 0.0
+            if not cscene.is_property_set("sample_clamp_indirect"):
+                cscene.sample_clamp_indirect = 0.0
+
+    if bpy.data.version <= (2, 79, 1):
+        displacement_nodes_insert()
+
+    if bpy.data.version <= (2, 79, 2):
+        for mat in bpy.data.materials:
+            cmat = mat.cycles
+            if not cmat.is_property_set("displacement_method"):
+                cmat.displacement_method = 'BUMP'
+
+        foreach_cycles_node(displacement_principled_nodes)
+
+    if bpy.data.version <= (2, 79, 3):
+        # Switch to squared roughness convention
+        square_roughness_nodes_insert()
diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp
index f02fc553908..f00ade320e7 100644
--- a/intern/cycles/blender/blender_camera.cpp
+++ b/intern/cycles/blender/blender_camera.cpp
@@ -14,13 +14,13 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "scene.h"
+#include "render/camera.h"
+#include "render/scene.h"
 
-#include "blender_sync.h"
-#include "blender_util.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
 
-#include "util_logging.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -81,6 +81,10 @@ struct BlenderCamera {
 	BoundBox2D viewport_camera_border;
 
 	Transform matrix;
+
+	float offscreen_dicing_scale;
+
+	int motion_steps;
 };
 
 static void blender_camera_init(BlenderCamera *bcam,
@@ -104,6 +108,7 @@ static void blender_camera_init(BlenderCamera *bcam,
 	bcam->pano_viewplane.top = 1.0f;
 	bcam->viewport_camera_border.right = 1.0f;
 	bcam->viewport_camera_border.top = 1.0f;
+	bcam->offscreen_dicing_scale = 1.0f;
 
 	/* render resolution */
 	bcam->full_width = render_resolution_x(b_render);
@@ -223,6 +228,8 @@ static void blender_camera_from_object(BlenderCamera *bcam,
 			bcam->sensor_fit = BlenderCamera::HORIZONTAL;
 		else
 			bcam->sensor_fit = BlenderCamera::VERTICAL;
+
+		bcam->motion_steps = object_motion_steps(b_ob, b_ob);
 	}
 	else {
 		/* from lamp not implemented yet */
@@ -243,8 +250,7 @@ static Transform blender_camera_matrix(const Transform& tfm,
 			result = tfm *
 				make_transform(1.0f, 0.0f, 0.0f, 0.0f,
 				               0.0f, 0.0f, 1.0f, 0.0f,
-				               0.0f, 1.0f, 0.0f, 0.0f,
-				               0.0f, 0.0f, 0.0f, 1.0f);
+				               0.0f, 1.0f, 0.0f, 0.0f);
 		}
 		else {
 			/* Make it so environment camera needs to be pointed in the direction
@@ -254,8 +260,7 @@ static Transform blender_camera_matrix(const Transform& tfm,
 			result = tfm *
 				make_transform( 0.0f, -1.0f, 0.0f, 0.0f,
 				                0.0f,  0.0f, 1.0f, 0.0f,
-				               -1.0f,  0.0f, 0.0f, 0.0f,
-				                0.0f,  0.0f, 0.0f, 1.0f);
+				               -1.0f,  0.0f, 0.0f, 0.0f);
 		}
 	}
 	else {
@@ -353,7 +358,11 @@ static void blender_camera_viewplane(BlenderCamera *bcam,
 	}
 }
 
-static void blender_camera_sync(Camera *cam, BlenderCamera *bcam, int width, int height, const char *viewname)
+static void blender_camera_sync(Camera *cam,
+                                BlenderCamera *bcam,
+                                int width, int height,
+                                const char *viewname,
+                                PointerRNA *cscene)
 {
 	/* copy camera to compare later */
 	Camera prevcam = *cam;
@@ -448,9 +457,7 @@ static void blender_camera_sync(Camera *cam, BlenderCamera *bcam, int width, int
 	cam->matrix = blender_camera_matrix(bcam->matrix,
 	                                    bcam->type,
 	                                    bcam->panorama_type);
-	cam->motion.pre = cam->matrix;
-	cam->motion.post = cam->matrix;
-	cam->use_motion = false;
+	cam->motion.resize(bcam->motion_steps, cam->matrix);
 	cam->use_perspective_motion = false;
 	cam->shuttertime = bcam->shuttertime;
 	cam->fov_pre = cam->fov;
@@ -466,6 +473,9 @@ static void blender_camera_sync(Camera *cam, BlenderCamera *bcam, int width, int
 	cam->border = bcam->border;
 	cam->viewport_camera_border = bcam->viewport_camera_border;
 
+	bcam->offscreen_dicing_scale = RNA_float_get(cscene, "offscreen_dicing_scale");
+	cam->offscreen_dicing_scale = bcam->offscreen_dicing_scale;
+
 	/* set update flag */
 	if(cam->modified(prevcam))
 		cam->tag_update();
@@ -525,7 +535,21 @@ void BlenderSync::sync_camera(BL::RenderSettings& b_render,
 
 	/* sync */
 	Camera *cam = scene->camera;
-	blender_camera_sync(cam, &bcam, width, height, viewname);
+	blender_camera_sync(cam, &bcam, width, height, viewname, &cscene);
+
+	/* dicing camera */
+	b_ob = BL::Object(RNA_pointer_get(&cscene, "dicing_camera"));
+	if(b_ob) {
+		BL::Array<float, 16> b_ob_matrix;
+		blender_camera_from_object(&bcam, b_engine, b_ob);
+		b_engine.camera_model_matrix(b_ob, bcam.use_spherical_stereo, b_ob_matrix);
+		bcam.matrix = get_transform(b_ob_matrix);
+
+		blender_camera_sync(scene->dicing_camera, &bcam, width, height, viewname, &cscene);
+	}
+	else {
+		*scene->dicing_camera = *cam;
+	}
 }
 
 void BlenderSync::sync_camera_motion(BL::RenderSettings& b_render,
@@ -542,16 +566,15 @@ void BlenderSync::sync_camera_motion(BL::RenderSettings& b_render,
 	Transform tfm = get_transform(b_ob_matrix);
 	tfm = blender_camera_matrix(tfm, cam->type, cam->panorama_type);
 
-	if(tfm != cam->matrix) {
-		VLOG(1) << "Camera " << b_ob.name() << " motion detected.";
-		if(motion_time == -1.0f) {
-			cam->motion.pre = tfm;
-			cam->use_motion = true;
-		}
-		else if(motion_time == 1.0f) {
-			cam->motion.post = tfm;
-			cam->use_motion = true;
-		}
+	if(motion_time == 0.0f) {
+		/* When motion blur is not centered in frame, cam->matrix gets reset. */
+		cam->matrix = tfm;
+	}
+
+	/* Set transform in motion array. */
+	int motion_step = cam->motion_step(motion_time);
+	if(motion_step >= 0) {
+		cam->motion[motion_step] = tfm;
 	}
 
 	if(cam->type == CAMERA_PERSPECTIVE) {
@@ -573,7 +596,10 @@ void BlenderSync::sync_camera_motion(BL::RenderSettings& b_render,
 		float fov = 2.0f * atanf((0.5f * sensor_size) / bcam.lens / aspectratio);
 		if(fov != cam->fov) {
 			VLOG(1) << "Camera " << b_ob.name() << " FOV change detected.";
-			if(motion_time == -1.0f) {
+			if(motion_time == 0.0f) {
+				cam->fov = fov;
+			}
+			else if(motion_time == -1.0f) {
 				cam->fov_pre = fov;
 				cam->use_perspective_motion = true;
 			}
@@ -811,7 +837,22 @@ void BlenderSync::sync_view(BL::SpaceView3D& b_v3d,
 	                      b_v3d,
 	                      b_rv3d,
 	                      width, height);
-	blender_camera_sync(scene->camera, &bcam, width, height, "");
+	PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+	blender_camera_sync(scene->camera, &bcam, width, height, "", &cscene);
+
+	/* dicing camera */
+	BL::Object b_ob = BL::Object(RNA_pointer_get(&cscene, "dicing_camera"));
+	if(b_ob) {
+		BL::Array<float, 16> b_ob_matrix;
+		blender_camera_from_object(&bcam, b_engine, b_ob);
+		b_engine.camera_model_matrix(b_ob, bcam.use_spherical_stereo, b_ob_matrix);
+		bcam.matrix = get_transform(b_ob_matrix);
+
+		blender_camera_sync(scene->dicing_camera, &bcam, width, height, "", &cscene);
+	}
+	else {
+		*scene->dicing_camera = *scene->camera;
+	}
 }
 
 BufferParams BlenderSync::get_buffer_params(BL::RenderSettings& b_render,
diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp
index e42ff5d72a6..daccb89f5a2 100644
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -14,18 +14,19 @@
  * limitations under the License.
  */
 
-#include "attribute.h"
-#include "camera.h"
-#include "curves.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
+#include "render/attribute.h"
+#include "render/camera.h"
+#include "render/curves.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
 
-#include "blender_sync.h"
-#include "blender_util.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
+#include "util/util_foreach.h"
+#include "util/util_hash.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -411,6 +412,7 @@ static void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
 		}
 	}
 
+	mesh->resize_mesh(mesh->verts.size(), mesh->num_triangles());
 	mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
 	mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
 	mesh->add_face_normals();
@@ -434,8 +436,8 @@ static void ExportCurveTriangleGeometry(Mesh *mesh,
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;
 
-			numverts += (CData->curve_keynum[curve] - 2)*2*resolution + resolution;
-			numtris += (CData->curve_keynum[curve] - 2)*resolution;
+			numverts += (CData->curve_keynum[curve] - 1)*resolution + resolution;
+			numtris += (CData->curve_keynum[curve] - 1)*2*resolution;
 		}
 	}
 
@@ -545,6 +547,7 @@ static void ExportCurveTriangleGeometry(Mesh *mesh,
 		}
 	}
 
+	mesh->resize_mesh(mesh->verts.size(), mesh->num_triangles());
 	mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
 	mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
 	mesh->add_face_normals();
@@ -563,9 +566,12 @@ static void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CDa
 		return;
 
 	Attribute *attr_intercept = NULL;
+	Attribute *attr_random = NULL;
 
 	if(mesh->need_attribute(scene, ATTR_STD_CURVE_INTERCEPT))
 		attr_intercept = mesh->curve_attributes.add(ATTR_STD_CURVE_INTERCEPT);
+	if(mesh->need_attribute(scene, ATTR_STD_CURVE_RANDOM))
+		attr_random = mesh->curve_attributes.add(ATTR_STD_CURVE_RANDOM);
 
 	/* compute and reserve size of arrays */
 	for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
@@ -610,6 +616,10 @@ static void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CDa
 				num_curve_keys++;
 			}
 
+			if(attr_random != NULL) {
+				attr_random->add(hash_int_01(num_curves));
+			}
+
 			mesh->add_curve(num_keys, CData->psys_shader[sys]);
 			num_keys += num_curve_keys;
 			num_curves++;
@@ -623,10 +633,10 @@ static void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CDa
 	}
 }
 
-static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int time_index)
+static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int motion_step)
 {
 	VLOG(1) << "Exporting curve motion segments for mesh " << mesh->name
-	        << ", time index " << time_index;
+	        << ", motion step " << motion_step;
 
 	/* find attribute */
 	Attribute *attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
@@ -641,7 +651,7 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int
 
 	/* export motion vectors for curve keys */
 	size_t numkeys = mesh->curve_keys.size();
-	float4 *mP = attr_mP->data_float4() + time_index*numkeys;
+	float4 *mP = attr_mP->data_float4() + motion_step*numkeys;
 	bool have_motion = false;
 	int i = 0;
 
@@ -692,12 +702,12 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int
 			}
 			mesh->curve_attributes.remove(ATTR_STD_MOTION_VERTEX_POSITION);
 		}
-		else if(time_index > 0) {
-			VLOG(1) << "Filling in new motion vertex position for time_index "
-			        << time_index;
+		else if(motion_step > 0) {
+			VLOG(1) << "Filling in new motion vertex position for motion_step "
+			        << motion_step;
 			/* motion, fill up previous steps that we might have skipped because
 			 * they had no motion, but we need them anyway now */
-			for(int step = 0; step < time_index; step++) {
+			for(int step = 0; step < motion_step; step++) {
 				float4 *mP = attr_mP->data_float4() + step*numkeys;
 
 				for(int key = 0; key < numkeys; key++) {
@@ -774,17 +784,17 @@ static void ExportCurveTriangleVcol(ParticleCurveData *CData,
 
 			for(int curvekey = CData->curve_firstkey[curve]; curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1; curvekey++) {
 				for(int section = 0; section < resol; section++) {
-					cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
+					cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve]));
 					vertexindex++;
-					cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
+					cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve]));
 					vertexindex++;
-					cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
+					cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve]));
 					vertexindex++;
-					cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
+					cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve]));
 					vertexindex++;
-					cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
+					cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve]));
 					vertexindex++;
-					cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
+					cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve]));
 					vertexindex++;
 				}
 			}
@@ -878,7 +888,7 @@ void BlenderSync::sync_curves(Mesh *mesh,
                               BL::Mesh& b_mesh,
                               BL::Object& b_ob,
                               bool motion,
-                              int time_index)
+                              int motion_step)
 {
 	if(!motion) {
 		/* Clear stored curve data */
@@ -890,7 +900,7 @@ void BlenderSync::sync_curves(Mesh *mesh,
 	}
 
 	/* obtain general settings */
-	bool use_curves = scene->curve_system_manager->use_curves;
+	const bool use_curves = scene->curve_system_manager->use_curves;
 
 	if(!(use_curves && b_ob.mode() != b_ob.mode_PARTICLE_EDIT)) {
 		if(!motion)
@@ -898,11 +908,11 @@ void BlenderSync::sync_curves(Mesh *mesh,
 		return;
 	}
 
-	int primitive = scene->curve_system_manager->primitive;
-	int triangle_method = scene->curve_system_manager->triangle_method;
-	int resolution = scene->curve_system_manager->resolution;
-	size_t vert_num = mesh->verts.size();
-	size_t tri_num = mesh->num_triangles();
+	const int primitive = scene->curve_system_manager->primitive;
+	const int triangle_method = scene->curve_system_manager->triangle_method;
+	const int resolution = scene->curve_system_manager->resolution;
+	const size_t vert_num = mesh->verts.size();
+	const size_t tri_num = mesh->num_triangles();
 	int used_res = 1;
 
 	/* extract particle hair data - should be combined with connecting to mesh later*/
@@ -941,7 +951,7 @@ void BlenderSync::sync_curves(Mesh *mesh,
 	}
 	else {
 		if(motion)
-			ExportCurveSegmentsMotion(mesh, &CData, time_index);
+			ExportCurveSegmentsMotion(mesh, &CData, motion_step);
 		else
 			ExportCurveSegments(scene, mesh, &CData);
 	}
@@ -1002,7 +1012,7 @@ void BlenderSync::sync_curves(Mesh *mesh,
 
 					for(size_t curve = 0; curve < CData.curve_vcol.size(); curve++)
 						if(!(CData.curve_keynum[curve] <= 1 || CData.curve_length[curve] == 0.0f))
-							fdata[i++] = color_srgb_to_scene_linear(CData.curve_vcol[curve]);
+							fdata[i++] = color_srgb_to_scene_linear_v3(CData.curve_vcol[curve]);
 				}
 			}
 		}
diff --git a/intern/cycles/blender/blender_logging.cpp b/intern/cycles/blender/blender_logging.cpp
index f4f86929168..d0f82e37662 100644
--- a/intern/cycles/blender/blender_logging.cpp
+++ b/intern/cycles/blender/blender_logging.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "CCL_api.h"
-#include "util_logging.h"
+#include "blender/CCL_api.h"
+#include "util/util_logging.h"
 
 void CCL_init_logging(const char *argv0)
 {
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index fdc287084eb..7d6ca18b074 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -14,23 +14,22 @@
  * limitations under the License.
  */
 
- 
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "camera.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/camera.h"
 
-#include "blender_sync.h"
-#include "blender_session.h"
-#include "blender_util.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_session.h"
+#include "blender/blender_util.h"
 
-#include "subd_patch.h"
-#include "subd_split.h"
+#include "subd/subd_patch.h"
+#include "subd/subd_split.h"
 
-#include "util_algorithm.h"
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_math.h"
+#include "util/util_algorithm.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_math.h"
 
 #include "mikktspace.h"
 
@@ -51,8 +50,7 @@ enum {
  * Two triangles has vertex indices in the original Blender-side face.
  * If face is already a quad tri_b will not be initialized.
  */
-inline void face_split_tri_indices(const int num_verts,
-                                   const int face_flag,
+inline void face_split_tri_indices(const int face_flag,
                                    int tri_a[3],
                                    int tri_b[3])
 {
@@ -60,143 +58,253 @@ inline void face_split_tri_indices(const int num_verts,
 		tri_a[0] = 0;
 		tri_a[1] = 1;
 		tri_a[2] = 3;
-		if(num_verts == 4) {
-			tri_b[0] = 2;
-			tri_b[1] = 3;
-			tri_b[2] = 1;
-		}
+
+		tri_b[0] = 2;
+		tri_b[1] = 3;
+		tri_b[2] = 1;
 	}
-	else /*if(face_flag & FACE_FLAG_DIVIDE_13)*/ {
+	else {
+		/* Quad with FACE_FLAG_DIVIDE_13 or single triangle. */
 		tri_a[0] = 0;
 		tri_a[1] = 1;
 		tri_a[2] = 2;
-		if(num_verts == 4) {
-			tri_b[0] = 0;
-			tri_b[1] = 2;
-			tri_b[2] = 3;
-		}
+
+		tri_b[0] = 0;
+		tri_b[1] = 2;
+		tri_b[2] = 3;
 	}
 }
 
 /* Tangent Space */
 
 struct MikkUserData {
-	MikkUserData(const BL::Mesh& mesh_,
-	             BL::MeshTextureFaceLayer *layer_,
-	             int num_faces_)
-	: mesh(mesh_), layer(layer_), num_faces(num_faces_)
+	MikkUserData(const BL::Mesh& b_mesh,
+	             const char *layer_name,
+	             const Mesh *mesh,
+	             float3 *tangent,
+	             float *tangent_sign)
+	        : mesh(mesh),
+	          texface(NULL),
+	          orco(NULL),
+	          tangent(tangent),
+	          tangent_sign(tangent_sign)
 	{
-		tangent.resize(num_faces*4);
+		const AttributeSet& attributes = (mesh->subd_faces.size()) ?
+			mesh->subd_attributes : mesh->attributes;
+
+		Attribute *attr_vN = attributes.find(ATTR_STD_VERTEX_NORMAL);
+		vertex_normal = attr_vN->data_float3();
+
+		if(layer_name == NULL) {
+			Attribute *attr_orco = attributes.find(ATTR_STD_GENERATED);
+
+			if(attr_orco) {
+				orco = attr_orco->data_float3();
+				mesh_texture_space(*(BL::Mesh*)&b_mesh, orco_loc, orco_size);
+			}
+		}
+		else {
+			Attribute *attr_uv = attributes.find(ustring(layer_name));
+			if(attr_uv != NULL) {
+				texface = attr_uv->data_float3();
+			}
+		}
 	}
 
-	BL::Mesh mesh;
-	BL::MeshTextureFaceLayer *layer;
+	const Mesh *mesh;
 	int num_faces;
-	vector<float4> tangent;
+
+	float3 *vertex_normal;
+	float3 *texface;
+	float3 *orco;
+	float3 orco_loc, orco_size;
+
+	float3 *tangent;
+	float *tangent_sign;
 };
 
 static int mikk_get_num_faces(const SMikkTSpaceContext *context)
 {
-	MikkUserData *userdata = (MikkUserData*)context->m_pUserData;
-	return userdata->num_faces;
+	const MikkUserData *userdata = (const MikkUserData *)context->m_pUserData;
+	if(userdata->mesh->subd_faces.size()) {
+		return userdata->mesh->subd_faces.size();
+	}
+	else {
+		return userdata->mesh->num_triangles();
+	}
 }
 
-static int mikk_get_num_verts_of_face(const SMikkTSpaceContext *context, const int face_num)
+static int mikk_get_num_verts_of_face(const SMikkTSpaceContext *context,
+                                      const int face_num)
 {
-	MikkUserData *userdata = (MikkUserData*)context->m_pUserData;
-	BL::MeshTessFace f = userdata->mesh.tessfaces[face_num];
-	int4 vi = get_int4(f.vertices_raw());
+	const MikkUserData *userdata = (const MikkUserData *)context->m_pUserData;
+	if(userdata->mesh->subd_faces.size()) {
+		const Mesh *mesh = userdata->mesh;
+		return mesh->subd_faces[face_num].num_corners;
+	}
+	else {
+		return 3;
+	}
+}
 
-	return (vi[3] == 0)? 3: 4;
+static int mikk_vertex_index(const Mesh *mesh, const int face_num, const int vert_num)
+{
+	if(mesh->subd_faces.size()) {
+		const Mesh::SubdFace& face = mesh->subd_faces[face_num];
+		return mesh->subd_face_corners[face.start_corner + vert_num];
+	}
+	else {
+		return mesh->triangles[face_num * 3 + vert_num];
+	}
 }
 
-static void mikk_get_position(const SMikkTSpaceContext *context, float P[3], const int face_num, const int vert_num)
+static int mikk_corner_index(const Mesh *mesh, const int face_num, const int vert_num)
 {
-	MikkUserData *userdata = (MikkUserData*)context->m_pUserData;
-	BL::MeshTessFace f = userdata->mesh.tessfaces[face_num];
-	int4 vi = get_int4(f.vertices_raw());
-	BL::MeshVertex v = userdata->mesh.vertices[vi[vert_num]];
-	float3 vP = get_float3(v.co());
+	if(mesh->subd_faces.size()) {
+		const Mesh::SubdFace& face = mesh->subd_faces[face_num];
+		return face.start_corner + vert_num;
+	}
+	else {
+		return face_num * 3 + vert_num;
+	}
+}
 
+static void mikk_get_position(const SMikkTSpaceContext *context,
+                              float P[3],
+                              const int face_num, const int vert_num)
+{
+	const MikkUserData *userdata = (const MikkUserData *)context->m_pUserData;
+	const Mesh *mesh = userdata->mesh;
+	const int vertex_index = mikk_vertex_index(mesh, face_num, vert_num);
+	const float3 vP = mesh->verts[vertex_index];
 	P[0] = vP.x;
 	P[1] = vP.y;
 	P[2] = vP.z;
 }
 
-static void mikk_get_texture_coordinate(const SMikkTSpaceContext *context, float uv[2], const int face_num, const int vert_num)
+static void mikk_get_texture_coordinate(const SMikkTSpaceContext *context,
+                                        float uv[2],
+                                        const int face_num, const int vert_num)
 {
-	MikkUserData *userdata = (MikkUserData*)context->m_pUserData;
-	if(userdata->layer != NULL) {
-		BL::MeshTextureFace tf = userdata->layer->data[face_num];
-		float3 tfuv;
-
-		switch(vert_num) {
-			case 0:
-				tfuv = get_float3(tf.uv1());
-				break;
-			case 1:
-				tfuv = get_float3(tf.uv2());
-				break;
-			case 2:
-				tfuv = get_float3(tf.uv3());
-				break;
-			default:
-				tfuv = get_float3(tf.uv4());
-				break;
-		}
-
+	const MikkUserData *userdata = (const MikkUserData *)context->m_pUserData;
+	const Mesh *mesh = userdata->mesh;
+	if(userdata->texface != NULL) {
+		const int corner_index = mikk_corner_index(mesh, face_num, vert_num);
+		float3 tfuv = userdata->texface[corner_index];
 		uv[0] = tfuv.x;
 		uv[1] = tfuv.y;
 	}
-	else {
-		int vert_idx = userdata->mesh.tessfaces[face_num].vertices()[vert_num];
-		float3 orco =
-			get_float3(userdata->mesh.vertices[vert_idx].undeformed_co());
-		float2 tmp = map_to_sphere(make_float3(orco[0], orco[1], orco[2]));
+	else if(userdata->orco != NULL) {
+		const int vertex_index = mikk_vertex_index(mesh, face_num, vert_num);
+		const float3 orco_loc = userdata->orco_loc;
+		const float3 orco_size = userdata->orco_size;
+		const float3 orco = (userdata->orco[vertex_index] + orco_loc) / orco_size;
+
+		const float2 tmp = map_to_sphere(orco);
 		uv[0] = tmp.x;
 		uv[1] = tmp.y;
 	}
+	else {
+		uv[0] = 0.0f;
+		uv[1] = 0.0f;
+	}
 }
 
-static void mikk_get_normal(const SMikkTSpaceContext *context, float N[3], const int face_num, const int vert_num)
+static void mikk_get_normal(const SMikkTSpaceContext *context, float N[3],
+                            const int face_num, const int vert_num)
 {
-	MikkUserData *userdata = (MikkUserData*)context->m_pUserData;
-	BL::MeshTessFace f = userdata->mesh.tessfaces[face_num];
+	const MikkUserData *userdata = (const MikkUserData *)context->m_pUserData;
+	const Mesh *mesh = userdata->mesh;
 	float3 vN;
-
-	if(f.use_smooth()) {
-		int4 vi = get_int4(f.vertices_raw());
-		BL::MeshVertex v = userdata->mesh.vertices[vi[vert_num]];
-		vN = get_float3(v.normal());
+	if(mesh->subd_faces.size()) {
+		const Mesh::SubdFace& face = mesh->subd_faces[face_num];
+		if(face.smooth) {
+			const int vertex_index = mikk_vertex_index(mesh, face_num, vert_num);
+			vN = userdata->vertex_normal[vertex_index];
+		}
+		else {
+			vN = face.normal(mesh);
+		}
 	}
 	else {
-		vN = get_float3(f.normal());
+		if(mesh->smooth[face_num]) {
+			const int vertex_index = mikk_vertex_index(mesh, face_num, vert_num);
+			vN = userdata->vertex_normal[vertex_index];
+		}
+		else {
+			const Mesh::Triangle tri = mesh->get_triangle(face_num);
+			vN = tri.compute_normal(&mesh->verts[0]);
+		}
 	}
-
 	N[0] = vN.x;
 	N[1] = vN.y;
 	N[2] = vN.z;
 }
 
-static void mikk_set_tangent_space(const SMikkTSpaceContext *context, const float T[], const float sign, const int face, const int vert)
+static void mikk_set_tangent_space(const SMikkTSpaceContext *context,
+                                   const float T[],
+                                   const float sign,
+                                   const int face_num, const int vert_num)
 {
-	MikkUserData *userdata = (MikkUserData*)context->m_pUserData;
-
-	userdata->tangent[face*4 + vert] = make_float4(T[0], T[1], T[2], sign);
+	MikkUserData *userdata = (MikkUserData *)context->m_pUserData;
+	const Mesh *mesh = userdata->mesh;
+	const int corner_index = mikk_corner_index(mesh, face_num, vert_num);
+	userdata->tangent[corner_index] = make_float3(T[0], T[1], T[2]);
+	if(userdata->tangent_sign != NULL) {
+		userdata->tangent_sign[corner_index] = sign;
+	}
 }
 
-static void mikk_compute_tangents(BL::Mesh& b_mesh,
-                                  BL::MeshTextureFaceLayer *b_layer,
+static void mikk_compute_tangents(const BL::Mesh& b_mesh,
+                                  const char *layer_name,
                                   Mesh *mesh,
-                                  const vector<int>& nverts,
-                                  const vector<int>& face_flags,
                                   bool need_sign,
                                   bool active_render)
 {
-	/* setup userdata */
-	MikkUserData userdata(b_mesh, b_layer, nverts.size());
+	/* Create tangent attributes. */
+	AttributeSet& attributes = (mesh->subd_faces.size()) ?
+		mesh->subd_attributes : mesh->attributes;
+	Attribute *attr;
+	ustring name;
+	if(layer_name != NULL) {
+		name = ustring((string(layer_name) + ".tangent").c_str());
+	}
+	else {
+		name = ustring("orco.tangent");
+	}
+	if(active_render) {
+		attr = attributes.add(ATTR_STD_UV_TANGENT, name);
+	}
+	else {
+		attr = attributes.add(name, TypeDesc::TypeVector, ATTR_ELEMENT_CORNER);
+	}
+	float3 *tangent = attr->data_float3();
+	/* Create bitangent sign attribute. */
+	float *tangent_sign = NULL;
+	if(need_sign) {
+		Attribute *attr_sign;
+		ustring name_sign;
+		if(layer_name != NULL) {
+			name_sign = ustring((string(layer_name) +
+			                           ".tangent_sign").c_str());
+		}
+		else {
+			name_sign = ustring("orco.tangent_sign");
+		}
 
-	/* setup interface */
+		if(active_render) {
+			attr_sign = attributes.add(ATTR_STD_UV_TANGENT_SIGN, name_sign);
+		}
+		else {
+			attr_sign = attributes.add(name_sign,
+			                           TypeDesc::TypeFloat,
+			                           ATTR_ELEMENT_CORNER);
+		}
+		tangent_sign = attr_sign->data_float();
+	}
+	/* Setup userdata. */
+	MikkUserData userdata(b_mesh, layer_name, mesh, tangent, tangent_sign);
+	/* Setup interface. */
 	SMikkTSpaceInterface sm_interface;
 	memset(&sm_interface, 0, sizeof(sm_interface));
 	sm_interface.m_getNumFaces = mikk_get_num_faces;
@@ -205,80 +313,13 @@ static void mikk_compute_tangents(BL::Mesh& b_mesh,
 	sm_interface.m_getTexCoord = mikk_get_texture_coordinate;
 	sm_interface.m_getNormal = mikk_get_normal;
 	sm_interface.m_setTSpaceBasic = mikk_set_tangent_space;
-
-	/* setup context */
+	/* Setup context. */
 	SMikkTSpaceContext context;
 	memset(&context, 0, sizeof(context));
 	context.m_pUserData = &userdata;
 	context.m_pInterface = &sm_interface;
-
-	/* compute tangents */
+	/* Compute tangents. */
 	genTangSpaceDefault(&context);
-
-	/* create tangent attributes */
-	Attribute *attr;
-	ustring name;
-	if(b_layer != NULL)
-		name = ustring((string(b_layer->name().c_str()) + ".tangent").c_str());
-	else
-		name = ustring("orco.tangent");
-
-	if(active_render)
-		attr = mesh->attributes.add(ATTR_STD_UV_TANGENT, name);
-	else
-		attr = mesh->attributes.add(name, TypeDesc::TypeVector, ATTR_ELEMENT_CORNER);
-
-	float3 *tangent = attr->data_float3();
-
-	/* create bitangent sign attribute */
-	float *tangent_sign = NULL;
-
-	if(need_sign) {
-		Attribute *attr_sign;
-		ustring name_sign;
-		if(b_layer != NULL)
-			name_sign = ustring((string(b_layer->name().c_str()) + ".tangent_sign").c_str());
-		else
-			name_sign = ustring("orco.tangent_sign");
-
-		if(active_render)
-			attr_sign = mesh->attributes.add(ATTR_STD_UV_TANGENT_SIGN, name_sign);
-		else
-			attr_sign = mesh->attributes.add(name_sign, TypeDesc::TypeFloat, ATTR_ELEMENT_CORNER);
-
-		tangent_sign = attr_sign->data_float();
-	}
-
-	for(int i = 0; i < nverts.size(); i++) {
-		int tri_a[3], tri_b[3];
-		face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b);
-
-		tangent[0] = float4_to_float3(userdata.tangent[i*4 + tri_a[0]]);
-		tangent[1] = float4_to_float3(userdata.tangent[i*4 + tri_a[1]]);
-		tangent[2] = float4_to_float3(userdata.tangent[i*4 + tri_a[2]]);
-		tangent += 3;
-
-		if(tangent_sign) {
-			tangent_sign[0] = userdata.tangent[i*4 + tri_a[0]].w;
-			tangent_sign[1] = userdata.tangent[i*4 + tri_a[1]].w;
-			tangent_sign[2] = userdata.tangent[i*4 + tri_a[2]].w;
-			tangent_sign += 3;
-		}
-
-		if(nverts[i] == 4) {
-			tangent[0] = float4_to_float3(userdata.tangent[i*4 + tri_b[0]]);
-			tangent[1] = float4_to_float3(userdata.tangent[i*4 + tri_b[1]]);
-			tangent[2] = float4_to_float3(userdata.tangent[i*4 + tri_b[2]]);
-			tangent += 3;
-
-			if(tangent_sign) {
-				tangent_sign[0] = userdata.tangent[i*4 + tri_b[0]].w;
-				tangent_sign[1] = userdata.tangent[i*4 + tri_b[1]].w;
-				tangent_sign[2] = userdata.tangent[i*4 + tri_b[2]].w;
-				tangent_sign += 3;
-			}
-		}
-	}
 }
 
 /* Create Volume Attribute */
@@ -293,11 +334,14 @@ static void create_mesh_volume_attribute(BL::Object& b_ob,
 
 	if(!b_domain)
 		return;
-	
+
+	mesh->volume_isovalue = b_domain.clipping();
+
 	Attribute *attr = mesh->attributes.add(std);
 	VoxelAttribute *volume_data = attr->data_voxel();
-	bool is_float, is_linear;
+	ImageMetaData metadata;
 	bool animated = false;
+	bool use_alpha = true;
 
 	volume_data->manager = image_manager;
 	volume_data->slot = image_manager->add_image(
@@ -305,11 +349,10 @@ static void create_mesh_volume_attribute(BL::Object& b_ob,
 	        b_ob.ptr.data,
 	        animated,
 	        frame,
-	        is_float,
-	        is_linear,
 	        INTERPOLATION_LINEAR,
 	        EXTENSION_CLIP,
-	        true);
+	        use_alpha,
+	        metadata);
 }
 
 static void create_mesh_volume_attributes(Scene *scene,
@@ -326,6 +369,8 @@ static void create_mesh_volume_attributes(Scene *scene,
 		create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_FLAME, frame);
 	if(mesh->need_attribute(scene, ATTR_STD_VOLUME_HEAT))
 		create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_HEAT, frame);
+	if(mesh->need_attribute(scene, ATTR_STD_VOLUME_TEMPERATURE))
+		create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_TEMPERATURE, frame);
 	if(mesh->need_attribute(scene, ATTR_STD_VOLUME_VELOCITY))
 		create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_VELOCITY, frame);
 }
@@ -356,7 +401,7 @@ static void attr_create_vertex_color(Scene *scene,
 				int n = p->loop_total();
 				for(int i = 0; i < n; i++) {
 					float3 color = get_float3(l->data[p->loop_start() + i].color());
-					*(cdata++) = color_float_to_byte(color_srgb_to_scene_linear(color));
+					*(cdata++) = color_float_to_byte(color_srgb_to_scene_linear_v3(color));
 				}
 			}
 		}
@@ -377,14 +422,14 @@ static void attr_create_vertex_color(Scene *scene,
 
 			for(l->data.begin(c); c != l->data.end(); ++c, ++i) {
 				int tri_a[3], tri_b[3];
-				face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b);
+				face_split_tri_indices(face_flags[i], tri_a, tri_b);
 
 				uchar4 colors[4];
-				colors[0] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color1())));
-				colors[1] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color2())));
-				colors[2] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color3())));
+				colors[0] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color1())));
+				colors[1] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color2())));
+				colors[2] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color3())));
 				if(nverts[i] == 4) {
-					colors[3] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color4())));
+					colors[3] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color4())));
 				}
 
 				cdata[0] = colors[tri_a[0]];
@@ -409,68 +454,50 @@ static void attr_create_uv_map(Scene *scene,
                                Mesh *mesh,
                                BL::Mesh& b_mesh,
                                const vector<int>& nverts,
-                               const vector<int>& face_flags,
-                               bool subdivision,
-                               bool subdivide_uvs)
+                               const vector<int>& face_flags)
 {
-	if(subdivision) {
-		BL::Mesh::uv_layers_iterator l;
-		int i = 0;
-
-		for(b_mesh.uv_layers.begin(l); l != b_mesh.uv_layers.end(); ++l, ++i) {
-			bool active_render = b_mesh.uv_textures[i].active_render();
-			AttributeStandard std = (active_render)? ATTR_STD_UV: ATTR_STD_NONE;
-			ustring name = ustring(l->name().c_str());
-
-			/* UV map */
-			if(mesh->need_attribute(scene, name) || mesh->need_attribute(scene, std)) {
-				Attribute *attr;
-
-				if(active_render)
-					attr = mesh->subd_attributes.add(std, name);
-				else
-					attr = mesh->subd_attributes.add(name, TypeDesc::TypePoint, ATTR_ELEMENT_CORNER);
-
-				if(subdivide_uvs) {
-					attr->flags |= ATTR_SUBDIVIDED;
-				}
-
-				BL::Mesh::polygons_iterator p;
-				float3 *fdata = attr->data_float3();
-
-				for(b_mesh.polygons.begin(p); p != b_mesh.polygons.end(); ++p) {
-					int n = p->loop_total();
-					for(int j = 0; j < n; j++) {
-						*(fdata++) = get_float3(l->data[p->loop_start() + j].uv());
-					}
-				}
-			}
-		}
-	}
-	else if(b_mesh.tessface_uv_textures.length() != 0) {
+	if(b_mesh.tessface_uv_textures.length() != 0) {
 		BL::Mesh::tessface_uv_textures_iterator l;
 
 		for(b_mesh.tessface_uv_textures.begin(l); l != b_mesh.tessface_uv_textures.end(); ++l) {
-			bool active_render = l->active_render();
-			AttributeStandard std = (active_render)? ATTR_STD_UV: ATTR_STD_NONE;
-			ustring name = ustring(l->name().c_str());
+			const bool active_render = l->active_render();
+			AttributeStandard uv_std = (active_render)? ATTR_STD_UV: ATTR_STD_NONE;
+			ustring uv_name = ustring(l->name().c_str());
+			AttributeStandard tangent_std = (active_render)? ATTR_STD_UV_TANGENT
+			                                               : ATTR_STD_NONE;
+			ustring tangent_name = ustring(
+			        (string(l->name().c_str()) + ".tangent").c_str());
+
+			/* Denotes whether UV map was requested directly. */
+			const bool need_uv = mesh->need_attribute(scene, uv_name) ||
+			                     mesh->need_attribute(scene, uv_std);
+			/* Denotes whether tangent was requested directly. */
+			const bool need_tangent =
+			       mesh->need_attribute(scene, tangent_name) ||
+			       (active_render && mesh->need_attribute(scene, tangent_std));
 
 			/* UV map */
-			if(mesh->need_attribute(scene, name) || mesh->need_attribute(scene, std)) {
-				Attribute *attr;
-
-				if(active_render)
-					attr = mesh->attributes.add(std, name);
-				else
-					attr = mesh->attributes.add(name, TypeDesc::TypePoint, ATTR_ELEMENT_CORNER);
+			/* NOTE: We create a temporary UV layer if it's needed for tangent but
+			 * wasn't requested by other nodes in shaders.
+			 */
+			Attribute *uv_attr = NULL;
+			if(need_uv || need_tangent) {
+				if(active_render) {
+					uv_attr = mesh->attributes.add(uv_std, uv_name);
+				}
+				else {
+					uv_attr = mesh->attributes.add(uv_name,
+					                               TypeDesc::TypePoint,
+					                               ATTR_ELEMENT_CORNER);
+				}
 
 				BL::MeshTextureFaceLayer::data_iterator t;
-				float3 *fdata = attr->data_float3();
+				float3 *fdata = uv_attr->data_float3();
 				size_t i = 0;
 
 				for(l->data.begin(t); t != l->data.end(); ++t, ++i) {
 					int tri_a[3], tri_b[3];
-					face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b);
+					face_split_tri_indices(face_flags[i], tri_a, tri_b);
 
 					float3 uvs[4];
 					uvs[0] = get_float3(t->uv1());
@@ -495,33 +522,112 @@ static void attr_create_uv_map(Scene *scene,
 			}
 
 			/* UV tangent */
-			std = (active_render)? ATTR_STD_UV_TANGENT: ATTR_STD_NONE;
-			name = ustring((string(l->name().c_str()) + ".tangent").c_str());
+			if(need_tangent) {
+				AttributeStandard sign_std =
+				        (active_render)? ATTR_STD_UV_TANGENT_SIGN
+				                       : ATTR_STD_NONE;
+				ustring sign_name = ustring(
+				        (string(l->name().c_str()) + ".tangent_sign").c_str());
+				bool need_sign = (mesh->need_attribute(scene, sign_name) ||
+				                  mesh->need_attribute(scene, sign_std));
+				mikk_compute_tangents(b_mesh,
+				                      l->name().c_str(),
+				                      mesh,
+				                      need_sign,
+				                      active_render);
+			}
+			/* Remove temporarily created UV attribute. */
+			if(!need_uv && uv_attr != NULL) {
+				mesh->attributes.remove(uv_attr);
+			}
+		}
+	}
+	else if(mesh->need_attribute(scene, ATTR_STD_UV_TANGENT)) {
+		bool need_sign = mesh->need_attribute(scene, ATTR_STD_UV_TANGENT_SIGN);
+		mikk_compute_tangents(b_mesh, NULL, mesh, need_sign, true);
+		if(!mesh->need_attribute(scene, ATTR_STD_GENERATED)) {
+			mesh->attributes.remove(ATTR_STD_GENERATED);
+		}
+	}
+}
+
+static void attr_create_subd_uv_map(Scene *scene,
+                                    Mesh *mesh,
+                                    BL::Mesh& b_mesh,
+                                    bool subdivide_uvs)
+{
+	if(b_mesh.uv_layers.length() != 0) {
+		BL::Mesh::uv_layers_iterator l;
+		int i = 0;
+
+		for(b_mesh.uv_layers.begin(l); l != b_mesh.uv_layers.end(); ++l, ++i) {
+			bool active_render = b_mesh.uv_textures[i].active_render();
+			AttributeStandard uv_std = (active_render)? ATTR_STD_UV: ATTR_STD_NONE;
+			ustring uv_name = ustring(l->name().c_str());
+			AttributeStandard tangent_std = (active_render)? ATTR_STD_UV_TANGENT
+			                                               : ATTR_STD_NONE;
+			ustring tangent_name = ustring(
+			        (string(l->name().c_str()) + ".tangent").c_str());
+
+			/* Denotes whether UV map was requested directly. */
+			const bool need_uv = mesh->need_attribute(scene, uv_name) ||
+			                     mesh->need_attribute(scene, uv_std);
+			/* Denotes whether tangent was requested directly. */
+			const bool need_tangent =
+			       mesh->need_attribute(scene, tangent_name) ||
+			       (active_render && mesh->need_attribute(scene, tangent_std));
+
+			Attribute *uv_attr = NULL;
+
+			/* UV map */
+			if(need_uv || need_tangent) {
+				if(active_render)
+					uv_attr = mesh->subd_attributes.add(uv_std, uv_name);
+				else
+					uv_attr = mesh->subd_attributes.add(uv_name, TypeDesc::TypePoint, ATTR_ELEMENT_CORNER);
 
-			if(mesh->need_attribute(scene, name) || (active_render && mesh->need_attribute(scene, std))) {
-				std = (active_render)? ATTR_STD_UV_TANGENT_SIGN: ATTR_STD_NONE;
-				name = ustring((string(l->name().c_str()) + ".tangent_sign").c_str());
-				bool need_sign = (mesh->need_attribute(scene, name) || mesh->need_attribute(scene, std));
+				if(subdivide_uvs) {
+					uv_attr->flags |= ATTR_SUBDIVIDED;
+				}
+
+				BL::Mesh::polygons_iterator p;
+				float3 *fdata = uv_attr->data_float3();
 
+				for(b_mesh.polygons.begin(p); p != b_mesh.polygons.end(); ++p) {
+					int n = p->loop_total();
+					for(int j = 0; j < n; j++) {
+						*(fdata++) = get_float3(l->data[p->loop_start() + j].uv());
+					}
+				}
+			}
+
+			/* UV tangent */
+			if(need_tangent) {
+				AttributeStandard sign_std =
+				        (active_render)? ATTR_STD_UV_TANGENT_SIGN
+				                       : ATTR_STD_NONE;
+				ustring sign_name = ustring(
+				        (string(l->name().c_str()) + ".tangent_sign").c_str());
+				bool need_sign = (mesh->need_attribute(scene, sign_name) ||
+				                  mesh->need_attribute(scene, sign_std));
 				mikk_compute_tangents(b_mesh,
-				                      &(*l),
+				                      l->name().c_str(),
 				                      mesh,
-				                      nverts,
-				                      face_flags,
 				                      need_sign,
 				                      active_render);
 			}
+			/* Remove temporarily created UV attribute. */
+			if(!need_uv && uv_attr != NULL) {
+				mesh->subd_attributes.remove(uv_attr);
+			}
 		}
 	}
 	else if(mesh->need_attribute(scene, ATTR_STD_UV_TANGENT)) {
 		bool need_sign = mesh->need_attribute(scene, ATTR_STD_UV_TANGENT_SIGN);
-		mikk_compute_tangents(b_mesh,
-		                      NULL,
-		                      mesh,
-		                      nverts,
-		                      face_flags,
-		                      need_sign,
-		                      true);
+		mikk_compute_tangents(b_mesh, NULL, mesh, need_sign, true);
+		if(!mesh->need_attribute(scene, ATTR_STD_GENERATED)) {
+			mesh->subd_attributes.remove(ATTR_STD_GENERATED);
+		}
 	}
 }
 
@@ -560,6 +666,9 @@ static void attr_create_pointiness(Scene *scene,
 		return;
 	}
 	const int num_verts = b_mesh.vertices.length();
+	if(num_verts == 0) {
+		return;
+	}
 	/* STEP 1: Find out duplicated vertices and point duplicates to a single
 	 *         original vertex.
 	 */
@@ -588,8 +697,8 @@ static void attr_create_pointiness(Scene *scene,
 			        sorted_vert_indeices[other_sorted_vert_index];
 			const float3 &other_vert_co = mesh->verts[other_vert_index];
 			/* We are too far away now, we wouldn't have duplicate. */
-			if ((other_vert_co.x + other_vert_co.y + other_vert_co.z) -
-			    (vert_co.x + vert_co.y + vert_co.z) > 3 * FLT_EPSILON)
+			if((other_vert_co.x + other_vert_co.y + other_vert_co.z) -
+			   (vert_co.x + vert_co.y + vert_co.z) > 3 * FLT_EPSILON)
 			{
 				break;
 			}
@@ -717,6 +826,11 @@ static void create_mesh(Scene *scene,
 	int numngons = 0;
 	bool use_loop_normals = b_mesh.use_auto_smooth() && (mesh->subdivision_type != Mesh::SUBDIVISION_CATMULL_CLARK);
 
+	/* If no faces, create empty mesh. */
+	if(numfaces == 0) {
+		return;
+	}
+
 	BL::Mesh::vertices_iterator v;
 	BL::Mesh::tessfaces_iterator f;
 	BL::Mesh::polygons_iterator p;
@@ -751,7 +865,13 @@ static void create_mesh(Scene *scene,
 	N = attr_N->data_float3();
 
 	/* create generated coordinates from undeformed coordinates */
-	if(mesh->need_attribute(scene, ATTR_STD_GENERATED)) {
+	const bool need_default_tangent =
+	        (subdivision == false) &&
+	        (b_mesh.tessface_uv_textures.length() == 0) &&
+	        (mesh->need_attribute(scene, ATTR_STD_UV_TANGENT));
+	if(mesh->need_attribute(scene, ATTR_STD_GENERATED) ||
+	   need_default_tangent)
+	{
 		Attribute *attr = attributes.add(ATTR_STD_GENERATED);
 		attr->flags |= ATTR_SUBDIVIDED;
 
@@ -761,8 +881,9 @@ static void create_mesh(Scene *scene,
 		float3 *generated = attr->data_float3();
 		size_t i = 0;
 
-		for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end(); ++v)
+		for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end(); ++v) {
 			generated[i++] = get_float3(v->undeformed_co())*size - loc;
+		}
 	}
 
 	/* create faces */
@@ -819,7 +940,7 @@ static void create_mesh(Scene *scene,
 			int shader = clamp(p->material_index(), 0, used_shaders.size()-1);
 			bool smooth = p->use_smooth() || use_loop_normals;
 
-			vi.reserve(n);
+			vi.resize(n);
 			for(int i = 0; i < n; i++) {
 				/* NOTE: Autosmooth is already taken care about. */
 				vi[i] = b_mesh.loops[p->loop_start() + i].vertex_index();
@@ -835,7 +956,13 @@ static void create_mesh(Scene *scene,
 	 */
 	attr_create_pointiness(scene, mesh, b_mesh, subdivision);
 	attr_create_vertex_color(scene, mesh, b_mesh, nverts, face_flags, subdivision);
-	attr_create_uv_map(scene, mesh, b_mesh, nverts, face_flags, subdivision, subdivide_uvs);
+
+	if(subdivision) {
+		attr_create_subd_uv_map(scene, mesh, b_mesh, subdivide_uvs);
+	}
+	else {
+		attr_create_uv_map(scene, mesh, b_mesh, nverts, face_flags);
+	}
 
 	/* for volume objects, create a matrix to transform from object space to
 	 * mesh texture space. this does not work with deformations but that can
@@ -898,8 +1025,8 @@ static void create_subd_mesh(Scene *scene,
 	sdparams.dicing_rate = max(0.1f, RNA_float_get(&cobj, "dicing_rate") * dicing_rate);
 	sdparams.max_level = max_subdivisions;
 
-	scene->camera->update();
-	sdparams.camera = scene->camera;
+	scene->dicing_camera->update(scene);
+	sdparams.camera = scene->dicing_camera;
 	sdparams.objecttoworld = get_transform(b_ob.matrix_world());
 }
 
@@ -979,7 +1106,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob,
 		else
 			used_shaders.push_back(scene->default_surface);
 	}
-	
+
 	/* test if we need to sync */
 	int requested_geometry_flags = Mesh::GEOMETRY_NONE;
 	if(render_layer.use_surfaces) {
@@ -1003,7 +1130,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob,
 			bool attribute_recalc = false;
 
 			foreach(Shader *shader, mesh->used_shaders)
-				if(shader->need_update_attributes)
+				if(shader->need_update_mesh)
 					attribute_recalc = true;
 
 			if(!attribute_recalc)
@@ -1014,16 +1141,23 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob,
 	/* ensure we only sync instanced meshes once */
 	if(mesh_synced.find(mesh) != mesh_synced.end())
 		return mesh;
-	
+
 	mesh_synced.insert(mesh);
 
 	/* create derived mesh */
-	array<int> oldtriangle = mesh->triangles;
-	
+	array<int> oldtriangles;
+	array<Mesh::SubdFace> oldsubd_faces;
+	array<int> oldsubd_face_corners;
+	oldtriangles.steal_data(mesh->triangles);
+	oldsubd_faces.steal_data(mesh->subd_faces);
+	oldsubd_face_corners.steal_data(mesh->subd_face_corners);
+
 	/* compares curve_keys rather than strands in order to handle quick hair
 	 * adjustments in dynamic BVH - other methods could probably do this better*/
-	array<float3> oldcurve_keys = mesh->curve_keys;
-	array<float> oldcurve_radius = mesh->curve_radius;
+	array<float3> oldcurve_keys;
+	array<float> oldcurve_radius;
+	oldcurve_keys.steal_data(mesh->curve_keys);
+	oldcurve_radius.steal_data(mesh->curve_radius);
 
 	mesh->clear();
 	mesh->used_shaders = used_shaders;
@@ -1077,7 +1211,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob,
 			}
 
 			/* free derived mesh */
-			b_data.meshes.remove(b_mesh, false);
+			b_data.meshes.remove(b_mesh, false, true, false);
 		}
 	}
 	mesh->geometry_flags = requested_geometry_flags;
@@ -1086,29 +1220,12 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob,
 	sync_mesh_fluid_motion(b_ob, scene, mesh);
 
 	/* tag update */
-	bool rebuild = false;
-
-	if(oldtriangle.size() != mesh->triangles.size())
-		rebuild = true;
-	else if(oldtriangle.size()) {
-		if(memcmp(&oldtriangle[0], &mesh->triangles[0], sizeof(int)*oldtriangle.size()) != 0)
-			rebuild = true;
-	}
+	bool rebuild = (oldtriangles != mesh->triangles) ||
+	               (oldsubd_faces != mesh->subd_faces) ||
+	               (oldsubd_face_corners != mesh->subd_face_corners) ||
+	               (oldcurve_keys != mesh->curve_keys) ||
+	               (oldcurve_radius != mesh->curve_radius);
 
-	if(oldcurve_keys.size() != mesh->curve_keys.size())
-		rebuild = true;
-	else if(oldcurve_keys.size()) {
-		if(memcmp(&oldcurve_keys[0], &mesh->curve_keys[0], sizeof(float3)*oldcurve_keys.size()) != 0)
-			rebuild = true;
-	}
-
-	if(oldcurve_radius.size() != mesh->curve_radius.size())
-		rebuild = true;
-	else if(oldcurve_radius.size()) {
-		if(memcmp(&oldcurve_radius[0], &mesh->curve_radius[0], sizeof(float)*oldcurve_radius.size()) != 0)
-			rebuild = true;
-	}
-	
 	mesh->tag_update(scene, rebuild);
 
 	return mesh;
@@ -1131,45 +1248,19 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,
 	if(mesh_synced.find(mesh) == mesh_synced.end())
 		return;
 
-	/* for motion pass always compute, for motion blur it can be disabled */
-	int time_index = 0;
-
-	if(scene->need_motion() == Scene::MOTION_BLUR) {
-		if(!mesh->use_motion_blur)
-			return;
-		
-		/* see if this mesh needs motion data at this time */
-		vector<float> object_times = object->motion_times();
-		bool found = false;
-
-		foreach(float object_time, object_times) {
-			if(motion_time == object_time) {
-				found = true;
-				break;
-			}
-			else
-				time_index++;
-		}
-
-		if(!found)
-			return;
-	}
-	else {
-		if(motion_time == -1.0f)
-			time_index = 0;
-		else if(motion_time == 1.0f)
-			time_index = 1;
-		else
-			return;
+	/* Find time matching motion step required by mesh. */
+	int motion_step = mesh->motion_step(motion_time);
+	if(motion_step < 0) {
+		return;
 	}
 
 	/* skip empty meshes */
-	size_t numverts = mesh->verts.size();
-	size_t numkeys = mesh->curve_keys.size();
+	const size_t numverts = mesh->verts.size();
+	const size_t numkeys = mesh->curve_keys.size();
 
 	if(!numverts && !numkeys)
 		return;
-	
+
 	/* skip objects without deforming modifiers. this is not totally reliable,
 	 * would need a more extensive check to see which objects are animated */
 	BL::Mesh b_mesh(PointerRNA_NULL);
@@ -1202,9 +1293,9 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,
 				float3 *P = &mesh->verts[0];
 				float3 *N = (attr_N)? attr_N->data_float3(): NULL;
 
-				memcpy(attr_mP->data_float3() + time_index*numverts, P, sizeof(float3)*numverts);
+				memcpy(attr_mP->data_float3() + motion_step*numverts, P, sizeof(float3)*numverts);
 				if(attr_mN)
-					memcpy(attr_mN->data_float3() + time_index*numverts, N, sizeof(float3)*numverts);
+					memcpy(attr_mN->data_float3() + motion_step*numverts, N, sizeof(float3)*numverts);
 			}
 		}
 
@@ -1214,7 +1305,7 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,
 
 			if(attr_mP) {
 				float3 *keys = &mesh->curve_keys[0];
-				memcpy(attr_mP->data_float3() + time_index*numkeys, keys, sizeof(float3)*numkeys);
+				memcpy(attr_mP->data_float3() + motion_step*numkeys, keys, sizeof(float3)*numkeys);
 			}
 		}
 
@@ -1223,13 +1314,12 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,
 
 	/* TODO(sergey): Perform preliminary check for number of verticies. */
 	if(numverts) {
-		/* find attributes */
+		/* Find attributes. */
 		Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
 		Attribute *attr_mN = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_NORMAL);
 		Attribute *attr_N = mesh->attributes.find(ATTR_STD_VERTEX_NORMAL);
 		bool new_attribute = false;
-
-		/* add new attributes if they don't exist already */
+		/* Add new attributes if they don't exist already. */
 		if(!attr_mP) {
 			attr_mP = mesh->attributes.add(ATTR_STD_MOTION_VERTEX_POSITION);
 			if(attr_N)
@@ -1237,22 +1327,21 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,
 
 			new_attribute = true;
 		}
-
-		/* load vertex data from mesh */
-		float3 *mP = attr_mP->data_float3() + time_index*numverts;
-		float3 *mN = (attr_mN)? attr_mN->data_float3() + time_index*numverts: NULL;
-
+		/* Load vertex data from mesh. */
+		float3 *mP = attr_mP->data_float3() + motion_step*numverts;
+		float3 *mN = (attr_mN)? attr_mN->data_float3() + motion_step*numverts: NULL;
+		/* NOTE: We don't copy more than the existing number of vertices to prevent
+		 * possible memory corruption.
+		 */
 		BL::Mesh::vertices_iterator v;
 		int i = 0;
-
 		for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end() && i < numverts; ++v, ++i) {
 			mP[i] = get_float3(v->co());
 			if(mN)
 				mN[i] = get_float3(v->normal());
 		}
-
-		/* in case of new attribute, we verify if there really was any motion */
 		if(new_attribute) {
+			/* In case of new attribute, we verify if there really was any motion. */
 			if(b_mesh.vertices.length() != numverts ||
 			   memcmp(mP, &mesh->verts[0], sizeof(float3)*numverts) == 0)
 			{
@@ -1269,28 +1358,37 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,
 				if(attr_mN)
 					mesh->attributes.remove(ATTR_STD_MOTION_VERTEX_NORMAL);
 			}
-			else if(time_index > 0) {
+			else if(motion_step > 0) {
 				VLOG(1) << "Filling deformation motion for object " << b_ob.name();
 				/* motion, fill up previous steps that we might have skipped because
 				 * they had no motion, but we need them anyway now */
 				float3 *P = &mesh->verts[0];
 				float3 *N = (attr_N)? attr_N->data_float3(): NULL;
-
-				for(int step = 0; step < time_index; step++) {
+				for(int step = 0; step < motion_step; step++) {
 					memcpy(attr_mP->data_float3() + step*numverts, P, sizeof(float3)*numverts);
 					if(attr_mN)
 						memcpy(attr_mN->data_float3() + step*numverts, N, sizeof(float3)*numverts);
 				}
 			}
 		}
+		else {
+			if(b_mesh.vertices.length() != numverts) {
+				VLOG(1) << "Topology differs, discarding motion blur for object "
+				        << b_ob.name() << " at time " << motion_step;
+				memcpy(mP, &mesh->verts[0], sizeof(float3)*numverts);
+				if(mN != NULL) {
+					memcpy(mN, attr_N->data_float3(), sizeof(float3)*numverts);
+				}
+			}
+		}
 	}
 
 	/* hair motion */
 	if(numkeys)
-		sync_curves(mesh, b_mesh, b_ob, true, time_index);
+		sync_curves(mesh, b_mesh, b_ob, true, motion_step);
 
 	/* free derived mesh */
-	b_data.meshes.remove(b_mesh, false);
+	b_data.meshes.remove(b_mesh, false, true, false);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp
index 637cf7abda8..077ceb4ebef 100644
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -14,24 +14,24 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "integrator.h"
-#include "graph.h"
-#include "light.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "nodes.h"
-#include "particles.h"
-#include "shader.h"
-
-#include "blender_object_cull.h"
-#include "blender_sync.h"
-#include "blender_util.h"
-
-#include "util_foreach.h"
-#include "util_hash.h"
-#include "util_logging.h"
+#include "render/camera.h"
+#include "render/integrator.h"
+#include "render/graph.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/nodes.h"
+#include "render/particles.h"
+#include "render/shader.h"
+
+#include "blender/blender_object_cull.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
+
+#include "util/util_foreach.h"
+#include "util/util_hash.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -63,8 +63,26 @@ bool BlenderSync::object_is_mesh(BL::Object& b_ob)
 {
 	BL::ID b_ob_data = b_ob.data();
 
-	return (b_ob_data && (b_ob_data.is_a(&RNA_Mesh) ||
-		b_ob_data.is_a(&RNA_Curve) || b_ob_data.is_a(&RNA_MetaBall)));
+	if(!b_ob_data) {
+		return false;
+	}
+
+	if(b_ob.type() == BL::Object::type_CURVE) {
+		/* Skip exporting curves without faces, overhead can be
+		 * significant if there are many for path animation. */
+		BL::Curve b_curve(b_ob.data());
+
+		return (b_curve.bevel_object() ||
+		        b_curve.extrude() != 0.0f ||
+		        b_curve.bevel_depth() != 0.0f ||
+		        b_curve.dimensions() == BL::Curve::dimensions_2D ||
+		        b_ob.modifiers.length());
+	}
+	else {
+		return (b_ob_data.is_a(&RNA_Mesh) ||
+		        b_ob_data.is_a(&RNA_Curve) ||
+		        b_ob_data.is_a(&RNA_MetaBall));
+	}
 }
 
 bool BlenderSync::object_is_light(BL::Object& b_ob)
@@ -94,6 +112,7 @@ static uint object_ray_visibility(BL::Object& b_ob)
 void BlenderSync::sync_light(BL::Object& b_parent,
                              int persistent_id[OBJECT_PERSISTENT_ID_SIZE],
                              BL::Object& b_ob,
+                             BL::DupliObject& b_dupli_ob,
                              Transform& tfm,
                              bool *use_portal)
 {
@@ -175,6 +194,13 @@ void BlenderSync::sync_light(BL::Object& b_parent,
 
 	light->max_bounces = get_int(clamp, "max_bounces");
 
+	if(b_dupli_ob) {
+		light->random_id = b_dupli_ob.random_id();
+	}
+	else {
+		light->random_id = hash_int_2d(hash_string(b_ob.name().c_str()), 0);
+	}
+
 	if(light->type == LIGHT_AREA)
 		light->is_portal = get_boolean(clamp, "is_portal");
 	else
@@ -253,7 +279,7 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
 	if(object_is_light(b_ob)) {
 		/* don't use lamps for excluded layers used as mask layer */
 		if(!motion && !((layer_flag & render_layer.holdout_layer) && (layer_flag & render_layer.exclude_layer)))
-			sync_light(b_parent, persistent_id, b_ob, tfm, use_portal);
+			sync_light(b_parent, persistent_id, b_ob, b_dupli_ob, tfm, use_portal);
 
 		return NULL;
 	}
@@ -268,6 +294,31 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
 		return NULL;
 	}
 
+	/* Visibility flags for both parent and child. */
+	PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles");
+	bool use_holdout = (layer_flag & render_layer.holdout_layer) != 0 ||
+	                   get_boolean(cobject, "is_holdout");
+	uint visibility = object_ray_visibility(b_ob) & PATH_RAY_ALL_VISIBILITY;
+
+	if(b_parent.ptr.data != b_ob.ptr.data) {
+		visibility &= object_ray_visibility(b_parent);
+	}
+
+	/* Make holdout objects on excluded layer invisible for non-camera rays. */
+	if(use_holdout && (layer_flag & render_layer.exclude_layer)) {
+		visibility &= ~(PATH_RAY_ALL_VISIBILITY - PATH_RAY_CAMERA);
+	}
+
+	/* Hide objects not on render layer from camera rays. */
+	if(!(layer_flag & render_layer.layer)) {
+		visibility &= ~PATH_RAY_CAMERA;
+	}
+
+	/* Don't export completely invisible objects. */
+	if(visibility == 0) {
+		return NULL;
+	}
+
 	/* key to lookup object */
 	ObjectKey key(b_parent, persistent_id, b_ob);
 	Object *object;
@@ -276,22 +327,11 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
 	if(motion) {
 		object = object_map.find(key);
 
-		if(object && (scene->need_motion() == Scene::MOTION_PASS ||
-		              object_use_motion(b_parent, b_ob)))
-		{
-			/* object transformation */
-			if(tfm != object->tfm) {
-				VLOG(1) << "Object " << b_ob.name() << " motion detected.";
-				if(motion_time == -1.0f || motion_time == 1.0f) {
-					object->use_motion = true;
-				}
-			}
-
-			if(motion_time == -1.0f) {
-				object->motion.pre = tfm;
-			}
-			else if(motion_time == 1.0f) {
-				object->motion.post = tfm;
+		if(object && object->use_motion()) {
+			/* Set transform at matching motion time step. */
+			int time_index = object->motion_step(motion_time);
+			if(time_index >= 0) {
+				object->motion[time_index] = tfm;
 			}
 
 			/* mesh deformation */
@@ -308,8 +348,6 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
 	if(object_map.sync(&object, b_ob, b_parent, key))
 		object_updated = true;
 	
-	bool use_holdout = (layer_flag & render_layer.holdout_layer) != 0;
-	
 	/* mesh sync */
 	object->mesh = sync_mesh(b_ob, object_updated, hide_tris);
 
@@ -322,27 +360,17 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
 		object_updated = true;
 	}
 
-	/* visibility flags for both parent and child */
-	uint visibility = object_ray_visibility(b_ob) & PATH_RAY_ALL_VISIBILITY;
-	if(b_parent.ptr.data != b_ob.ptr.data) {
-		visibility &= object_ray_visibility(b_parent);
-	}
-
-	/* make holdout objects on excluded layer invisible for non-camera rays */
-	if(use_holdout && (layer_flag & render_layer.exclude_layer)) {
-		visibility &= ~(PATH_RAY_ALL_VISIBILITY - PATH_RAY_CAMERA);
-	}
-
-	/* hide objects not on render layer from camera rays */
-	if(!(layer_flag & render_layer.layer)) {
-		visibility &= ~PATH_RAY_CAMERA;
-	}
-
 	if(visibility != object->visibility) {
 		object->visibility = visibility;
 		object_updated = true;
 	}
 
+	bool is_shadow_catcher = get_boolean(cobject, "is_shadow_catcher");
+	if(is_shadow_catcher != object->is_shadow_catcher) {
+		object->is_shadow_catcher = is_shadow_catcher;
+		object_updated = true;
+	}
+
 	/* object sync
 	 * transform comparison should not be needed, but duplis don't work perfect
 	 * in the depsgraph and may not signal changes, so this is a workaround */
@@ -350,49 +378,50 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
 		object->name = b_ob.name().c_str();
 		object->pass_id = b_ob.pass_index();
 		object->tfm = tfm;
-		object->motion.pre = transform_empty();
-		object->motion.post = transform_empty();
-		object->use_motion = false;
+		object->motion.clear();
 
 		/* motion blur */
-		if(scene->need_motion() == Scene::MOTION_BLUR && object->mesh) {
+		Scene::MotionType need_motion = scene->need_motion();
+		if(need_motion != Scene::MOTION_NONE && object->mesh) {
 			Mesh *mesh = object->mesh;
-
 			mesh->use_motion_blur = false;
+			mesh->motion_steps = 0;
 
-			if(object_use_motion(b_parent, b_ob)) {
-				if(object_use_deform_motion(b_parent, b_ob)) {
-					mesh->motion_steps = object_motion_steps(b_ob);
+			uint motion_steps;
+
+			if(scene->need_motion() == Scene::MOTION_BLUR) {
+				motion_steps = object_motion_steps(b_parent, b_ob);
+				if(motion_steps && object_use_deform_motion(b_parent, b_ob)) {
+					mesh->motion_steps = motion_steps;
 					mesh->use_motion_blur = true;
 				}
-
-				vector<float> times = object->motion_times();
-				foreach(float time, times)
-					motion_times.insert(time);
 			}
-		}
+			else {
+				motion_steps = 3;
+				mesh->motion_steps = motion_steps;
+			}
 
-		/* random number */
-		object->random_id = hash_string(object->name.c_str());
+			object->motion.resize(motion_steps, transform_empty());
 
-		if(persistent_id) {
-			for(int i = 0; i < OBJECT_PERSISTENT_ID_SIZE; i++)
-				object->random_id = hash_int_2d(object->random_id, persistent_id[i]);
-		}
-		else
-			object->random_id = hash_int_2d(object->random_id, 0);
+			if(motion_steps) {
+				object->motion[motion_steps/2] = tfm;
 
-		if(b_parent.ptr.data != b_ob.ptr.data)
-			object->random_id ^= hash_int(hash_string(b_parent.name().c_str()));
+				for(size_t step = 0; step < motion_steps; step++) {
+					motion_times.insert(object->motion_time(step));
+				}
+			}
+		}
 
-		/* dupli texture coordinates */
+		/* dupli texture coordinates and random_id */
 		if(b_dupli_ob) {
 			object->dupli_generated = 0.5f*get_float3(b_dupli_ob.orco()) - make_float3(0.5f, 0.5f, 0.5f);
 			object->dupli_uv = get_float2(b_dupli_ob.uv());
+			object->random_id = b_dupli_ob.random_id();
 		}
 		else {
 			object->dupli_generated = make_float3(0.0f, 0.0f, 0.0f);
 			object->dupli_uv = make_float2(0.0f, 0.0f);
+			object->random_id =  hash_int_2d(hash_string(object->name.c_str()), 0);
 		}
 
 		object->tag_update(scene);
@@ -482,7 +511,7 @@ static bool object_render_hide_duplis(BL::Object& b_ob)
 
 /* Object Loop */
 
-void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time)
+void BlenderSync::sync_objects(float motion_time)
 {
 	/* layer data */
 	uint scene_layer = render_layer.scene_layer;
@@ -510,7 +539,7 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time)
 	 * 1 : DAG_EVAL_PREVIEW
 	 * 2 : DAG_EVAL_RENDER
 	 */
-	int dupli_settings = preview ? 1 : 2;
+	int dupli_settings = (render_layer.use_viewport_visibility) ? 1 : 2;
 
 	bool cancel = false;
 	bool use_portal = false;
@@ -545,7 +574,7 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time)
 					for(b_ob.dupli_list.begin(b_dup); b_dup != b_ob.dupli_list.end(); ++b_dup) {
 						Transform tfm = get_transform(b_dup->matrix());
 						BL::Object b_dup_ob = b_dup->object();
-						bool dup_hide = (b_v3d)? b_dup_ob.hide(): b_dup_ob.hide_render();
+						bool dup_hide = (render_layer.use_viewport_visibility)? b_dup_ob.hide(): b_dup_ob.hide_render();
 						bool in_dupli_group = (b_dup->type() == BL::DupliObject::type_GROUP);
 						bool hide_tris;
 
@@ -621,7 +650,6 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time)
 }
 
 void BlenderSync::sync_motion(BL::RenderSettings& b_render,
-                              BL::SpaceView3D& b_v3d,
                               BL::Object& b_override,
                               int width, int height,
                               void **python_thread_state)
@@ -658,7 +686,7 @@ void BlenderSync::sync_motion(BL::RenderSettings& b_render,
 		b_engine.frame_set(frame, subframe);
 		python_thread_state_save(python_thread_state);
 		sync_camera_motion(b_render, b_cam, width, height, 0.0f);
-		sync_objects(b_v3d, 0.0f);
+		sync_objects(0.0f);
 	}
 
 	/* always sample these times for camera motion */
@@ -667,6 +695,11 @@ void BlenderSync::sync_motion(BL::RenderSettings& b_render,
 
 	/* note iteration over motion_times set happens in sorted order */
 	foreach(float relative_time, motion_times) {
+		/* center time is already handled. */
+		if(relative_time == 0.0f) {
+			continue;
+		}
+
 		VLOG(1) << "Synchronizing motion for the relative time "
 		        << relative_time << ".";
 
@@ -692,7 +725,7 @@ void BlenderSync::sync_motion(BL::RenderSettings& b_render,
 		}
 
 		/* sync object */
-		sync_objects(b_v3d, relative_time);
+		sync_objects(relative_time);
 	}
 
 	/* we need to set the python thread state again because this
diff --git a/intern/cycles/blender/blender_object_cull.cpp b/intern/cycles/blender/blender_object_cull.cpp
index 08918dd1a49..bdf7dc469b2 100644
--- a/intern/cycles/blender/blender_object_cull.cpp
+++ b/intern/cycles/blender/blender_object_cull.cpp
@@ -16,9 +16,9 @@
 
 #include <cstdlib>
 
-#include "camera.h"
+#include "render/camera.h"
 
-#include "blender_object_cull.h"
+#include "blender/blender_object_cull.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -62,7 +62,7 @@ void BlenderObjectCulling::init_object(Scene *scene, BL::Object& b_ob)
 
 	if(use_camera_cull_ || use_distance_cull_) {
 		/* Need to have proper projection matrix. */
-		scene->camera->update();
+		scene->camera->update(scene);
 	}
 }
 
@@ -96,7 +96,7 @@ bool BlenderObjectCulling::test(Scene *scene, BL::Object& b_ob, Transform& tfm)
 bool BlenderObjectCulling::test_camera(Scene *scene, float3 bb[8])
 {
 	Camera *cam = scene->camera;
-	Transform& worldtondc = cam->worldtondc;
+	const ProjectionTransform& worldtondc = cam->worldtondc;
 	float3 bb_min = make_float3(FLT_MAX, FLT_MAX, FLT_MAX),
 	       bb_max = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
 	bool all_behind = true;
diff --git a/intern/cycles/blender/blender_object_cull.h b/intern/cycles/blender/blender_object_cull.h
index b6f0ca5cd31..2147877a860 100644
--- a/intern/cycles/blender/blender_object_cull.h
+++ b/intern/cycles/blender/blender_object_cull.h
@@ -17,8 +17,8 @@
 #ifndef __BLENDER_OBJECT_CULL_H__
 #define __BLENDER_OBJECT_CULL_H__
 
-#include "blender_sync.h"
-#include "util_types.h"
+#include "blender/blender_sync.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_particles.cpp b/intern/cycles/blender/blender_particles.cpp
index dd2900a8d5b..00f8cb3cf1b 100644
--- a/intern/cycles/blender/blender_particles.cpp
+++ b/intern/cycles/blender/blender_particles.cpp
@@ -14,14 +14,14 @@
  * limitations under the License.
  */
 
-#include "mesh.h"
-#include "object.h"
-#include "particles.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/particles.h"
 
-#include "blender_sync.h"
-#include "blender_util.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
 
-#include "util_foreach.h"
+#include "util/util_foreach.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 438abc49f88..792597cbad5 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -16,21 +16,22 @@
 
 #include <Python.h>
 
-#include "CCL_api.h"
+#include "blender/CCL_api.h"
 
-#include "blender_sync.h"
-#include "blender_session.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_session.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_md5.h"
-#include "util_opengl.h"
-#include "util_path.h"
-#include "util_string.h"
-#include "util_types.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_md5.h"
+#include "util/util_opengl.h"
+#include "util/util_path.h"
+#include "util/util_string.h"
+#include "util/util_types.h"
 
 #ifdef WITH_OSL
-#include "osl.h"
+#include "render/osl.h"
 
 #include <OSL/oslquery.h>
 #include <OSL/oslconfig.h>
@@ -60,15 +61,19 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
 	/* Backup some settings for comparison. */
 	DebugFlags::OpenCL::DeviceType opencl_device_type = flags.opencl.device_type;
 	DebugFlags::OpenCL::KernelType opencl_kernel_type = flags.opencl.kernel_type;
+	/* Synchronize shared flags. */
+	flags.viewport_static_bvh = get_enum(cscene, "debug_bvh_type");
 	/* Synchronize CPU flags. */
 	flags.cpu.avx2 = get_boolean(cscene, "debug_use_cpu_avx2");
 	flags.cpu.avx = get_boolean(cscene, "debug_use_cpu_avx");
 	flags.cpu.sse41 = get_boolean(cscene, "debug_use_cpu_sse41");
 	flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3");
 	flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2");
-	flags.cpu.qbvh = get_boolean(cscene, "debug_use_qbvh");
+	flags.cpu.bvh_layout = (BVHLayout)get_enum(cscene, "debug_bvh_layout");
+	flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel");
 	/* Synchronize CUDA flags. */
 	flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
+	flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel");
 	/* Synchronize OpenCL kernel type. */
 	switch(get_enum(cscene, "debug_opencl_kernel_type")) {
 		case 0:
@@ -104,6 +109,8 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
 	}
 	/* Synchronize other OpenCL flags. */
 	flags.opencl.debug = get_boolean(cscene, "debug_use_opencl_debug");
+	flags.opencl.mem_limit = ((size_t)get_int(cscene, "debug_opencl_mem_limit"))*1024*1024;
+	flags.opencl.single_program = get_boolean(cscene, "debug_opencl_kernel_single_program");
 	return flags.opencl.device_type != opencl_device_type ||
 	       flags.opencl.kernel_type != opencl_kernel_type;
 }
@@ -641,7 +648,7 @@ static PyObject *debug_flags_reset_func(PyObject * /*self*/, PyObject * /*args*/
 	Py_RETURN_NONE;
 }
 
-static PyObject *set_resumable_chunks_func(PyObject * /*self*/, PyObject *args)
+static PyObject *set_resumable_chunk_func(PyObject * /*self*/, PyObject *args)
 {
 	int num_resumable_chunks, current_resumable_chunk;
 	if(!PyArg_ParseTuple(args, "ii",
@@ -676,6 +683,53 @@ static PyObject *set_resumable_chunks_func(PyObject * /*self*/, PyObject *args)
 	Py_RETURN_NONE;
 }
 
+/* Python binding: restrict a resumable render to a range of chunks.
+ * Expects three ints: number of chunks, start chunk, end chunk; chunk
+ * numbers are 1-based and start must not exceed end. */
+static PyObject *set_resumable_chunk_range_func(PyObject * /*self*/, PyObject *args)
+{
+	int num_chunks, start_chunk, end_chunk;
+	if(!PyArg_ParseTuple(args, "iii", &num_chunks, &start_chunk, &end_chunk)) {
+		/* Exception is already set by PyArg_ParseTuple, propagate it. */
+		return NULL;
+	}
+
+	if(num_chunks <= 0) {
+		fprintf(stderr, "Cycles: Bad value for number of resumable chunks.\n");
+		abort();
+		Py_RETURN_NONE;
+	}
+	if(start_chunk < 1 || start_chunk > num_chunks) {
+		fprintf(stderr, "Cycles: Bad value for start chunk number.\n");
+		abort();
+		Py_RETURN_NONE;
+	}
+	if(end_chunk < 1 || end_chunk > num_chunks) {
+		fprintf(stderr, "Cycles: Bad value for end chunk number.\n");
+		abort();
+		Py_RETURN_NONE;
+	}
+	if(start_chunk > end_chunk) {
+		fprintf(stderr, "Cycles: End chunk should be higher than start one.\n");
+		abort();
+		Py_RETURN_NONE;
+	}
+
+	VLOG(1) << "Initialized resumable render: "
+	        << "num_resumable_chunks=" << num_chunks << ", "
+	        << "start_resumable_chunk=" << start_chunk << ", "
+	        << "end_resumable_chunk=" << end_chunk;
+	/* Publish the range through the BlenderSession static members. */
+	BlenderSession::num_resumable_chunks = num_chunks;
+	BlenderSession::start_resumable_chunk = start_chunk;
+	BlenderSession::end_resumable_chunk = end_chunk;
+
+	printf("Cycles: Will render chunks %d to %d of %d\n",
+	       start_chunk, end_chunk, num_chunks);
+
+	Py_RETURN_NONE;
+}
+
 static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
 {
 	vector<DeviceInfo>& devices = Device::available_devices();
@@ -715,7 +769,8 @@ static PyMethodDef methods[] = {
 	{"debug_flags_reset", debug_flags_reset_func, METH_NOARGS, ""},
 
 	/* Resumable render */
-	{"set_resumable_chunks", set_resumable_chunks_func, METH_VARARGS, ""},
+	{"set_resumable_chunk", set_resumable_chunk_func, METH_VARARGS, ""},
+	{"set_resumable_chunk_range", set_resumable_chunk_range_func, METH_VARARGS, ""},
 
 	/* Compute Device selection */
 	{"get_device_types", get_device_types_func, METH_VARARGS, ""},
@@ -760,6 +815,14 @@ void *CCL_python_module_init()
 	PyModule_AddStringConstant(mod, "osl_version_string", "unknown");
 #endif
 
+#ifdef WITH_CYCLES_DEBUG
+	PyModule_AddObject(mod, "with_cycles_debug", Py_True);
+	Py_INCREF(Py_True);
+#else
+	PyModule_AddObject(mod, "with_cycles_debug", Py_False);
+	Py_INCREF(Py_False);
+#endif
+
 #ifdef WITH_NETWORK
 	PyModule_AddObject(mod, "with_network", Py_True);
 	Py_INCREF(Py_True);
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 2f30cbd961f..00d23b9095e 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -16,36 +16,38 @@
 
 #include <stdlib.h>
 
-#include "background.h"
-#include "buffers.h"
-#include "camera.h"
-#include "device.h"
-#include "integrator.h"
-#include "film.h"
-#include "light.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "session.h"
-#include "shader.h"
-
-#include "util_color.h"
-#include "util_foreach.h"
-#include "util_function.h"
-#include "util_hash.h"
-#include "util_logging.h"
-#include "util_progress.h"
-#include "util_time.h"
-
-#include "blender_sync.h"
-#include "blender_session.h"
-#include "blender_util.h"
+#include "render/background.h"
+#include "render/buffers.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/integrator.h"
+#include "render/film.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/session.h"
+#include "render/shader.h"
+
+#include "util/util_color.h"
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_hash.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_time.h"
+
+#include "blender/blender_sync.h"
+#include "blender/blender_session.h"
+#include "blender/blender_util.h"
 
 CCL_NAMESPACE_BEGIN
 
 bool BlenderSession::headless = false;
 int BlenderSession::num_resumable_chunks = 0;
 int BlenderSession::current_resumable_chunk = 0;
+int BlenderSession::start_resumable_chunk = 0;
+int BlenderSession::end_resumable_chunk = 0;
 
 BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
                                BL::UserPreferences& b_userpref,
@@ -68,6 +70,7 @@ BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
 	background = true;
 	last_redraw_time = 0.0;
 	start_resize_time = 0.0;
+	last_status_time = 0.0;
 }
 
 BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
@@ -93,6 +96,7 @@ BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
 	background = false;
 	last_redraw_time = 0.0;
 	start_resize_time = 0.0;
+	last_status_time = 0.0;
 }
 
 BlenderSession::~BlenderSession()
@@ -111,8 +115,7 @@ void BlenderSession::create()
 void BlenderSession::create_session()
 {
 	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
-	bool is_cpu = session_params.device.type == DEVICE_CPU;
-	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background, is_cpu);
+	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
 	bool session_pause = BlenderSync::get_session_pause(b_scene, background);
 
 	/* reset status/progress */
@@ -121,14 +124,6 @@ void BlenderSession::create_session()
 	last_progress = -1.0f;
 	start_resize_time = 0.0;
 
-	/* create scene */
-	scene = new Scene(scene_params, session_params.device);
-
-	/* setup callbacks for builtin image support */
-	scene->image_manager->builtin_image_info_cb = function_bind(&BlenderSession::builtin_image_info, this, _1, _2, _3, _4, _5, _6, _7);
-	scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3, _4);
-	scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3, _4);
-
 	/* create session */
 	session = new Session(session_params);
 	session->scene = scene;
@@ -136,8 +131,18 @@ void BlenderSession::create_session()
 	session->progress.set_cancel_callback(function_bind(&BlenderSession::test_cancel, this));
 	session->set_pause(session_pause);
 
+	/* create scene */
+	scene = new Scene(scene_params, session->device);
+
+	/* setup callbacks for builtin image support */
+	scene->image_manager->builtin_image_info_cb = function_bind(&BlenderSession::builtin_image_info, this, _1, _2, _3);
+	scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3, _4, _5);
+	scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3, _4, _5);
+
+	session->scene = scene;
+
 	/* create sync */
-	sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress, is_cpu);
+	sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress);
 	BL::Object b_camera_override(b_engine.camera_override());
 	if(b_v3d) {
 		if(session_pause == false) {
@@ -175,8 +180,7 @@ void BlenderSession::reset_session(BL::BlendData& b_data_, BL::Scene& b_scene_)
 	b_scene = b_scene_;
 
 	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
-	const bool is_cpu = session_params.device.type == DEVICE_CPU;
-	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background, is_cpu);
+	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
 
 	width = render_resolution_x(b_render);
 	height = render_resolution_y(b_render);
@@ -207,7 +211,7 @@ void BlenderSession::reset_session(BL::BlendData& b_data_, BL::Scene& b_scene_)
 	session->stats.mem_peak = session->stats.mem_used;
 
 	/* sync object should be re-created */
-	sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress, is_cpu);
+	sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress);
 
 	/* for final render we will do full data sync per render layer, only
 	 * do some basic syncing here, no objects or materials for speed */
@@ -239,90 +243,6 @@ void BlenderSession::free_session()
 	delete session;
 }
 
-static PassType get_pass_type(BL::RenderPass& b_pass)
-{
-	switch(b_pass.type()) {
-		case BL::RenderPass::type_COMBINED:
-			return PASS_COMBINED;
-
-		case BL::RenderPass::type_Z:
-			return PASS_DEPTH;
-		case BL::RenderPass::type_MIST:
-			return PASS_MIST;
-		case BL::RenderPass::type_NORMAL:
-			return PASS_NORMAL;
-		case BL::RenderPass::type_OBJECT_INDEX:
-			return PASS_OBJECT_ID;
-		case BL::RenderPass::type_UV:
-			return PASS_UV;
-		case BL::RenderPass::type_VECTOR:
-			return PASS_MOTION;
-		case BL::RenderPass::type_MATERIAL_INDEX:
-			return PASS_MATERIAL_ID;
-
-		case BL::RenderPass::type_DIFFUSE_DIRECT:
-			return PASS_DIFFUSE_DIRECT;
-		case BL::RenderPass::type_GLOSSY_DIRECT:
-			return PASS_GLOSSY_DIRECT;
-		case BL::RenderPass::type_TRANSMISSION_DIRECT:
-			return PASS_TRANSMISSION_DIRECT;
-		case BL::RenderPass::type_SUBSURFACE_DIRECT:
-			return PASS_SUBSURFACE_DIRECT;
-
-		case BL::RenderPass::type_DIFFUSE_INDIRECT:
-			return PASS_DIFFUSE_INDIRECT;
-		case BL::RenderPass::type_GLOSSY_INDIRECT:
-			return PASS_GLOSSY_INDIRECT;
-		case BL::RenderPass::type_TRANSMISSION_INDIRECT:
-			return PASS_TRANSMISSION_INDIRECT;
-		case BL::RenderPass::type_SUBSURFACE_INDIRECT:
-			return PASS_SUBSURFACE_INDIRECT;
-
-		case BL::RenderPass::type_DIFFUSE_COLOR:
-			return PASS_DIFFUSE_COLOR;
-		case BL::RenderPass::type_GLOSSY_COLOR:
-			return PASS_GLOSSY_COLOR;
-		case BL::RenderPass::type_TRANSMISSION_COLOR:
-			return PASS_TRANSMISSION_COLOR;
-		case BL::RenderPass::type_SUBSURFACE_COLOR:
-			return PASS_SUBSURFACE_COLOR;
-
-		case BL::RenderPass::type_EMIT:
-			return PASS_EMISSION;
-		case BL::RenderPass::type_ENVIRONMENT:
-			return PASS_BACKGROUND;
-		case BL::RenderPass::type_AO:
-			return PASS_AO;
-		case BL::RenderPass::type_SHADOW:
-			return PASS_SHADOW;
-
-		case BL::RenderPass::type_DIFFUSE:
-		case BL::RenderPass::type_COLOR:
-		case BL::RenderPass::type_REFRACTION:
-		case BL::RenderPass::type_SPECULAR:
-		case BL::RenderPass::type_REFLECTION:
-			return PASS_NONE;
-#ifdef WITH_CYCLES_DEBUG
-		case BL::RenderPass::type_DEBUG:
-		{
-			switch(b_pass.debug_type()) {
-				case BL::RenderPass::debug_type_BVH_TRAVERSED_NODES:
-					return PASS_BVH_TRAVERSED_NODES;
-				case BL::RenderPass::debug_type_BVH_TRAVERSED_INSTANCES:
-					return PASS_BVH_TRAVERSED_INSTANCES;
-				case BL::RenderPass::debug_type_BVH_INTERSECTIONS:
-					return PASS_BVH_INTERSECTIONS;
-				case BL::RenderPass::debug_type_RAY_BOUNCES:
-					return PASS_RAY_BOUNCES;
-			}
-			break;
-		}
-#endif
-	}
-	
-	return PASS_NONE;
-}
-
 static ShaderEvalType get_shader_type(const string& pass_type)
 {
 	const char *shader_type = pass_type.c_str();
@@ -332,6 +252,8 @@ static ShaderEvalType get_shader_type(const string& pass_type)
 		return SHADER_EVAL_NORMAL;
 	else if(strcmp(shader_type, "UV")==0)
 		return SHADER_EVAL_UV;
+	else if(strcmp(shader_type, "ROUGHNESS")==0)
+		return SHADER_EVAL_ROUGHNESS;
 	else if(strcmp(shader_type, "DIFFUSE_COLOR")==0)
 		return SHADER_EVAL_DIFFUSE_COLOR;
 	else if(strcmp(shader_type, "GLOSSY_COLOR")==0)
@@ -379,18 +301,18 @@ static BL::RenderResult begin_render_result(BL::RenderEngine& b_engine,
 static void end_render_result(BL::RenderEngine& b_engine,
                               BL::RenderResult& b_rr,
                               bool cancel,
+                              bool highlight,
                               bool do_merge_results)
 {
-	b_engine.end_result(b_rr, (int)cancel, (int)do_merge_results);
+	b_engine.end_result(b_rr, (int)cancel, (int) highlight, (int)do_merge_results);
 }
 
-void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only)
+void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight)
 {
-	BufferParams& params = rtile.buffers->params;
-	int x = params.full_x - session->tile_manager.params.full_x;
-	int y = params.full_y - session->tile_manager.params.full_y;
-	int w = params.width;
-	int h = params.height;
+	int x = rtile.x - session->tile_manager.params.full_x;
+	int y = rtile.y - session->tile_manager.params.full_y;
+	int w = rtile.w;
+	int h = rtile.h;
 
 	/* get render result */
 	BL::RenderResult b_rr = begin_render_result(b_engine, x, y, w, h, b_rlay_name.c_str(), b_rview_name.c_str());
@@ -410,47 +332,47 @@ void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_upda
 	BL::RenderLayer b_rlay = *b_single_rlay;
 
 	if(do_update_only) {
-		/* update only needed */
+		/* Sample would be zero at initial tile update, which is only needed
+		 * to tag tile from blender side as IN PROGRESS for proper highlight
+		 * no buffers should be sent to blender yet. For denoise we also
+		 * keep showing the noisy buffers until denoise is done. */
+		bool merge = (rtile.sample != 0) && (rtile.task != RenderTile::DENOISE);
 
-		if(rtile.sample != 0) {
-			/* sample would be zero at initial tile update, which is only needed
-			 * to tag tile form blender side as IN PROGRESS for proper highlight
-			 * no buffers should be sent to blender yet
-			 */
+		if(merge) {
 			update_render_result(b_rr, b_rlay, rtile);
 		}
 
-		end_render_result(b_engine, b_rr, true, true);
+		end_render_result(b_engine, b_rr, true, highlight, merge);
 	}
 	else {
-		/* write result */
+		/* Write final render result. */
 		write_render_result(b_rr, b_rlay, rtile);
-		end_render_result(b_engine, b_rr, false, true);
+		end_render_result(b_engine, b_rr, false, false, true);
 	}
 }
 
 void BlenderSession::write_render_tile(RenderTile& rtile)
 {
-	do_write_update_render_tile(rtile, false);
+	do_write_update_render_tile(rtile, false, false);
 }
 
-void BlenderSession::update_render_tile(RenderTile& rtile)
+void BlenderSession::update_render_tile(RenderTile& rtile, bool highlight)
 {
 	/* use final write for preview renders, otherwise render result wouldn't be
 	 * be updated in blender side
 	 * would need to be investigated a bit further, but for now shall be fine
 	 */
 	if(!b_engine.is_preview())
-		do_write_update_render_tile(rtile, true);
+		do_write_update_render_tile(rtile, true, highlight);
 	else
-		do_write_update_render_tile(rtile, false);
+		do_write_update_render_tile(rtile, false, false);
 }
 
 void BlenderSession::render()
 {
 	/* set callback to write out render results */
 	session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1);
-	session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1);
+	session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1, _2);
 
 	/* get buffer parameters */
 	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
@@ -460,7 +382,10 @@ void BlenderSession::render()
 	BL::RenderSettings r = b_scene.render();
 	BL::RenderSettings::layers_iterator b_layer_iter;
 	BL::RenderResult::views_iterator b_view_iter;
-	
+
+	/* We add some special meta attributes when there is only a single layer. */
+	const bool is_single_layer = (r.layers.length() == 1);
+
 	for(r.layers.begin(b_layer_iter); b_layer_iter != r.layers.end(); ++b_layer_iter) {
 		b_rlay_name = b_layer_iter->name();
 
@@ -471,33 +396,38 @@ void BlenderSession::render()
 
 		/* layer will be missing if it was disabled in the UI */
 		if(b_single_rlay == b_rr.layers.end()) {
-			end_render_result(b_engine, b_rr, true, false);
+			end_render_result(b_engine, b_rr, true, true, false);
 			continue;
 		}
 
 		BL::RenderLayer b_rlay = *b_single_rlay;
 
 		/* add passes */
-		array<Pass> passes;
-		Pass::add(PASS_COMBINED, passes);
-
-		if(session_params.device.advanced_shading) {
-
-			/* loop over passes */
-			BL::RenderLayer::passes_iterator b_pass_iter;
-
-			for(b_rlay.passes.begin(b_pass_iter); b_pass_iter != b_rlay.passes.end(); ++b_pass_iter) {
-				BL::RenderPass b_pass(*b_pass_iter);
-				PassType pass_type = get_pass_type(b_pass);
+		array<Pass> passes = sync->sync_render_passes(b_rlay, *b_layer_iter, session_params);
+		buffer_params.passes = passes;
 
-				if(pass_type == PASS_MOTION && scene->integrator->motion_blur)
-					continue;
-				if(pass_type != PASS_NONE)
-					Pass::add(pass_type, passes);
-			}
-		}
+		PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles");
+		bool use_denoising = get_boolean(crl, "use_denoising");
+		buffer_params.denoising_data_pass = use_denoising;
+		session->tile_manager.schedule_denoising = use_denoising;
+		session->params.use_denoising = use_denoising;
+		scene->film->denoising_data_pass = buffer_params.denoising_data_pass;
+		scene->film->denoising_flags = 0;
+		if(!get_boolean(crl, "denoising_diffuse_direct"))        scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_DIR;
+		if(!get_boolean(crl, "denoising_diffuse_indirect"))      scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_IND;
+		if(!get_boolean(crl, "denoising_glossy_direct"))         scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_DIR;
+		if(!get_boolean(crl, "denoising_glossy_indirect"))       scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_IND;
+		if(!get_boolean(crl, "denoising_transmission_direct"))   scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_DIR;
+		if(!get_boolean(crl, "denoising_transmission_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_IND;
+		if(!get_boolean(crl, "denoising_subsurface_direct"))     scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_DIR;
+		if(!get_boolean(crl, "denoising_subsurface_indirect"))   scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_IND;
+		scene->film->denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES);
+		buffer_params.denoising_clean_pass = scene->film->denoising_clean_pass;
+		session->params.denoising_radius = get_int(crl, "denoising_radius");
+		session->params.denoising_strength = get_float(crl, "denoising_strength");
+		session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength");
+		session->params.denoising_relative_pca = get_boolean(crl, "denoising_relative_pca");
 
-		buffer_params.passes = passes;
 		scene->film->pass_alpha_threshold = b_layer_iter->pass_alpha_threshold();
 		scene->film->tag_passes_update(scene, passes);
 		scene->film->tag_update(scene);
@@ -550,8 +480,17 @@ void BlenderSession::render()
 				break;
 		}
 
+		if(is_single_layer) {
+			BL::RenderResult b_rr = b_engine.get_result();
+			string num_aa_samples = string_printf("%d", session->params.samples);
+			b_rr.stamp_data_add_field("Cycles Samples", num_aa_samples.c_str());
+			/* TODO(sergey): Report whether we're doing resumable render
+			 * and also start/end sample if so.
+			 */
+		}
+
 		/* free result without merging */
-		end_render_result(b_engine, b_rr, true, false);
+		end_render_result(b_engine, b_rr, true, true, false);
 
 		if(session->progress.get_cancel())
 			break;
@@ -632,8 +571,6 @@ void BlenderSession::bake(BL::Object& b_object,
                           float result[])
 {
 	ShaderEvalType shader_type = get_shader_type(pass_type);
-	size_t object_index = OBJECT_NONE;
-	int tri_offset = 0;
 
 	/* Set baking flag in advance, so kernel loading can check if we need
 	 * any baking capabilities.
@@ -643,9 +580,6 @@ void BlenderSession::bake(BL::Object& b_object,
 	/* ensure kernels are loaded before we do any scene updates */
 	session->load_kernels();
 
-	if(session->progress.get_cancel())
-		return;
-
 	if(shader_type == SHADER_EVAL_UV) {
 		/* force UV to be available */
 		Pass::add(PASS_UV, scene->film->passes);
@@ -663,50 +597,61 @@ void BlenderSession::bake(BL::Object& b_object,
 	scene->film->tag_update(scene);
 	scene->integrator->tag_update(scene);
 
-	/* update scene */
-	BL::Object b_camera_override(b_engine.camera_override());
-	sync->sync_camera(b_render, b_camera_override, width, height, "");
-	sync->sync_data(b_render,
-	                b_v3d,
-	                b_camera_override,
-	                width, height,
-	                &python_thread_state,
-	                b_rlay_name.c_str());
+	if(!session->progress.get_cancel()) {
+		/* Update scene, skipped when the session was cancelled. */
+		BL::Object b_camera_override(b_engine.camera_override());
+		sync->sync_camera(b_render, b_camera_override, width, height, "");
+		sync->sync_data(b_render,
+		                b_v3d,
+		                b_camera_override,
+		                width, height,
+		                &python_thread_state,
+		                b_rlay_name.c_str());
+	}
 
-	/* get buffer parameters */
-	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
-	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height);
+	BakeData *bake_data = NULL;
 
-	scene->bake_manager->set_shader_limit((size_t)b_engine.tile_x(), (size_t)b_engine.tile_y());
+	if(!session->progress.get_cancel()) {
+		/* get buffer parameters */
+		SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
+		BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height);
 
-	/* set number of samples */
-	session->tile_manager.set_samples(session_params.samples);
-	session->reset(buffer_params, session_params.samples);
-	session->update_scene();
+		scene->bake_manager->set_shader_limit((size_t)b_engine.tile_x(), (size_t)b_engine.tile_y());
 
-	/* find object index. todo: is arbitrary - copied from mesh_displace.cpp */
-	for(size_t i = 0; i < scene->objects.size(); i++) {
-		if(strcmp(scene->objects[i]->name.c_str(), b_object.name().c_str()) == 0) {
-			object_index = i;
-			tri_offset = scene->objects[i]->mesh->tri_offset;
-			break;
-		}
-	}
+		/* set number of samples */
+		session->tile_manager.set_samples(session_params.samples);
+		session->reset(buffer_params, session_params.samples);
+		session->update_scene();
+
+		/* find object index. todo: is arbitrary - copied from mesh_displace.cpp */
+		size_t object_index = OBJECT_NONE;
+		int tri_offset = 0;
 
-	int object = object_index;
+		for(size_t i = 0; i < scene->objects.size(); i++) {
+			if(strcmp(scene->objects[i]->name.c_str(), b_object.name().c_str()) == 0) {
+				object_index = i;
+				tri_offset = scene->objects[i]->mesh->tri_offset;
+				break;
+			}
+		}
 
-	BakeData *bake_data = scene->bake_manager->init(object, tri_offset, num_pixels);
+		int object = object_index;
 
-	populate_bake_data(bake_data, object_id, pixel_array, num_pixels);
+		bake_data = scene->bake_manager->init(object, tri_offset, num_pixels);
+		populate_bake_data(bake_data, object_id, pixel_array, num_pixels);
 
-	/* set number of samples */
-	session->tile_manager.set_samples(session_params.samples);
-	session->reset(buffer_params, session_params.samples);
-	session->update_scene();
+		/* set number of samples */
+		session->tile_manager.set_samples(session_params.samples);
+		session->reset(buffer_params, session_params.samples);
+		session->update_scene();
 
-	session->progress.set_update_callback(function_bind(&BlenderSession::update_bake_progress, this));
+		session->progress.set_update_callback(function_bind(&BlenderSession::update_bake_progress, this));
+	}
 
-	scene->bake_manager->bake(scene->device, &scene->dscene, scene, session->progress, shader_type, bake_pass_filter, bake_data, result);
+	/* Perform bake. Check cancel to avoid crash with incomplete scene data. */
+	if(!session->progress.get_cancel()) {
+		scene->bake_manager->bake(scene->device, &scene->dscene, scene, session->progress, shader_type, bake_pass_filter, bake_data, result);
+	}
 
 	/* free all memory used (host and device), so we wouldn't leave render
 	 * engine with extra memory allocated
@@ -729,10 +674,9 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult& b_rr,
 	if(!buffers->copy_from_device())
 		return;
 
-	BufferParams& params = buffers->params;
 	float exposure = scene->film->exposure;
 
-	vector<float> pixels(params.width*params.height*4);
+	vector<float> pixels(rtile.w*rtile.h*4);
 
 	/* Adjust absolute sample number to the range. */
 	int sample = rtile.sample;
@@ -749,19 +693,31 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult& b_rr,
 			BL::RenderPass b_pass(*b_iter);
 
 			/* find matching pass type */
-			PassType pass_type = get_pass_type(b_pass);
+			PassType pass_type = BlenderSync::get_pass_type(b_pass);
 			int components = b_pass.channels();
 
-			/* copy pixels */
-			if(!buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0]))
+			bool read = false;
+			if(pass_type != PASS_NONE) {
+				/* copy pixels */
+				read = buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0]);
+			}
+			else {
+				int denoising_offset = BlenderSync::get_denoising_pass(b_pass);
+				if(denoising_offset >= 0) {
+					read = buffers->get_denoising_pass_rect(denoising_offset, exposure, sample, components, &pixels[0]);
+				}
+			}
+
+			if(!read) {
 				memset(&pixels[0], 0, pixels.size()*sizeof(float));
+			}
 
 			b_pass.rect(&pixels[0]);
 		}
 	}
 	else {
 		/* copy combined pass */
-		BL::RenderPass b_combined_pass(b_rlay.passes.find_by_type(BL::RenderPass::type_COMBINED, b_rview_name.c_str()));
+		BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str()));
 		if(buffers->get_pass_rect(PASS_COMBINED, exposure, sample, 4, &pixels[0]))
 			b_combined_pass.rect(&pixels[0]);
 	}
@@ -792,8 +748,7 @@ void BlenderSession::synchronize()
 
 	/* on session/scene parameter changes, we recreate session entirely */
 	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
-	const bool is_cpu = session_params.device.type == DEVICE_CPU;
-	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background, is_cpu);
+	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
 	bool session_pause = BlenderSync::get_session_pause(b_scene, background);
 
 	if(session->params.modified(session_params) ||
@@ -989,10 +944,14 @@ void BlenderSession::update_status_progress()
 	if(substatus.size() > 0)
 		status += " | " + substatus;
 
-	if(status != last_status) {
+	double current_time = time_dt();
+	/* When rendering in a window, redraw the status at least once per second to keep the elapsed and remaining time up-to-date.
+	 * For headless rendering, only report when something significant changes to keep the console output readable. */
+	if(status != last_status || (!headless && (current_time - last_status_time) > 1.0)) {
 		b_engine.update_stats("", (timestatus + scene + status).c_str());
 		b_engine.update_memory_stats(mem_used, mem_peak);
 		last_status = status;
+		last_status_time = current_time;
 	}
 	if(progress != last_progress) {
 		b_engine.update_progress(progress);
@@ -1061,18 +1020,11 @@ int BlenderSession::builtin_image_frame(const string &builtin_name)
 
 void BlenderSession::builtin_image_info(const string &builtin_name,
                                         void *builtin_data,
-                                        bool &is_float,
-                                        int &width,
-                                        int &height,
-                                        int &depth,
-                                        int &channels)
+                                        ImageMetaData& metadata)
 {
 	/* empty image */
-	is_float = false;
-	width = 1;
-	height = 1;
-	depth = 0;
-	channels = 0;
+	metadata.width = 1;
+	metadata.height = 1;
 
 	if(!builtin_data)
 		return;
@@ -1086,32 +1038,34 @@ void BlenderSession::builtin_image_info(const string &builtin_name,
 		/* image data */
 		BL::Image b_image(b_id);
 
-		is_float = b_image.is_float();
-		width = b_image.size()[0];
-		height = b_image.size()[1];
-		depth = 1;
-		channels = b_image.channels();
+		metadata.builtin_free_cache = !b_image.has_data();
+		metadata.is_float = b_image.is_float();
+		metadata.width = b_image.size()[0];
+		metadata.height = b_image.size()[1];
+		metadata.depth = 1;
+		metadata.channels = b_image.channels();
 	}
 	else if(b_id.is_a(&RNA_Object)) {
 		/* smoke volume data */
 		BL::Object b_ob(b_id);
 		BL::SmokeDomainSettings b_domain = object_smoke_domain_find(b_ob);
 
-		is_float = true;
-		depth = 1;
-		channels = 1;
+		metadata.is_float = true;
+		metadata.depth = 1;
+		metadata.channels = 1;
 
 		if(!b_domain)
 			return;
 
 		if(builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_DENSITY) ||
 		   builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_FLAME) ||
-		   builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_HEAT))
-			channels = 1;
+		   builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_HEAT) ||
+		   builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_TEMPERATURE))
+			metadata.channels = 1;
 		else if(builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_COLOR))
-			channels = 4;
+			metadata.channels = 4;
 		else if(builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_VELOCITY))
-			channels = 3;
+			metadata.channels = 3;
 		else
 			return;
 
@@ -1125,9 +1079,9 @@ void BlenderSession::builtin_image_info(const string &builtin_name,
 			amplify = 1;
 		}
 
-		width = resolution.x * amplify;
-		height = resolution.y * amplify;
-		depth = resolution.z * amplify;
+		metadata.width = resolution.x * amplify;
+		metadata.height = resolution.y * amplify;
+		metadata.depth = resolution.z * amplify;
 	}
 	else {
 		/* TODO(sergey): Check we're indeed in shader node tree. */
@@ -1136,9 +1090,11 @@ void BlenderSession::builtin_image_info(const string &builtin_name,
 		BL::Node b_node(ptr);
 		if(b_node.is_a(&RNA_ShaderNodeTexPointDensity)) {
 			BL::ShaderNodeTexPointDensity b_point_density_node(b_node);
-			channels = 4;
-			width = height = depth = b_point_density_node.resolution();
-			is_float = true;
+			metadata.channels = 4;
+			metadata.width = b_point_density_node.resolution();
+			metadata.height = metadata.width;
+			metadata.depth = metadata.width;
+			metadata.is_float = true;
 		}
 	}
 }
@@ -1146,7 +1102,8 @@ void BlenderSession::builtin_image_info(const string &builtin_name,
 bool BlenderSession::builtin_image_pixels(const string &builtin_name,
                                           void *builtin_data,
                                           unsigned char *pixels,
-                                          const size_t pixels_size)
+                                          const size_t pixels_size,
+                                          const bool free_cache)
 {
 	if(!builtin_data) {
 		return false;
@@ -1167,7 +1124,6 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name,
 
 	if(image_pixels && num_pixels * channels == pixels_size) {
 		memcpy(pixels, image_pixels, pixels_size * sizeof(unsigned char));
-		MEM_freeN(image_pixels);
 	}
 	else {
 		if(channels == 1) {
@@ -1186,6 +1142,16 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name,
 			}
 		}
 	}
+
+	if(image_pixels) {
+		MEM_freeN(image_pixels);
+	}
+
+	/* Free image buffers to save memory during render. */
+	if(free_cache) {
+		b_image.buffers_free();
+	}
+
 	/* Premultiply, byte images are always straight for Blender. */
 	unsigned char *cp = pixels;
 	for(size_t i = 0; i < num_pixels; i++, cp += channels) {
@@ -1199,7 +1165,8 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name,
 bool BlenderSession::builtin_image_float_pixels(const string &builtin_name,
                                                 void *builtin_data,
                                                 float *pixels,
-                                                const size_t pixels_size)
+                                                const size_t pixels_size,
+                                                const bool free_cache)
 {
 	if(!builtin_data) {
 		return false;
@@ -1224,7 +1191,6 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name,
 
 		if(image_pixels && num_pixels * channels == pixels_size) {
 			memcpy(pixels, image_pixels, pixels_size * sizeof(float));
-			MEM_freeN(image_pixels);
 		}
 		else {
 			if(channels == 1) {
@@ -1244,6 +1210,15 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name,
 			}
 		}
 
+		if(image_pixels) {
+			MEM_freeN(image_pixels);
+		}
+
+		/* Free image buffers to save memory during render. */
+		if(free_cache) {
+			b_image.buffers_free();
+		}
+
 		return true;
 	}
 	else if(b_id.is_a(&RNA_Object)) {
@@ -1308,6 +1283,13 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name,
 				return true;
 			}
 		}
+		else if(builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_TEMPERATURE)) {
+			SmokeDomainSettings_temperature_grid_get_length(&b_domain.ptr, &length);
+			if(length == num_pixels) {
+				SmokeDomainSettings_temperature_grid_get(&b_domain.ptr, pixels);
+				return true;
+			}
+		}
 		else {
 			fprintf(stderr,
 			        "Cycles error: unknown volume attribute %s, skipping\n",
@@ -1342,9 +1324,21 @@ void BlenderSession::update_resumable_tile_manager(int num_samples)
 		return;
 	}
 
-	int num_samples_per_chunk = (int)ceilf((float)num_samples / num_resumable_chunks);
-	int range_start_sample = num_samples_per_chunk * (current_resumable_chunk - 1);
-	int range_num_samples = num_samples_per_chunk;
+	const int num_samples_per_chunk = (int)ceilf((float)num_samples / num_resumable_chunks);
+
+	int range_start_sample, range_num_samples;
+	if(current_resumable_chunk != 0) {
+		/* Single chunk rendering. */
+		range_start_sample = num_samples_per_chunk * (current_resumable_chunk - 1);
+		range_num_samples = num_samples_per_chunk;
+	}
+	else {
+		/* Ranged-chunks. */
+		const int num_chunks = end_resumable_chunk - start_resumable_chunk + 1;
+		range_start_sample = num_samples_per_chunk * (start_resumable_chunk - 1);
+		range_num_samples = num_chunks * num_samples_per_chunk;
+	}
+	/* Make sure we don't overshoot: clamp to the samples left after the range start. */
 	if(range_start_sample + range_num_samples > num_samples) {
-		range_num_samples = num_samples - range_num_samples;
+		range_num_samples = num_samples - range_start_sample;
 	}
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index 82fe218b4ce..3804e07cffc 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -17,15 +17,16 @@
 #ifndef __BLENDER_SESSION_H__
 #define __BLENDER_SESSION_H__
 
-#include "device.h"
-#include "scene.h"
-#include "session.h"
-#include "bake.h"
+#include "device/device.h"
+#include "render/scene.h"
+#include "render/session.h"
+#include "render/bake.h"
 
-#include "util_vector.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
+class ImageMetaData;
 class Scene;
 class Session;
 class RenderBuffers;
@@ -79,7 +80,7 @@ public:
 	void update_render_result(BL::RenderResult& b_rr,
 	                          BL::RenderLayer& b_rlay,
 	                          RenderTile& rtile);
-	void update_render_tile(RenderTile& rtile);
+	void update_render_tile(RenderTile& rtile, bool highlight);
 
 	/* interactive updates */
 	void synchronize();
@@ -113,6 +114,7 @@ public:
 	string last_status;
 	string last_error;
 	float last_progress;
+	double last_status_time;
 
 	int width, height;
 	double start_resize_time;
@@ -137,29 +139,31 @@ public:
 	/* Current resumable chunk index to render. */
 	static int current_resumable_chunk;
 
+	/* Alternative to single-chunk rendering to render a range of chunks. */
+	static int start_resumable_chunk;
+	static int end_resumable_chunk;
+
 protected:
 	void do_write_update_render_result(BL::RenderResult& b_rr,
 	                                   BL::RenderLayer& b_rlay,
 	                                   RenderTile& rtile,
 	                                   bool do_update_only);
-	void do_write_update_render_tile(RenderTile& rtile, bool do_update_only);
+	void do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight);
 
 	int builtin_image_frame(const string &builtin_name);
 	void builtin_image_info(const string &builtin_name,
 	                        void *builtin_data,
-	                        bool &is_float,
-	                        int &width,
-	                        int &height,
-	                        int &depth,
-	                        int &channels);
+	                        ImageMetaData& metadata);
 	bool builtin_image_pixels(const string &builtin_name,
 	                          void *builtin_data,
 	                          unsigned char *pixels,
-	                          const size_t pixels_size);
+	                          const size_t pixels_size,
+	                          const bool free_cache);
 	bool builtin_image_float_pixels(const string &builtin_name,
 	                                void *builtin_data,
 	                                float *pixels,
-	                                const size_t pixels_size);
+	                                const size_t pixels_size,
+	                                const bool free_cache);
 
 	/* Update tile manager to reflect resumable render settings. */
 	void update_resumable_tile_manager(int num_samples);
diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp
index 8baa53fc2ec..eb9968a85c2 100644
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -14,20 +14,23 @@
  * limitations under the License.
  */
 
-#include "background.h"
-#include "graph.h"
-#include "light.h"
-#include "nodes.h"
-#include "osl.h"
-#include "scene.h"
-#include "shader.h"
-
-#include "blender_texture.h"
-#include "blender_sync.h"
-#include "blender_util.h"
-
-#include "util_debug.h"
-#include "util_string.h"
+#include "render/background.h"
+#include "render/graph.h"
+#include "render/light.h"
+#include "render/nodes.h"
+#include "render/osl.h"
+#include "render/scene.h"
+#include "render/shader.h"
+
+#include "blender/blender_texture.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
+
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_string.h"
+#include "util/util_set.h"
+#include "util/util_task.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -430,6 +433,9 @@ static ShaderNode *add_node(Scene *scene,
 			case BL::ShaderNodeSubsurfaceScattering::falloff_BURLEY:
 				subsurface->falloff = CLOSURE_BSSRDF_BURLEY_ID;
 				break;
+			case BL::ShaderNodeSubsurfaceScattering::falloff_RANDOM_WALK:
+				subsurface->falloff = CLOSURE_BSSRDF_RANDOM_WALK_ID;
+				break;
 		}
 
 		node = subsurface;
@@ -518,6 +524,27 @@ static ShaderNode *add_node(Scene *scene,
 		}
 		node = hair;
 	}
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfPrincipled)) {
+		BL::ShaderNodeBsdfPrincipled b_principled_node(b_node);
+		PrincipledBsdfNode *principled = new PrincipledBsdfNode();
+		switch (b_principled_node.distribution()) {
+			case BL::ShaderNodeBsdfPrincipled::distribution_GGX:
+				principled->distribution = CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID;
+				break;
+			case BL::ShaderNodeBsdfPrincipled::distribution_MULTI_GGX:
+				principled->distribution = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID;
+				break;
+		}
+		switch (b_principled_node.subsurface_method()) {
+			case BL::ShaderNodeBsdfPrincipled::subsurface_method_BURLEY:
+				principled->subsurface_method = CLOSURE_BSSRDF_PRINCIPLED_ID;
+				break;
+			case BL::ShaderNodeBsdfPrincipled::subsurface_method_RANDOM_WALK:
+				principled->subsurface_method = CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID;
+				break;
+		}
+		node = principled;
+	}
 	else if(b_node.is_a(&RNA_ShaderNodeBsdfTranslucent)) {
 		node = new TranslucentBsdfNode();
 	}
@@ -539,6 +566,10 @@ static ShaderNode *add_node(Scene *scene,
 	else if(b_node.is_a(&RNA_ShaderNodeVolumeAbsorption)) {
 		node = new AbsorptionVolumeNode();
 	}
+	else if(b_node.is_a(&RNA_ShaderNodeVolumePrincipled)) {
+		PrincipledVolumeNode *principled = new PrincipledVolumeNode();
+		node = principled;
+	}
 	else if(b_node.is_a(&RNA_ShaderNodeNewGeometry)) {
 		node = new GeometryNode();
 	}
@@ -850,6 +881,25 @@ static ShaderNode *add_node(Scene *scene,
 			        transform_inverse(get_transform(b_ob.matrix_world()));
 		}
 	}
+	else if(b_node.is_a(&RNA_ShaderNodeBevel)) {
+		BL::ShaderNodeBevel b_bevel_node(b_node);
+		BevelNode *bevel = new BevelNode();
+		bevel->samples = b_bevel_node.samples();
+		node = bevel;
+	}
+	else if(b_node.is_a(&RNA_ShaderNodeDisplacement)) {
+		BL::ShaderNodeDisplacement b_disp_node(b_node);
+		DisplacementNode *disp = new DisplacementNode();
+		disp->space = (NodeNormalMapSpace)b_disp_node.space();
+		node = disp;
+	}
+	else if(b_node.is_a(&RNA_ShaderNodeVectorDisplacement)) {
+		BL::ShaderNodeVectorDisplacement b_disp_node(b_node);
+		VectorDisplacementNode *disp = new VectorDisplacementNode();
+		disp->space = (NodeNormalMapSpace)b_disp_node.space();
+		disp->attribute = "";
+		node = disp;
+	}
 
 	if(node) {
 		node->name = b_node.name();
@@ -978,6 +1028,10 @@ static void add_nodes(Scene *scene,
 			for(b_node->internal_links.begin(b_link); b_link != b_node->internal_links.end(); ++b_link) {
 				BL::NodeSocket to_socket(b_link->to_socket());
 				SocketType::Type to_socket_type = convert_socket_type(to_socket);
+				if (to_socket_type == SocketType::UNDEFINED) {
+					continue;
+				}
+
 				ConvertNode *proxy = new ConvertNode(to_socket_type, to_socket_type, true);
 
 				input_map[b_link->from_socket().ptr.data] = proxy->inputs[0];
@@ -1001,6 +1055,10 @@ static void add_nodes(Scene *scene,
 			 */
 			for(b_node->inputs.begin(b_input); b_input != b_node->inputs.end(); ++b_input) {
 				SocketType::Type input_type = convert_socket_type(*b_input);
+				if (input_type == SocketType::UNDEFINED) {
+					continue;
+				}
+
 				ConvertNode *proxy = new ConvertNode(input_type, input_type, true);
 				graph->add(proxy);
 
@@ -1013,6 +1071,10 @@ static void add_nodes(Scene *scene,
 			}
 			for(b_node->outputs.begin(b_output); b_output != b_node->outputs.end(); ++b_output) {
 				SocketType::Type output_type = convert_socket_type(*b_output);
+				if (output_type == SocketType::UNDEFINED) {
+					continue;
+				}
+
 				ConvertNode *proxy = new ConvertNode(output_type, output_type, true);
 				graph->add(proxy);
 
@@ -1164,6 +1226,9 @@ void BlenderSync::sync_materials(bool update_all)
 	/* material loop */
 	BL::BlendData::materials_iterator b_mat;
 
+	TaskPool pool;
+	set<Shader*> updated_shaders;
+
 	for(b_data.materials.begin(b_mat); b_mat != b_data.materials.end(); ++b_mat) {
 		Shader *shader;
 
@@ -1196,12 +1261,40 @@ void BlenderSync::sync_materials(bool update_all)
 			shader->heterogeneous_volume = !get_boolean(cmat, "homogeneous_volume");
 			shader->volume_sampling_method = get_volume_sampling(cmat);
 			shader->volume_interpolation_method = get_volume_interpolation(cmat);
-			shader->displacement_method = (experimental) ? get_displacement_method(cmat) : DISPLACE_BUMP;
+			shader->displacement_method = get_displacement_method(cmat);
 
 			shader->set_graph(graph);
-			shader->tag_update(scene);
+
+			/* By simplifying the shader graph as soon as possible, some
+			 * redundant shader nodes might be removed which prevents loading
+			 * unnecessary attributes later.
+			 *
+			 * However, since graph simplification also accounts for e.g. mix
+			 * weight, this would cause frequent expensive resyncs in interactive
+			 * sessions, so for those sessions optimization is only performed
+			 * right before compiling.
+			 */
+			if(!preview) {
+				pool.push(function_bind(&ShaderGraph::simplify, graph, scene));
+				/* NOTE: Update shaders out of the threads since those routines
+				 * are accessing and writing to a global context.
+				 */
+				updated_shaders.insert(shader);
+			}
+			else {
+				/* NOTE: Update tagging can access links which are being
+				 * optimized out.
+				 */
+				shader->tag_update(scene);
+			}
 		}
 	}
+
+	pool.wait_work();
+
+	foreach(Shader *shader, updated_shaders) {
+		shader->tag_update(scene);
+	}
 }
 
 /* Sync World */
@@ -1242,11 +1335,8 @@ void BlenderSync::sync_world(bool update_all)
 			/* AO */
 			BL::WorldLighting b_light = b_world.light_settings();
 
-			if(b_light.use_ambient_occlusion())
-				background->ao_factor = b_light.ao_factor();
-			else
-				background->ao_factor = 0.0f;
-
+			background->use_ao = b_light.use_ambient_occlusion();
+			background->ao_factor = b_light.ao_factor();
 			background->ao_distance = b_light.distance();
 
 			/* visibility */
@@ -1262,6 +1352,7 @@ void BlenderSync::sync_world(bool update_all)
 			background->visibility = visibility;
 		}
 		else {
+			background->use_ao = false;
 			background->ao_factor = 0.0f;
 			background->ao_distance = FLT_MAX;
 		}
@@ -1282,8 +1373,17 @@ void BlenderSync::sync_world(bool update_all)
 	else
 		background->transparent = b_scene.render().alpha_mode() == BL::RenderSettings::alpha_mode_TRANSPARENT;
 
+	if(background->transparent) {
+		background->transparent_glass = get_boolean(cscene, "film_transparent_glass");
+		background->transparent_roughness_threshold = get_float(cscene, "film_transparent_roughness");
+	}
+	else {
+		background->transparent_glass = false;
+		background->transparent_roughness_threshold = 0.0f;
+	}
+
 	background->use_shader = render_layer.use_background_shader;
-	background->use_ao = render_layer.use_background_ao;
+	background->use_ao = background->use_ao && render_layer.use_background_ao;
 
 	if(background->modified(prevbackground))
 		background->tag_update(scene);
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index d8043105cd8..283aa5600fd 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -14,29 +14,29 @@
  * limitations under the License.
  */
 
-#include "background.h"
-#include "camera.h"
-#include "film.h"
-#include "graph.h"
-#include "integrator.h"
-#include "light.h"
-#include "mesh.h"
-#include "nodes.h"
-#include "object.h"
-#include "scene.h"
-#include "shader.h"
-#include "curves.h"
-
-#include "device.h"
-
-#include "blender_sync.h"
-#include "blender_session.h"
-#include "blender_util.h"
-
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_opengl.h"
-#include "util_hash.h"
+#include "render/background.h"
+#include "render/camera.h"
+#include "render/film.h"
+#include "render/graph.h"
+#include "render/integrator.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/nodes.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/curves.h"
+
+#include "device/device.h"
+
+#include "blender/blender_sync.h"
+#include "blender/blender_session.h"
+#include "blender/blender_util.h"
+
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_opengl.h"
+#include "util/util_hash.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -47,8 +47,7 @@ BlenderSync::BlenderSync(BL::RenderEngine& b_engine,
                          BL::Scene& b_scene,
                          Scene *scene,
                          bool preview,
-                         Progress &progress,
-                         bool is_cpu)
+                         Progress &progress)
 : b_engine(b_engine),
   b_data(b_data),
   b_scene(b_scene),
@@ -62,7 +61,6 @@ BlenderSync::BlenderSync(BL::RenderEngine& b_engine,
   scene(scene),
   preview(preview),
   experimental(false),
-  is_cpu(is_cpu),
   dicing_rate(1.0f),
   max_subdivisions(12),
   progress(progress)
@@ -210,10 +208,9 @@ void BlenderSync::sync_data(BL::RenderSettings& b_render,
 	   scene->need_motion() == Scene::MOTION_NONE ||
 	   scene->camera->motion_position == Camera::MOTION_POSITION_CENTER)
 	{
-		sync_objects(b_v3d);
+		sync_objects();
 	}
 	sync_motion(b_render,
-	            b_v3d,
 	            b_override,
 	            width, height,
 	            python_thread_state);
@@ -225,9 +222,7 @@ void BlenderSync::sync_data(BL::RenderSettings& b_render,
 
 void BlenderSync::sync_integrator()
 {
-#ifdef __CAMERA_MOTION__
 	BL::RenderSettings r = b_scene.render();
-#endif
 	PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
 
 	experimental = (get_enum(cscene, "feature_set") != 0);
@@ -235,7 +230,6 @@ void BlenderSync::sync_integrator()
 	Integrator *integrator = scene->integrator;
 	Integrator previntegrator = *integrator;
 
-	integrator->min_bounce = get_int(cscene, "min_bounces");
 	integrator->max_bounce = get_int(cscene, "max_bounces");
 
 	integrator->max_diffuse_bounce = get_int(cscene, "diffuse_bounces");
@@ -244,8 +238,6 @@ void BlenderSync::sync_integrator()
 	integrator->max_volume_bounce = get_int(cscene, "volume_bounces");
 
 	integrator->transparent_max_bounce = get_int(cscene, "transparent_max_bounces");
-	integrator->transparent_min_bounce = get_int(cscene, "transparent_min_bounces");
-	integrator->transparent_shadows = get_boolean(cscene, "use_transparent_shadows");
 
 	integrator->volume_max_steps = get_int(cscene, "volume_max_steps");
 	integrator->volume_step_size = get_float(cscene, "volume_step_size");
@@ -275,7 +267,6 @@ void BlenderSync::sync_integrator()
 
 	integrator->sample_clamp_direct = get_float(cscene, "sample_clamp_direct");
 	integrator->sample_clamp_indirect = get_float(cscene, "sample_clamp_indirect");
-#ifdef __CAMERA_MOTION__
 	if(!preview) {
 		if(integrator->motion_blur != r.use_motion_blur()) {
 			scene->object_manager->tag_update(scene);
@@ -284,7 +275,6 @@ void BlenderSync::sync_integrator()
 
 		integrator->motion_blur = r.use_motion_blur();
 	}
-#endif
 
 	integrator->method = (Integrator::Method)get_enum(cscene,
 	                                                  "progressive",
@@ -330,6 +320,9 @@ void BlenderSync::sync_integrator()
 			integrator->ao_bounces = get_int(cscene, "ao_bounces_render");
 		}
 	}
+	else {
+		integrator->ao_bounces = 0;
+	}
 
 	if(integrator->modified(previntegrator))
 		integrator->tag_update(scene);
@@ -480,11 +473,156 @@ void BlenderSync::sync_images()
 	}
 }
 
+/* Passes */
+PassType BlenderSync::get_pass_type(BL::RenderPass& b_pass)
+{
+	string name = b_pass.name();
+#define MAP_PASS(passname, passtype) if(name == passname) return passtype;
+	/* Exact-match lookup by pass name. NOTE: Keep in sync with defined names from DNA_scene_types.h */
+	MAP_PASS("Combined", PASS_COMBINED);
+	MAP_PASS("Depth", PASS_DEPTH);
+	MAP_PASS("Mist", PASS_MIST);
+	MAP_PASS("Normal", PASS_NORMAL);
+	MAP_PASS("IndexOB", PASS_OBJECT_ID);
+	MAP_PASS("UV", PASS_UV);
+	MAP_PASS("Vector", PASS_MOTION);
+	MAP_PASS("IndexMA", PASS_MATERIAL_ID);
+
+	MAP_PASS("DiffDir", PASS_DIFFUSE_DIRECT);
+	MAP_PASS("GlossDir", PASS_GLOSSY_DIRECT);
+	MAP_PASS("TransDir", PASS_TRANSMISSION_DIRECT);
+	MAP_PASS("SubsurfaceDir", PASS_SUBSURFACE_DIRECT);
+	MAP_PASS("VolumeDir", PASS_VOLUME_DIRECT);
+
+	MAP_PASS("DiffInd", PASS_DIFFUSE_INDIRECT);
+	MAP_PASS("GlossInd", PASS_GLOSSY_INDIRECT);
+	MAP_PASS("TransInd", PASS_TRANSMISSION_INDIRECT);
+	MAP_PASS("SubsurfaceInd", PASS_SUBSURFACE_INDIRECT);
+	MAP_PASS("VolumeInd", PASS_VOLUME_INDIRECT);
+
+	MAP_PASS("DiffCol", PASS_DIFFUSE_COLOR);
+	MAP_PASS("GlossCol", PASS_GLOSSY_COLOR);
+	MAP_PASS("TransCol", PASS_TRANSMISSION_COLOR);
+	MAP_PASS("SubsurfaceCol", PASS_SUBSURFACE_COLOR);
+
+	MAP_PASS("Emit", PASS_EMISSION);
+	MAP_PASS("Env", PASS_BACKGROUND);
+	MAP_PASS("AO", PASS_AO);
+	MAP_PASS("Shadow", PASS_SHADOW);
+
+#ifdef __KERNEL_DEBUG__
+	MAP_PASS("Debug BVH Traversed Nodes", PASS_BVH_TRAVERSED_NODES);
+	MAP_PASS("Debug BVH Traversed Instances", PASS_BVH_TRAVERSED_INSTANCES);
+	MAP_PASS("Debug BVH Intersections", PASS_BVH_INTERSECTIONS);
+	MAP_PASS("Debug Ray Bounces", PASS_RAY_BOUNCES);
+#endif
+	MAP_PASS("Debug Render Time", PASS_RENDER_TIME);
+#undef MAP_PASS
+	/* No name matched: the pass has no corresponding pass type here. */
+	return PASS_NONE;
+}
+
+int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
+{
+	/* Denoising passes are named "Denoising <suffix>"; -1 means no match. */
+	const string prefix = "Denoising ";
+	const string pass_name = b_pass.name();
+	if(pass_name.compare(0, prefix.size(), prefix) != 0) {
+		return -1;
+	}
+	const string suffix = pass_name.substr(prefix.size());
+#define MAP_PASS(passname, offset) if(suffix == passname) return offset;
+	MAP_PASS("Normal", DENOISING_PASS_NORMAL);
+	MAP_PASS("Normal Variance", DENOISING_PASS_NORMAL_VAR);
+	MAP_PASS("Albedo", DENOISING_PASS_ALBEDO);
+	MAP_PASS("Albedo Variance", DENOISING_PASS_ALBEDO_VAR);
+	MAP_PASS("Depth", DENOISING_PASS_DEPTH);
+	MAP_PASS("Depth Variance", DENOISING_PASS_DEPTH_VAR);
+	MAP_PASS("Shadow A", DENOISING_PASS_SHADOW_A);
+	MAP_PASS("Shadow B", DENOISING_PASS_SHADOW_B);
+	MAP_PASS("Image", DENOISING_PASS_COLOR);
+	MAP_PASS("Image Variance", DENOISING_PASS_COLOR_VAR);
+#undef MAP_PASS
+	return -1;
+}
+
+array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
+                                            BL::SceneRenderLayer& b_srlay,
+                                            const SessionParams &session_params)
+{
+	array<Pass> passes;
+	Pass::add(PASS_COMBINED, passes);
+	/* The combined pass is always added; without advanced shading it is the only one. */
+	if(!session_params.device.advanced_shading) {
+		return passes;
+	}
+
+	/* Add each Blender pass whose name maps to a Cycles pass type. */
+	BL::RenderLayer::passes_iterator b_pass_iter;
+
+	for(b_rlay.passes.begin(b_pass_iter); b_pass_iter != b_rlay.passes.end(); ++b_pass_iter) {
+		BL::RenderPass b_pass(*b_pass_iter);
+		PassType pass_type = get_pass_type(b_pass);
+		/* The motion pass is skipped while motion blur is enabled. */
+		if(pass_type == PASS_MOTION && scene->integrator->motion_blur)
+			continue;
+		if(pass_type != PASS_NONE)
+			Pass::add(pass_type, passes);
+	}
+	/* Denoising data passes are only registered with the engine, not added to the list. */
+	PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles");
+	if(get_boolean(crp, "denoising_store_passes") &&
+	   get_boolean(crp, "use_denoising"))
+	{
+		b_engine.add_pass("Denoising Normal",          3, "XYZ", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Albedo",          3, "RGB", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Depth",           1, "Z",   b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Depth Variance",  1, "Z",   b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Shadow A",        3, "XYV", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Shadow B",        3, "XYV", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Image",           3, "RGB", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Image Variance",  3, "RGB", b_srlay.name().c_str());
+	}
+#ifdef __KERNEL_DEBUG__
+	if(get_boolean(crp, "pass_debug_bvh_traversed_nodes")) {
+		b_engine.add_pass("Debug BVH Traversed Nodes", 1, "X", b_srlay.name().c_str());
+		Pass::add(PASS_BVH_TRAVERSED_NODES, passes);
+	}
+	if(get_boolean(crp, "pass_debug_bvh_traversed_instances")) {
+		b_engine.add_pass("Debug BVH Traversed Instances", 1, "X", b_srlay.name().c_str());
+		Pass::add(PASS_BVH_TRAVERSED_INSTANCES, passes);
+	}
+	if(get_boolean(crp, "pass_debug_bvh_intersections")) {
+		b_engine.add_pass("Debug BVH Intersections", 1, "X", b_srlay.name().c_str());
+		Pass::add(PASS_BVH_INTERSECTIONS, passes);
+	}
+	if(get_boolean(crp, "pass_debug_ray_bounces")) {
+		b_engine.add_pass("Debug Ray Bounces", 1, "X", b_srlay.name().c_str());
+		Pass::add(PASS_RAY_BOUNCES, passes);
+	}
+#endif
+	if(get_boolean(crp, "pass_debug_render_time")) {
+		b_engine.add_pass("Debug Render Time", 1, "X", b_srlay.name().c_str());
+		Pass::add(PASS_RENDER_TIME, passes);
+	}
+	if(get_boolean(crp, "use_pass_volume_direct")) {
+		b_engine.add_pass("VolumeDir", 3, "RGB", b_srlay.name().c_str());
+		Pass::add(PASS_VOLUME_DIRECT, passes);
+	}
+	if(get_boolean(crp, "use_pass_volume_indirect")) {
+		b_engine.add_pass("VolumeInd", 3, "RGB", b_srlay.name().c_str());
+		Pass::add(PASS_VOLUME_INDIRECT, passes);
+	}
+
+	return passes;
+}
+
 /* Scene Parameters */
 
 SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene,
-                                          bool background,
-                                          bool is_cpu)
+                                          bool background)
 {
 	BL::RenderSettings r = b_scene.render();
 	SceneParams params;
@@ -496,14 +634,10 @@ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene,
 	else if(shadingsystem == 1)
 		params.shadingsystem = SHADINGSYSTEM_OSL;
 	
-	if(background)
+	if(background || DebugFlags().viewport_static_bvh)
 		params.bvh_type = SceneParams::BVH_STATIC;
 	else
-		params.bvh_type = (SceneParams::BVHType)get_enum(
-		        cscene,
-		        "debug_bvh_type",
-		        SceneParams::BVH_NUM_TYPES,
-		        SceneParams::BVH_STATIC);
+		params.bvh_type = SceneParams::BVH_DYNAMIC;
 
 	params.use_bvh_spatial_split = RNA_boolean_get(&cscene, "debug_use_spatial_splits");
 	params.use_bvh_unaligned_nodes = RNA_boolean_get(&cscene, "debug_use_hair_bvh");
@@ -528,15 +662,7 @@ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene,
 		params.texture_limit = 0;
 	}
 
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-	if(is_cpu) {
-		params.use_qbvh = DebugFlags().cpu.qbvh && system_cpu_support_sse2();
-	}
-	else
-#endif
-	{
-		params.use_qbvh = false;
-	}
+	params.bvh_layout = DebugFlags().cpu.bvh_layout;
 
 	return params;
 }
@@ -560,6 +686,16 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
 	/* feature set */
 	params.experimental = (get_enum(cscene, "feature_set") != 0);
 
+	/* threads */
+	BL::RenderSettings b_r = b_scene.render();
+	if(b_r.threads_mode() == BL::RenderSettings::threads_mode_FIXED)
+		params.threads = b_r.threads();
+	else
+		params.threads = 0;
+
+	/* Background */
+	params.background = background;
+
 	/* device type */
 	vector<DeviceInfo>& devices = Device::available_devices();
 	
@@ -588,12 +724,28 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
 			}
 		}
 
-		int compute_device = get_enum(b_preferences, "compute_device_type");
+		enum ComputeDevice {
+			COMPUTE_DEVICE_CPU = 0,
+			COMPUTE_DEVICE_CUDA = 1,
+			COMPUTE_DEVICE_OPENCL = 2,
+			COMPUTE_DEVICE_NUM = 3,
+		};
 
-		if(compute_device != 0) {
+		ComputeDevice compute_device = (ComputeDevice)get_enum(b_preferences,
+		                                                       "compute_device_type",
+		                                                       COMPUTE_DEVICE_NUM,
+		                                                       COMPUTE_DEVICE_CPU);
+
+		if(compute_device != COMPUTE_DEVICE_CPU) {
 			vector<DeviceInfo> used_devices;
 			RNA_BEGIN(&b_preferences, device, "devices") {
-				if(get_enum(device, "type") == compute_device && get_boolean(device, "use")) {
+				ComputeDevice device_type = (ComputeDevice)get_enum(device,
+				                                                    "type",
+				                                                    COMPUTE_DEVICE_NUM,
+				                                                    COMPUTE_DEVICE_CPU);
+
+				if(get_boolean(device, "use") &&
+				   (device_type == compute_device || device_type == COMPUTE_DEVICE_CPU)) {
 					string id = get_string(device, "id");
 					foreach(DeviceInfo& info, devices) {
 						if(info.id == id) {
@@ -608,15 +760,14 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
 				params.device = used_devices[0];
 			}
 			else if(used_devices.size() > 1) {
-				params.device = Device::get_multi_device(used_devices);
+				params.device = Device::get_multi_device(used_devices,
+				                                         params.threads,
+				                                         params.background);
 			}
 			/* Else keep using the CPU device that was set before. */
 		}
 	}
 
-	/* Background */
-	params.background = background;
-
 	/* samples */
 	int samples = get_int(cscene, "samples");
 	int aa_samples = get_int(cscene, "aa_samples");
@@ -676,19 +827,28 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
 		params.tile_order = TILE_BOTTOM_TO_TOP;
 	}
 
+	/* preview resolution and pixel size */
 	params.start_resolution = get_int(cscene, "preview_start_resolution");
+	params.pixel_size = b_engine.get_preview_pixel_size(b_scene);
 
 	/* other parameters */
-	if(b_scene.render().threads_mode() == BL::RenderSettings::threads_mode_FIXED)
-		params.threads = b_scene.render().threads();
-	else
-		params.threads = 0;
-
 	params.cancel_timeout = (double)get_float(cscene, "debug_cancel_timeout");
 	params.reset_timeout = (double)get_float(cscene, "debug_reset_timeout");
 	params.text_timeout = (double)get_float(cscene, "debug_text_timeout");
 
-	params.progressive_refine = get_boolean(cscene, "use_progressive_refine");
+	/* progressive refine */
+	params.progressive_refine = get_boolean(cscene, "use_progressive_refine") &&
+	                            !b_r.use_save_buffers();
+
+	if(params.progressive_refine) {
+		BL::RenderSettings::layers_iterator b_rlay;
+		for(b_r.layers.begin(b_rlay); b_rlay != b_r.layers.end(); ++b_rlay) {
+			PointerRNA crl = RNA_pointer_get(&b_rlay->ptr, "cycles");
+			if(get_boolean(crl, "use_denoising")) {
+				params.progressive_refine = false;
+			}
+		}
+	}
 
 	if(background) {
 		if(params.progressive_refine)
@@ -697,6 +857,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
 			params.progressive = false;
 
 		params.start_resolution = INT_MAX;
+		params.pixel_size = 1;
 	}
 	else
 		params.progressive = true;
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index 6984cbda259..1e7b0b32518 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -22,15 +22,15 @@
 #include "RNA_access.h"
 #include "RNA_blender_cpp.h"
 
-#include "blender_util.h"
+#include "blender/blender_util.h"
 
-#include "scene.h"
-#include "session.h"
+#include "render/scene.h"
+#include "render/session.h"
 
-#include "util_map.h"
-#include "util_set.h"
-#include "util_transform.h"
-#include "util_vector.h"
+#include "util/util_map.h"
+#include "util/util_set.h"
+#include "util/util_transform.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -54,8 +54,7 @@ public:
 	            BL::Scene& b_scene,
 	            Scene *scene,
 	            bool preview,
-	            Progress &progress,
-	            bool is_cpu);
+	            Progress &progress);
 	~BlenderSync();
 
 	/* sync */
@@ -67,6 +66,9 @@ public:
 	               void **python_thread_state,
 	               const char *layer = 0);
 	void sync_render_layers(BL::SpaceView3D& b_v3d, const char *layer);
+	array<Pass> sync_render_passes(BL::RenderLayer& b_rlay,
+	                               BL::SceneRenderLayer& b_srlay,
+	                               const SessionParams &session_params);
 	void sync_integrator();
 	void sync_camera(BL::RenderSettings& b_render,
 	                 BL::Object& b_override,
@@ -80,8 +82,7 @@ public:
 
 	/* get parameters */
 	static SceneParams get_scene_params(BL::Scene& b_scene,
-	                                    bool background,
-	                                    bool is_cpu);
+	                                    bool background);
 	static SessionParams get_session_params(BL::RenderEngine& b_engine,
 	                                        BL::UserPreferences& b_userpref,
 	                                        BL::Scene& b_scene,
@@ -93,13 +94,15 @@ public:
 	                                      Camera *cam,
 	                                      int width, int height);
 
+	static PassType get_pass_type(BL::RenderPass& b_pass);
+	static int get_denoising_pass(BL::RenderPass& b_pass);
+
 private:
 	/* sync */
 	void sync_lamps(bool update_all);
 	void sync_materials(bool update_all);
-	void sync_objects(BL::SpaceView3D& b_v3d, float motion_time = 0.0f);
+	void sync_objects(float motion_time = 0.0f);
 	void sync_motion(BL::RenderSettings& b_render,
-	                 BL::SpaceView3D& b_v3d,
 	                 BL::Object& b_override,
 	                 int width, int height,
 	                 void **python_thread_state);
@@ -115,7 +118,7 @@ private:
 	                 BL::Mesh& b_mesh,
 	                 BL::Object& b_ob,
 	                 bool motion,
-	                 int time_index = 0);
+	                 int motion_step = 0);
 	Object *sync_object(BL::Object& b_parent,
 	                    int persistent_id[OBJECT_PERSISTENT_ID_SIZE],
 	                    BL::DupliObject& b_dupli_ob,
@@ -128,6 +131,7 @@ private:
 	void sync_light(BL::Object& b_parent,
 	                int persistent_id[OBJECT_PERSISTENT_ID_SIZE],
 	                BL::Object& b_ob,
+	                BL::DupliObject& b_dupli_ob,
 	                Transform& tfm,
 	                bool *use_portal);
 	void sync_background_light(bool use_portal);
@@ -172,7 +176,6 @@ private:
 	Scene *scene;
 	bool preview;
 	bool experimental;
-	bool is_cpu;
 
 	float dicing_rate;
 	int max_subdivisions;
diff --git a/intern/cycles/blender/blender_texture.cpp b/intern/cycles/blender/blender_texture.cpp
index 3807e683c7c..b2e27b76189 100644
--- a/intern/cycles/blender/blender_texture.cpp
+++ b/intern/cycles/blender/blender_texture.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "blender_texture.h"
+#include "blender/blender_texture.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_texture.h b/intern/cycles/blender/blender_texture.h
index ad96f9db8ed..734231a85ec 100644
--- a/intern/cycles/blender/blender_texture.h
+++ b/intern/cycles/blender/blender_texture.h
@@ -18,7 +18,7 @@
 #define __BLENDER_TEXTURE_H__
 
 #include <stdlib.h>
-#include "blender_sync.h"
+#include "blender/blender_sync.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h
index 23df3c1bc30..c418b19a637 100644
--- a/intern/cycles/blender/blender_util.h
+++ b/intern/cycles/blender/blender_util.h
@@ -17,15 +17,15 @@
 #ifndef __BLENDER_UTIL_H__
 #define __BLENDER_UTIL_H__
 
-#include "mesh.h"
+#include "render/mesh.h"
 
-#include "util_algorithm.h"
-#include "util_map.h"
-#include "util_path.h"
-#include "util_set.h"
-#include "util_transform.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_algorithm.h"
+#include "util/util_map.h"
+#include "util/util_path.h"
+#include "util/util_set.h"
+#include "util/util_transform.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 /* Hacks to hook into Blender API
  * todo: clean this up ... */
@@ -51,8 +51,8 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data,
                                       bool calc_undeformed,
                                       Mesh::SubdivisionType subdivision_type)
 {
-	bool subsurf_mod_show_render;
-	bool subsurf_mod_show_viewport;
+	bool subsurf_mod_show_render = false;
+	bool subsurf_mod_show_viewport = false;
 
 	if(subdivision_type != Mesh::SUBDIVISION_NONE) {
 		BL::Modifier subsurf_mod = object.modifiers[object.modifiers.length()-1];
@@ -174,22 +174,19 @@ static inline void curvemapping_color_to_array(BL::CurveMapping& cumap,
 
 	if(rgb_curve) {
 		BL::CurveMap mapI = cumap.curves[3];
-
 		for(int i = 0; i < size; i++) {
-			float t = min_x + (float)i/(float)(size-1) * range_x;
-
-			data[i][0] = mapR.evaluate(mapI.evaluate(t));
-			data[i][1] = mapG.evaluate(mapI.evaluate(t));
-			data[i][2] = mapB.evaluate(mapI.evaluate(t));
+			const float t = min_x + (float)i/(float)(size-1) * range_x;
+			data[i] = make_float3(mapR.evaluate(mapI.evaluate(t)),
+			                      mapG.evaluate(mapI.evaluate(t)),
+			                      mapB.evaluate(mapI.evaluate(t)));
 		}
 	}
 	else {
 		for(int i = 0; i < size; i++) {
 			float t = min_x + (float)i/(float)(size-1) * range_x;
-
-			data[i][0] = mapR.evaluate(t);
-			data[i][1] = mapG.evaluate(t);
-			data[i][2] = mapB.evaluate(t);
+			data[i] = make_float3(mapR.evaluate(t),
+			                      mapG.evaluate(t),
+			                      mapB.evaluate(t));
 		}
 	}
 }
@@ -250,14 +247,15 @@ static inline float *image_get_float_pixels_for_frame(BL::Image& image,
 
 static inline Transform get_transform(const BL::Array<float, 16>& array)
 {
-	Transform tfm;
+	ProjectionTransform projection;
 
-	/* we assume both types to be just 16 floats, and transpose because blender
-	 * use column major matrix order while we use row major */
-	memcpy(&tfm, &array, sizeof(float)*16);
-	tfm = transform_transpose(tfm);
+	/* We assume both types to be just 16 floats, and transpose because Blender
+	 * uses column-major matrix order while we use row-major. */
+	memcpy(&projection, &array, sizeof(float)*16);
+	projection = projection_transpose(projection);
 
-	return tfm;
+	/* Drop last row, matrix is assumed to be affine transform. */
+	return projection_to_transform(projection);
 }
 
 static inline float2 get_float2(const BL::Array<float, 2>& array)
@@ -302,7 +300,7 @@ static inline uint get_layer(const BL::Array<int, 20>& array)
 	for(uint i = 0; i < 20; i++)
 		if(array[i])
 			layer |= (1 << i);
-	
+
 	return layer;
 }
 
@@ -437,7 +435,7 @@ static inline string get_string(PointerRNA& ptr, const char *name)
 	string str(cstr);
 	if(cstr != cstrbuf)
 		MEM_freeN(cstr);
-	
+
 	return str;
 }
 
@@ -454,7 +452,7 @@ static inline string blender_absolute_path(BL::BlendData& b_data,
 {
 	if(path.size() >= 2 && path[0] == '/' && path[1] == '/') {
 		string dirname;
-		
+
 		if(b_id.library()) {
 			BL::ID b_library_id(b_id.library());
 			dirname = blender_absolute_path(b_data,
@@ -486,33 +484,34 @@ static inline void mesh_texture_space(BL::Mesh& b_mesh,
 	loc = loc*size - make_float3(0.5f, 0.5f, 0.5f);
 }
 
-/* object used for motion blur */
-static inline bool object_use_motion(BL::Object& b_parent, BL::Object& b_ob)
+/* Object motion steps, returns 0 if no motion blur needed. */
+static inline uint object_motion_steps(BL::Object& b_parent, BL::Object& b_ob)
 {
+	/* Get motion enabled and steps from object itself. */
 	PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles");
 	bool use_motion = get_boolean(cobject, "use_motion_blur");
-	/* If motion blur is enabled for the object we also check
-	 * whether it's enabled for the parent object as well.
-	 *
-	 * This way we can control motion blur from the dupligroup
-	 * duplicator much easier.
-	 */
-	if(use_motion && b_parent.ptr.data != b_ob.ptr.data) {
+	if(!use_motion) {
+		return 0;
+	}
+
+	uint steps = max(1, get_int(cobject, "motion_steps"));
+
+	/* Also check parent object, so motion blur and steps can be
+	 * controlled by dupligroup duplicator for linked groups. */
+	if(b_parent.ptr.data != b_ob.ptr.data) {
 		PointerRNA parent_cobject = RNA_pointer_get(&b_parent.ptr, "cycles");
 		use_motion &= get_boolean(parent_cobject, "use_motion_blur");
-	}
-	return use_motion;
-}
 
-/* object motion steps */
-static inline uint object_motion_steps(BL::Object& b_ob)
-{
-	PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles");
-	uint steps = get_int(cobject, "motion_steps");
+		if(!use_motion) {
+			return 0;
+		}
+
+		steps = max(steps, get_int(parent_cobject, "motion_steps"));
+	}
 
-	/* use uneven number of steps so we get one keyframe at the current frame,
-	 * and ue 2^(steps - 1) so objects with more/fewer steps still have samples
-	 * at the same times, to avoid sampling at many different times */
+	/* Use uneven number of steps so we get one keyframe at the current frame,
+	 * and use 2^(steps - 1) so objects with more/fewer steps still have samples
+	 * at the same times, to avoid sampling at many different times. */
 	return (2 << (steps - 1)) + 1;
 }
 
@@ -547,7 +546,7 @@ static inline BL::SmokeDomainSettings object_smoke_domain_find(BL::Object& b_ob)
 				return b_smd.domain_settings();
 		}
 	}
-	
+
 	return BL::SmokeDomainSettings(PointerRNA_NULL);
 }
 
@@ -819,4 +818,3 @@ protected:
 CCL_NAMESPACE_END
 
 #endif /* __BLENDER_UTIL_H__ */
-
diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt
index 92e48f0d87f..b8171e7f70d 100644
--- a/intern/cycles/bvh/CMakeLists.txt
+++ b/intern/cycles/bvh/CMakeLists.txt
@@ -1,12 +1,6 @@
 
 set(INC
-	.
-	../graph
-	../kernel
-	../kernel/svm
-	../render
-	../util
-	../device
+	..
 )
 
 set(INC_SYS
@@ -14,6 +8,8 @@ set(INC_SYS
 
 set(SRC
 	bvh.cpp
+	bvh2.cpp
+	bvh4.cpp
 	bvh_binning.cpp
 	bvh_build.cpp
 	bvh_node.cpp
@@ -24,6 +20,8 @@ set(SRC
 
 set(SRC_HEADERS
 	bvh.h
+	bvh2.h
+	bvh4.h
 	bvh_binning.h
 	bvh_build.h
 	bvh_node.h
@@ -36,4 +34,4 @@ set(SRC_HEADERS
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
-add_library(cycles_bvh ${SRC} ${SRC_HEADERS})
+cycles_add_library(cycles_bvh ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 1fb2f371a0f..b524ca07d8d 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -15,45 +15,68 @@
  * limitations under the License.
  */
 
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "curves.h"
-
-#include "bvh.h"
-#include "bvh_build.h"
-#include "bvh_node.h"
-#include "bvh_params.h"
-#include "bvh_unaligned.h"
-
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_progress.h"
-#include "util_system.h"
-#include "util_types.h"
-#include "util_math.h"
+#include "bvh/bvh.h"
+
+#include "render/mesh.h"
+#include "render/object.h"
+
+#include "bvh/bvh2.h"
+#include "bvh/bvh4.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_node.h"
+
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
 
 CCL_NAMESPACE_BEGIN
 
-/* Pack Utility */
+/* BVH Parameters. */
 
-struct BVHStackEntry
+const char *bvh_layout_name(BVHLayout layout)
 {
-	const BVHNode *node;
-	int idx;
+	switch(layout) {
+		case BVH_LAYOUT_BVH2: return "BVH2";
+		case BVH_LAYOUT_BVH4: return "BVH4";
+		case BVH_LAYOUT_NONE: return "NONE";
+		case BVH_LAYOUT_ALL:  return "ALL";
+	}
+	LOG(DFATAL) << "Unsupported BVH layout was passed.";
+	return "";
+}
 
-	BVHStackEntry(const BVHNode* n = 0, int i = 0)
-	: node(n), idx(i)
-	{
+/* Return requested_layout when it is in supported_layouts, otherwise the widest
+ * supported layout narrower than it (BVH_LAYOUT_NONE when there is none). */
+BVHLayout BVHParams::best_bvh_layout(BVHLayout requested_layout,
+                                     BVHLayoutMask supported_layouts)
+{
+	const BVHLayoutMask requested_layout_mask = (BVHLayoutMask)requested_layout;
+	/* Requested layout is supported, no extra computation needed. */
+	if(supported_layouts & requested_layout_mask) {
+		return requested_layout;
 	}
+	/* Mask of supported layouts which are narrower than the requested one. */
+	const BVHLayoutMask allowed_layouts_mask =
+	        (supported_layouts & (requested_layout_mask - 1));
+	if(allowed_layouts_mask == 0) {
+		/* Nothing narrower is supported, and __bsr(0) has no defined result. */
+		return BVH_LAYOUT_NONE;
+	}
+	/* Highest set bit of the mask is the widest allowed layout. */
+	return (BVHLayout)(1 << __bsr(allowed_layouts_mask));
+}
 
-	int encodeIdx() const
-	{
-		return (node->is_leaf())? ~idx: idx;
-	}
-};
+/* Pack Utility */
+
+BVHStackEntry::BVHStackEntry(const BVHNode *n, int i)
+    : node(n), idx(i)
+{
+}
+
+int BVHStackEntry::encodeIdx() const
+{
+	return (node->is_leaf())? ~idx: idx;
+}
 
 /* BVH */
 
@@ -64,10 +87,17 @@ BVH::BVH(const BVHParams& params_, const vector<Object*>& objects_)
 
 BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects)
 {
-	if(params.use_qbvh)
-		return new QBVH(params, objects);
-	else
-		return new RegularBVH(params, objects);
+	switch(params.bvh_layout) {
+		case BVH_LAYOUT_BVH2:
+			return new BVH2(params, objects);
+		case BVH_LAYOUT_BVH4:
+			return new BVH4(params, objects);
+		case BVH_LAYOUT_NONE:
+		case BVH_LAYOUT_ALL:
+			break;
+	}
+	LOG(DFATAL) << "Requested unsupported BVH layout.";
+	return NULL;
 }
 
 /* Building */
@@ -121,6 +151,73 @@ void BVH::refit(Progress& progress)
 	refit_nodes();
 }
 
+void BVH::refit_primitives(int start, int end, BoundBox& bbox, uint& visibility)
+{
+	/* Grow bbox and accumulate visibility over packed primitives in [start, end). */
+	for(int prim = start; prim < end; prim++) {
+		int pidx = pack.prim_index[prim];
+		int tob = pack.prim_object[prim];
+		Object *ob = objects[tob];
+
+		if(pidx == -1) {
+			/* Primitive index -1 marks an object instance, use the object bounds. */
+			bbox.grow(ob->bounds);
+		}
+		else {
+			/* Mesh primitive: curve segment or triangle. */
+			const Mesh *mesh = ob->mesh;
+
+			if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
+				/* Curves, top level indices include the mesh curve offset. */
+				int str_offset = (params.top_level)? mesh->curve_offset: 0;
+				Mesh::Curve curve = mesh->get_curve(pidx - str_offset);
+				int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);
+
+				curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox);
+
+				visibility |= PATH_RAY_CURVE;
+
+				/* Motion curves: also grow by the keys of the extra motion steps. */
+				if(mesh->use_motion_blur) {
+					Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+					if(attr) {
+						size_t mesh_size = mesh->curve_keys.size();
+						size_t steps = mesh->motion_steps - 1;
+						float3 *key_steps = attr->data_float3();
+
+						for(size_t i = 0; i < steps; i++)
+							curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox);
+					}
+				}
+			}
+			else {
+				/* Triangles, top level indices include the mesh triangle offset. */
+				int tri_offset = (params.top_level)? mesh->tri_offset: 0;
+				Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset);
+				const float3 *vpos = &mesh->verts[0];
+
+				triangle.bounds_grow(vpos, bbox);
+
+				/* Motion triangles: also grow by the vertices of the extra motion steps. */
+				if(mesh->use_motion_blur) {
+					Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+					if(attr) {
+						size_t mesh_size = mesh->verts.size();
+						size_t steps = mesh->motion_steps - 1;
+						float3 *vert_steps = attr->data_float3();
+
+						for(size_t i = 0; i < steps; i++)
+							triangle.bounds_grow(vert_steps + i*mesh_size, bbox);
+					}
+				}
+			}
+		}
+		visibility |= ob->visibility_for_tracing();
+	}
+}
+
 /* Triangles */
 
 void BVH::pack_triangle(int idx, float4 tri_verts[3])
@@ -166,7 +263,6 @@ void BVH::pack_primitives()
 		if(pack.prim_index[i] != -1) {
 			int tob = pack.prim_object[i];
 			Object *ob = objects[tob];
-
 			if((pack.prim_type[i] & PRIMITIVE_ALL_TRIANGLE) != 0) {
 				pack_triangle(i, (float4*)&pack.prim_tri_verts[3 * prim_triangle_index]);
 				pack.prim_tri_index[i] = 3 * prim_triangle_index;
@@ -175,11 +271,10 @@ void BVH::pack_primitives()
 			else {
 				pack.prim_tri_index[i] = -1;
 			}
-
-			pack.prim_visibility[i] = ob->visibility;
-
-			if(pack.prim_type[i] & PRIMITIVE_ALL_CURVE)
+			pack.prim_visibility[i] = ob->visibility_for_tracing();
+			if(pack.prim_type[i] & PRIMITIVE_ALL_CURVE) {
 				pack.prim_visibility[i] |= PATH_RAY_CURVE;
+			}
 		}
 		else {
 			pack.prim_tri_index[i] = -1;
@@ -196,7 +291,8 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 	 * BVH's are stored in global arrays. This function merges them into the
 	 * top level BVH, adjusting indexes and offsets where appropriate.
 	 */
-	const bool use_qbvh = params.use_qbvh;
+	/* TODO(sergey): This code needs adjustment for wider BVH than 4. */
+	const bool use_qbvh = (params.bvh_layout == BVH_LAYOUT_BVH4);
 
 	/* Adjust primitive index to point to the triangle in the global array, for
 	 * meshes with transform applied and already in the top level BVH.
@@ -418,832 +514,4 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 	}
 }
 
-/* Regular BVH */
-
-static bool node_bvh_is_unaligned(const BVHNode *node)
-{
-	const BVHNode *node0 = node->get_child(0),
-	              *node1 = node->get_child(1);
-	return node0->is_unaligned() || node1->is_unaligned();
-}
-
-RegularBVH::RegularBVH(const BVHParams& params_, const vector<Object*>& objects_)
-: BVH(params_, objects_)
-{
-}
-
-void RegularBVH::pack_leaf(const BVHStackEntry& e,
-                           const LeafNode *leaf)
-{
-	assert(e.idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size());
-	float4 data[BVH_NODE_LEAF_SIZE];
-	memset(data, 0, sizeof(data));
-	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->m_lo] == -1) {
-		/* object */
-		data[0].x = __int_as_float(~(leaf->m_lo));
-		data[0].y = __int_as_float(0);
-	}
-	else {
-		/* triangle */
-		data[0].x = __int_as_float(leaf->m_lo);
-		data[0].y = __int_as_float(leaf->m_hi);
-	}
-	data[0].z = __uint_as_float(leaf->m_visibility);
-	if(leaf->num_triangles() != 0) {
-		data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]);
-	}
-
-	memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_NODE_LEAF_SIZE);
-}
-
-void RegularBVH::pack_inner(const BVHStackEntry& e,
-                            const BVHStackEntry& e0,
-                            const BVHStackEntry& e1)
-{
-	if(e0.node->is_unaligned() || e1.node->is_unaligned()) {
-		pack_unaligned_inner(e, e0, e1);
-	} else {
-		pack_aligned_inner(e, e0, e1);
-	}
-}
-
-void RegularBVH::pack_aligned_inner(const BVHStackEntry& e,
-                                    const BVHStackEntry& e0,
-                                    const BVHStackEntry& e1)
-{
-	pack_aligned_node(e.idx,
-	                  e0.node->m_bounds, e1.node->m_bounds,
-	                  e0.encodeIdx(), e1.encodeIdx(),
-	                  e0.node->m_visibility, e1.node->m_visibility);
-}
-
-void RegularBVH::pack_aligned_node(int idx,
-                                   const BoundBox& b0,
-                                   const BoundBox& b1,
-                                   int c0, int c1,
-                                   uint visibility0, uint visibility1)
-{
-	assert(idx + BVH_NODE_SIZE <= pack.nodes.size());
-	assert(c0 < 0 || c0 < pack.nodes.size());
-	assert(c1 < 0 || c1 < pack.nodes.size());
-
-	int4 data[BVH_NODE_SIZE] = {
-		make_int4(visibility0 & ~PATH_RAY_NODE_UNALIGNED,
-		          visibility1 & ~PATH_RAY_NODE_UNALIGNED,
-		          c0, c1),
-		make_int4(__float_as_int(b0.min.x),
-		          __float_as_int(b1.min.x),
-		          __float_as_int(b0.max.x),
-		          __float_as_int(b1.max.x)),
-		make_int4(__float_as_int(b0.min.y),
-		          __float_as_int(b1.min.y),
-		          __float_as_int(b0.max.y),
-		          __float_as_int(b1.max.y)),
-		make_int4(__float_as_int(b0.min.z),
-		          __float_as_int(b1.min.z),
-		          __float_as_int(b0.max.z),
-		          __float_as_int(b1.max.z)),
-	};
-
-	memcpy(&pack.nodes[idx], data, sizeof(int4)*BVH_NODE_SIZE);
-}
-
-void RegularBVH::pack_unaligned_inner(const BVHStackEntry& e,
-                                      const BVHStackEntry& e0,
-                                      const BVHStackEntry& e1)
-{
-	pack_unaligned_node(e.idx,
-	                    e0.node->get_aligned_space(),
-	                    e1.node->get_aligned_space(),
-	                    e0.node->m_bounds,
-	                    e1.node->m_bounds,
-	                    e0.encodeIdx(), e1.encodeIdx(),
-	                    e0.node->m_visibility, e1.node->m_visibility);
-}
-
-void RegularBVH::pack_unaligned_node(int idx,
-                                     const Transform& aligned_space0,
-                                     const Transform& aligned_space1,
-                                     const BoundBox& bounds0,
-                                     const BoundBox& bounds1,
-                                     int c0, int c1,
-                                     uint visibility0, uint visibility1)
-{
-	assert(idx + BVH_UNALIGNED_NODE_SIZE <= pack.nodes.size());
-	assert(c0 < 0 || c0 < pack.nodes.size());
-	assert(c1 < 0 || c1 < pack.nodes.size());
-
-	float4 data[BVH_UNALIGNED_NODE_SIZE];
-	Transform space0 = BVHUnaligned::compute_node_transform(bounds0,
-	                                                        aligned_space0);
-	Transform space1 = BVHUnaligned::compute_node_transform(bounds1,
-	                                                        aligned_space1);
-	data[0] = make_float4(__int_as_float(visibility0 | PATH_RAY_NODE_UNALIGNED),
-	                      __int_as_float(visibility1 | PATH_RAY_NODE_UNALIGNED),
-	                      __int_as_float(c0),
-	                      __int_as_float(c1));
-
-	data[1] = space0.x;
-	data[2] = space0.y;
-	data[3] = space0.z;
-	data[4] = space1.x;
-	data[5] = space1.y;
-	data[6] = space1.z;
-
-	memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_NODE_SIZE);
-}
-
-void RegularBVH::pack_nodes(const BVHNode *root)
-{
-	const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT);
-	const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
-	assert(num_leaf_nodes <= num_nodes);
-	const size_t num_inner_nodes = num_nodes - num_leaf_nodes;
-	size_t node_size;
-	if(params.use_unaligned_nodes) {
-		const size_t num_unaligned_nodes =
-		        root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_COUNT);
-		node_size = (num_unaligned_nodes * BVH_UNALIGNED_NODE_SIZE) +
-		            (num_inner_nodes - num_unaligned_nodes) * BVH_NODE_SIZE;
-	}
-	else {
-		node_size = num_inner_nodes * BVH_NODE_SIZE;
-	}
-	/* Resize arrays */
-	pack.nodes.clear();
-	pack.leaf_nodes.clear();
-	/* For top level BVH, first merge existing BVH's so we know the offsets. */
-	if(params.top_level) {
-		pack_instances(node_size, num_leaf_nodes*BVH_NODE_LEAF_SIZE);
-	}
-	else {
-		pack.nodes.resize(node_size);
-		pack.leaf_nodes.resize(num_leaf_nodes*BVH_NODE_LEAF_SIZE);
-	}
-
-	int nextNodeIdx = 0, nextLeafNodeIdx = 0;
-
-	vector<BVHStackEntry> stack;
-	stack.reserve(BVHParams::MAX_DEPTH*2);
-	if(root->is_leaf()) {
-		stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
-	}
-	else {
-		stack.push_back(BVHStackEntry(root, nextNodeIdx));
-		nextNodeIdx += node_bvh_is_unaligned(root)
-		                       ? BVH_UNALIGNED_NODE_SIZE
-		                       : BVH_NODE_SIZE;
-	}
-
-	while(stack.size()) {
-		BVHStackEntry e = stack.back();
-		stack.pop_back();
-
-		if(e.node->is_leaf()) {
-			/* leaf node */
-			const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node);
-			pack_leaf(e, leaf);
-		}
-		else {
-			/* innner node */
-			int idx[2];
-			for(int i = 0; i < 2; ++i) {
-				if(e.node->get_child(i)->is_leaf()) {
-					idx[i] = nextLeafNodeIdx++;
-				}
-				else {
-					idx[i] = nextNodeIdx;
-					nextNodeIdx += node_bvh_is_unaligned(e.node->get_child(i))
-					                       ? BVH_UNALIGNED_NODE_SIZE
-					                       : BVH_NODE_SIZE;
-				}
-			}
-
-			stack.push_back(BVHStackEntry(e.node->get_child(0), idx[0]));
-			stack.push_back(BVHStackEntry(e.node->get_child(1), idx[1]));
-
-			pack_inner(e, stack[stack.size()-2], stack[stack.size()-1]);
-		}
-	}
-	assert(node_size == nextNodeIdx);
-	/* root index to start traversal at, to handle case of single leaf node */
-	pack.root_index = (root->is_leaf())? -1: 0;
-}
-
-void RegularBVH::refit_nodes()
-{
-	assert(!params.top_level);
-
-	BoundBox bbox = BoundBox::empty;
-	uint visibility = 0;
-	refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
-}
-
-void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
-{
-	if(leaf) {
-		assert(idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size());
-		const int4 *data = &pack.leaf_nodes[idx];
-		const int c0 = data[0].x;
-		const int c1 = data[0].y;
-		/* refit leaf node */
-		for(int prim = c0; prim < c1; prim++) {
-			int pidx = pack.prim_index[prim];
-			int tob = pack.prim_object[prim];
-			Object *ob = objects[tob];
-
-			if(pidx == -1) {
-				/* object instance */
-				bbox.grow(ob->bounds);
-			}
-			else {
-				/* primitives */
-				const Mesh *mesh = ob->mesh;
-
-				if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
-					/* curves */
-					int str_offset = (params.top_level)? mesh->curve_offset: 0;
-					Mesh::Curve curve = mesh->get_curve(pidx - str_offset);
-					int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);
-
-					curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox);
-
-					visibility |= PATH_RAY_CURVE;
-
-					/* motion curves */
-					if(mesh->use_motion_blur) {
-						Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
-						if(attr) {
-							size_t mesh_size = mesh->curve_keys.size();
-							size_t steps = mesh->motion_steps - 1;
-							float3 *key_steps = attr->data_float3();
-
-							for(size_t i = 0; i < steps; i++)
-								curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox);
-						}
-					}
-				}
-				else {
-					/* triangles */
-					int tri_offset = (params.top_level)? mesh->tri_offset: 0;
-					Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset);
-					const float3 *vpos = &mesh->verts[0];
-
-					triangle.bounds_grow(vpos, bbox);
-
-					/* motion triangles */
-					if(mesh->use_motion_blur) {
-						Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
-						if(attr) {
-							size_t mesh_size = mesh->verts.size();
-							size_t steps = mesh->motion_steps - 1;
-							float3 *vert_steps = attr->data_float3();
-
-							for(size_t i = 0; i < steps; i++)
-								triangle.bounds_grow(vert_steps + i*mesh_size, bbox);
-						}
-					}
-				}
-			}
-
-			visibility |= ob->visibility;
-		}
-
-		/* TODO(sergey): De-duplicate with pack_leaf(). */
-		float4 leaf_data[BVH_NODE_LEAF_SIZE];
-		leaf_data[0].x = __int_as_float(c0);
-		leaf_data[0].y = __int_as_float(c1);
-		leaf_data[0].z = __uint_as_float(visibility);
-		leaf_data[0].w = __uint_as_float(data[0].w);
-		memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_NODE_LEAF_SIZE);
-	}
-	else {
-		assert(idx + BVH_NODE_SIZE <= pack.nodes.size());
-
-		const int4 *data = &pack.nodes[idx];
-		const bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0;
-		const int c0 = data[0].z;
-		const int c1 = data[0].w;
-		/* refit inner node, set bbox from children */
-		BoundBox bbox0 = BoundBox::empty, bbox1 = BoundBox::empty;
-		uint visibility0 = 0, visibility1 = 0;
-
-		refit_node((c0 < 0)? -c0-1: c0, (c0 < 0), bbox0, visibility0);
-		refit_node((c1 < 0)? -c1-1: c1, (c1 < 0), bbox1, visibility1);
-
-		if(is_unaligned) {
-			Transform aligned_space = transform_identity();
-			pack_unaligned_node(idx,
-			                    aligned_space, aligned_space,
-			                    bbox0, bbox1,
-			                    c0, c1,
-			                    visibility0,
-			                    visibility1);
-		}
-		else {
-			pack_aligned_node(idx,
-			                  bbox0, bbox1,
-			                  c0, c1,
-			                  visibility0,
-			                  visibility1);
-		}
-
-		bbox.grow(bbox0);
-		bbox.grow(bbox1);
-		visibility = visibility0|visibility1;
-	}
-}
-
-/* QBVH */
-
-/* Can we avoid this somehow or make more generic?
- *
- * Perhaps we can merge nodes in actual tree and make our
- * life easier all over the place.
- */
-static bool node_qbvh_is_unaligned(const BVHNode *node)
-{
-	const BVHNode *node0 = node->get_child(0),
-	              *node1 = node->get_child(1);
-	bool has_unaligned = false;
-	if(node0->is_leaf()) {
-		has_unaligned |= node0->is_unaligned();
-	}
-	else {
-		has_unaligned |= node0->get_child(0)->is_unaligned();
-		has_unaligned |= node0->get_child(1)->is_unaligned();
-	}
-	if(node1->is_leaf()) {
-		has_unaligned |= node1->is_unaligned();
-	}
-	else {
-		has_unaligned |= node1->get_child(0)->is_unaligned();
-		has_unaligned |= node1->get_child(1)->is_unaligned();
-	}
-	return has_unaligned;
-}
-
-QBVH::QBVH(const BVHParams& params_, const vector<Object*>& objects_)
-: BVH(params_, objects_)
-{
-	params.use_qbvh = true;
-}
-
-void QBVH::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf)
-{
-	float4 data[BVH_QNODE_LEAF_SIZE];
-	memset(data, 0, sizeof(data));
-	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->m_lo] == -1) {
-		/* object */
-		data[0].x = __int_as_float(~(leaf->m_lo));
-		data[0].y = __int_as_float(0);
-	}
-	else {
-		/* triangle */
-		data[0].x = __int_as_float(leaf->m_lo);
-		data[0].y = __int_as_float(leaf->m_hi);
-	}
-	data[0].z = __uint_as_float(leaf->m_visibility);
-	if(leaf->num_triangles() != 0) {
-		data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]);
-	}
-
-	memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE);
-}
-
-void QBVH::pack_inner(const BVHStackEntry& e,
-                      const BVHStackEntry *en,
-                      int num)
-{
-	bool has_unaligned = false;
-	/* Check whether we have to create unaligned node or all nodes are aligned
-	 * and we can cut some corner here.
-	 */
-	if(params.use_unaligned_nodes) {
-		for(int i = 0; i < num; i++) {
-			if(en[i].node->is_unaligned()) {
-				has_unaligned = true;
-				break;
-			}
-		}
-	}
-	if(has_unaligned) {
-		/* There's no unaligned children, pack into AABB node. */
-		pack_unaligned_inner(e, en, num);
-	}
-	else {
-		/* Create unaligned node with orientation transform for each of the
-		 * children.
-		 */
-		pack_aligned_inner(e, en, num);
-	}
-}
-
-void QBVH::pack_aligned_inner(const BVHStackEntry& e,
-                              const BVHStackEntry *en,
-                              int num)
-{
-	BoundBox bounds[4];
-	int child[4];
-	for(int i = 0; i < num; ++i) {
-		bounds[i] = en[i].node->m_bounds;
-		child[i] = en[i].encodeIdx();
-	}
-	pack_aligned_node(e.idx,
-	                  bounds,
-	                  child,
-	                  e.node->m_visibility,
-	                  e.node->m_time_from,
-	                  e.node->m_time_to,
-	                  num);
-}
-
-void QBVH::pack_aligned_node(int idx,
-                             const BoundBox *bounds,
-                             const int *child,
-                             const uint visibility,
-                             const float time_from,
-                             const float time_to,
-                             const int num)
-{
-	float4 data[BVH_QNODE_SIZE];
-	memset(data, 0, sizeof(data));
-
-	data[0].x = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED);
-	data[0].y = time_from;
-	data[0].z = time_to;
-
-	for(int i = 0; i < num; i++) {
-		float3 bb_min = bounds[i].min;
-		float3 bb_max = bounds[i].max;
-
-		data[1][i] = bb_min.x;
-		data[2][i] = bb_max.x;
-		data[3][i] = bb_min.y;
-		data[4][i] = bb_max.y;
-		data[5][i] = bb_min.z;
-		data[6][i] = bb_max.z;
-
-		data[7][i] = __int_as_float(child[i]);
-	}
-
-	for(int i = num; i < 4; i++) {
-		/* We store BB which would never be recorded as intersection
-		 * so kernel might safely assume there are always 4 child nodes.
-		 */
-		data[1][i] = FLT_MAX;
-		data[2][i] = -FLT_MAX;
-
-		data[3][i] = FLT_MAX;
-		data[4][i] = -FLT_MAX;
-
-		data[5][i] = FLT_MAX;
-		data[6][i] = -FLT_MAX;
-
-		data[7][i] = __int_as_float(0);
-	}
-
-	memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_QNODE_SIZE);
-}
-
-void QBVH::pack_unaligned_inner(const BVHStackEntry& e,
-                                const BVHStackEntry *en,
-                                int num)
-{
-	Transform aligned_space[4];
-	BoundBox bounds[4];
-	int child[4];
-	for(int i = 0; i < num; ++i) {
-		aligned_space[i] = en[i].node->get_aligned_space();
-		bounds[i] = en[i].node->m_bounds;
-		child[i] = en[i].encodeIdx();
-	}
-	pack_unaligned_node(e.idx,
-	                    aligned_space,
-	                    bounds,
-	                    child,
-	                    e.node->m_visibility,
-	                    e.node->m_time_from,
-	                    e.node->m_time_to,
-	                    num);
-}
-
-void QBVH::pack_unaligned_node(int idx,
-                               const Transform *aligned_space,
-                               const BoundBox *bounds,
-                               const int *child,
-                               const uint visibility,
-                               const float time_from,
-                               const float time_to,
-                               const int num)
-{
-	float4 data[BVH_UNALIGNED_QNODE_SIZE];
-	memset(data, 0, sizeof(data));
-
-	data[0].x = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED);
-	data[0].y = time_from;
-	data[0].z = time_to;
-
-	for(int i = 0; i < num; i++) {
-		Transform space = BVHUnaligned::compute_node_transform(
-		        bounds[i],
-		        aligned_space[i]);
-
-		data[1][i] = space.x.x;
-		data[2][i] = space.x.y;
-		data[3][i] = space.x.z;
-
-		data[4][i] = space.y.x;
-		data[5][i] = space.y.y;
-		data[6][i] = space.y.z;
-
-		data[7][i] = space.z.x;
-		data[8][i] = space.z.y;
-		data[9][i] = space.z.z;
-
-		data[10][i] = space.x.w;
-		data[11][i] = space.y.w;
-		data[12][i] = space.z.w;
-
-		data[13][i] = __int_as_float(child[i]);
-	}
-
-	for(int i = num; i < 4; i++) {
-		/* We store BB which would never be recorded as intersection
-		 * so kernel might safely assume there are always 4 child nodes.
-		 */
-
-		data[1][i] = 1.0f;
-		data[2][i] = 0.0f;
-		data[3][i] = 0.0f;
-
-		data[4][i] = 0.0f;
-		data[5][i] = 0.0f;
-		data[6][i] = 0.0f;
-
-		data[7][i] = 0.0f;
-		data[8][i] = 0.0f;
-		data[9][i] = 0.0f;
-
-		data[10][i] = -FLT_MAX;
-		data[11][i] = -FLT_MAX;
-		data[12][i] = -FLT_MAX;
-
-		data[13][i] = __int_as_float(0);
-	}
-
-	memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_QNODE_SIZE);
-}
-
-/* Quad SIMD Nodes */
-
-void QBVH::pack_nodes(const BVHNode *root)
-{
-	/* Calculate size of the arrays required. */
-	const size_t num_nodes = root->getSubtreeSize(BVH_STAT_QNODE_COUNT);
-	const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
-	assert(num_leaf_nodes <= num_nodes);
-	const size_t num_inner_nodes = num_nodes - num_leaf_nodes;
-	size_t node_size;
-	if(params.use_unaligned_nodes) {
-		const size_t num_unaligned_nodes =
-		        root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_QNODE_COUNT);
-		node_size = (num_unaligned_nodes * BVH_UNALIGNED_QNODE_SIZE) +
-		            (num_inner_nodes - num_unaligned_nodes) * BVH_QNODE_SIZE;
-	}
-	else {
-		node_size = num_inner_nodes * BVH_QNODE_SIZE;
-	}
-	/* Resize arrays. */
-	pack.nodes.clear();
-	pack.leaf_nodes.clear();
-	/* For top level BVH, first merge existing BVH's so we know the offsets. */
-	if(params.top_level) {
-		pack_instances(node_size, num_leaf_nodes*BVH_QNODE_LEAF_SIZE);
-	}
-	else {
-		pack.nodes.resize(node_size);
-		pack.leaf_nodes.resize(num_leaf_nodes*BVH_QNODE_LEAF_SIZE);
-	}
-
-	int nextNodeIdx = 0, nextLeafNodeIdx = 0;
-
-	vector<BVHStackEntry> stack;
-	stack.reserve(BVHParams::MAX_DEPTH*2);
-	if(root->is_leaf()) {
-		stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
-	}
-	else {
-		stack.push_back(BVHStackEntry(root, nextNodeIdx));
-		nextNodeIdx += node_qbvh_is_unaligned(root)
-		                       ? BVH_UNALIGNED_QNODE_SIZE
-		                       : BVH_QNODE_SIZE;
-	}
-
-	while(stack.size()) {
-		BVHStackEntry e = stack.back();
-		stack.pop_back();
-
-		if(e.node->is_leaf()) {
-			/* leaf node */
-			const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node);
-			pack_leaf(e, leaf);
-		}
-		else {
-			/* Inner node. */
-			const BVHNode *node = e.node;
-			const BVHNode *node0 = node->get_child(0);
-			const BVHNode *node1 = node->get_child(1);
-			/* Collect nodes. */
-			const BVHNode *nodes[4];
-			int numnodes = 0;
-			if(node0->is_leaf()) {
-				nodes[numnodes++] = node0;
-			}
-			else {
-				nodes[numnodes++] = node0->get_child(0);
-				nodes[numnodes++] = node0->get_child(1);
-			}
-			if(node1->is_leaf()) {
-				nodes[numnodes++] = node1;
-			}
-			else {
-				nodes[numnodes++] = node1->get_child(0);
-				nodes[numnodes++] = node1->get_child(1);
-			}
-			/* Push entries on the stack. */
-			for(int i = 0; i < numnodes; ++i) {
-				int idx;
-				if(nodes[i]->is_leaf()) {
-					idx = nextLeafNodeIdx++;
-				}
-				else {
-					idx = nextNodeIdx;
-					nextNodeIdx += node_qbvh_is_unaligned(nodes[i])
-					                       ? BVH_UNALIGNED_QNODE_SIZE
-					                       : BVH_QNODE_SIZE;
-				}
-				stack.push_back(BVHStackEntry(nodes[i], idx));
-			}
-			/* Set node. */
-			pack_inner(e, &stack[stack.size()-numnodes], numnodes);
-		}
-	}
-	assert(node_size == nextNodeIdx);
-	/* Root index to start traversal at, to handle case of single leaf node. */
-	pack.root_index = (root->is_leaf())? -1: 0;
-}
-
-void QBVH::refit_nodes()
-{
-	assert(!params.top_level);
-
-	BoundBox bbox = BoundBox::empty;
-	uint visibility = 0;
-	refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
-}
-
-void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
-{
-	if(leaf) {
-		int4 *data = &pack.leaf_nodes[idx];
-		int4 c = data[0];
-		/* Refit leaf node. */
-		for(int prim = c.x; prim < c.y; prim++) {
-			int pidx = pack.prim_index[prim];
-			int tob = pack.prim_object[prim];
-			Object *ob = objects[tob];
-
-			if(pidx == -1) {
-				/* Object instance. */
-				bbox.grow(ob->bounds);
-			}
-			else {
-				/* Primitives. */
-				const Mesh *mesh = ob->mesh;
-
-				if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
-					/* Curves. */
-					int str_offset = (params.top_level)? mesh->curve_offset: 0;
-					Mesh::Curve curve = mesh->get_curve(pidx - str_offset);
-					int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);
-
-					curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox);
-
-					visibility |= PATH_RAY_CURVE;
-
-					/* Motion curves. */
-					if(mesh->use_motion_blur) {
-						Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
-						if(attr) {
-							size_t mesh_size = mesh->curve_keys.size();
-							size_t steps = mesh->motion_steps - 1;
-							float3 *key_steps = attr->data_float3();
-
-							for(size_t i = 0; i < steps; i++)
-								curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox);
-						}
-					}
-				}
-				else {
-					/* Triangles. */
-					int tri_offset = (params.top_level)? mesh->tri_offset: 0;
-					Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset);
-					const float3 *vpos = &mesh->verts[0];
-
-					triangle.bounds_grow(vpos, bbox);
-
-					/* Motion triangles. */
-					if(mesh->use_motion_blur) {
-						Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
-						if(attr) {
-							size_t mesh_size = mesh->verts.size();
-							size_t steps = mesh->motion_steps - 1;
-							float3 *vert_steps = attr->data_float3();
-
-							for(size_t i = 0; i < steps; i++)
-								triangle.bounds_grow(vert_steps + i*mesh_size, bbox);
-						}
-					}
-				}
-			}
-
-			visibility |= ob->visibility;
-		}
-
-		/* TODO(sergey): This is actually a copy of pack_leaf(),
-		 * but this chunk of code only knows actual data and has
-		 * no idea about BVHNode.
-		 *
-		 * Would be nice to de-duplicate code, but trying to make
-		 * making code more general ends up in much nastier code
-		 * in my opinion so far.
-		 *
-		 * Same applies to the inner nodes case below.
-		 */
-		float4 leaf_data[BVH_QNODE_LEAF_SIZE];
-		leaf_data[0].x = __int_as_float(c.x);
-		leaf_data[0].y = __int_as_float(c.y);
-		leaf_data[0].z = __uint_as_float(visibility);
-		leaf_data[0].w = __uint_as_float(c.w);
-		memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_QNODE_LEAF_SIZE);
-	}
-	else {
-		int4 *data = &pack.nodes[idx];
-		bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0;
-		int4 c;
-		if(is_unaligned) {
-			c = data[13];
-		}
-		else {
-			c = data[7];
-		}
-		/* Refit inner node, set bbox from children. */
-		BoundBox child_bbox[4] = {BoundBox::empty,
-		                          BoundBox::empty,
-		                          BoundBox::empty,
-		                          BoundBox::empty};
-		uint child_visibility[4] = {0};
-		int num_nodes = 0;
-
-		for(int i = 0; i < 4; ++i) {
-			if(c[i] != 0) {
-				refit_node((c[i] < 0)? -c[i]-1: c[i], (c[i] < 0),
-				           child_bbox[i], child_visibility[i]);
-				++num_nodes;
-				bbox.grow(child_bbox[i]);
-				visibility |= child_visibility[i];
-			}
-		}
-
-		if(is_unaligned) {
-			Transform aligned_space[4] = {transform_identity(),
-			                              transform_identity(),
-			                              transform_identity(),
-			                              transform_identity()};
-			pack_unaligned_node(idx,
-			                    aligned_space,
-			                    child_bbox,
-			                    &c[0],
-			                    visibility,
-			                    0.0f,
-			                    1.0f,
-			                    4);
-		}
-		else {
-			pack_aligned_node(idx,
-			                  child_bbox,
-			                  &c[0],
-			                  visibility,
-			                  0.0f,
-			                  1.0f,
-			                  4);
-		}
-	}
-}
-
 CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h
index 08f41fc736f..6a82f915692 100644
--- a/intern/cycles/bvh/bvh.h
+++ b/intern/cycles/bvh/bvh.h
@@ -18,10 +18,10 @@
 #ifndef __BVH_H__
 #define __BVH_H__
 
-#include "bvh_params.h"
+#include "bvh/bvh_params.h"
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -33,15 +33,8 @@ class LeafNode;
 class Object;
 class Progress;
 
-#define BVH_NODE_SIZE	4
-#define BVH_NODE_LEAF_SIZE	1
-#define BVH_QNODE_SIZE	8
-#define BVH_QNODE_LEAF_SIZE	1
-#define BVH_ALIGN		4096
-#define TRI_NODE_SIZE	3
-
-#define BVH_UNALIGNED_NODE_SIZE 7
-#define BVH_UNALIGNED_QNODE_SIZE 14
+#define BVH_ALIGN     4096
+#define TRI_NODE_SIZE 3
 
 /* Packed BVH
  *
@@ -54,7 +47,7 @@ struct PackedBVH {
 	/* BVH leaf nodes storage. */
 	array<int4> leaf_nodes;
 	/* object index to BVH node index mapping for instances */
-	array<int> object_node; 
+	array<int> object_node;
 	/* Mapping from primitive index to index in triangle array. */
 	array<uint> prim_tri_index;
 	/* Continuous storage of triangle vertices. */
@@ -98,6 +91,9 @@ public:
 protected:
 	BVH(const BVHParams& params, const vector<Object*>& objects);
 
+	/* Refit range of primitives. */
+	void refit_primitives(int start, int end, BoundBox& bbox, uint& visibility);
+
 	/* triangles and strands */
 	void pack_primitives();
 	void pack_triangle(int idx, float4 storage[3]);
@@ -110,95 +106,16 @@ protected:
 	virtual void refit_nodes() = 0;
 };
 
-/* Regular BVH
- *
- * Typical BVH with each node having two children. */
-
-class RegularBVH : public BVH {
-protected:
-	/* constructor */
-	friend class BVH;
-	RegularBVH(const BVHParams& params, const vector<Object*>& objects);
-
-	/* pack */
-	void pack_nodes(const BVHNode *root);
-
-	void pack_leaf(const BVHStackEntry& e,
-	               const LeafNode *leaf);
-	void pack_inner(const BVHStackEntry& e,
-	                const BVHStackEntry& e0,
-	                const BVHStackEntry& e1);
-
-	void pack_aligned_inner(const BVHStackEntry& e,
-	                        const BVHStackEntry& e0,
-	                        const BVHStackEntry& e1);
-	void pack_aligned_node(int idx,
-	                       const BoundBox& b0,
-	                       const BoundBox& b1,
-	                       int c0, int c1,
-	                       uint visibility0, uint visibility1);
-
-	void pack_unaligned_inner(const BVHStackEntry& e,
-	                          const BVHStackEntry& e0,
-	                          const BVHStackEntry& e1);
-	void pack_unaligned_node(int idx,
-	                         const Transform& aligned_space0,
-	                         const Transform& aligned_space1,
-	                         const BoundBox& b0,
-	                         const BoundBox& b1,
-	                         int c0, int c1,
-	                         uint visibility0, uint visibility1);
-
-	/* refit */
-	void refit_nodes();
-	void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility);
-};
-
-/* QBVH
- *
- * Quad BVH, with each node having four children, to use with SIMD instructions. */
+/* Pack Utility: pairs a BVH node with the index it is packed at. */
+struct BVHStackEntry
+{
+	const BVHNode *node;
+	int idx;
 
-class QBVH : public BVH {
-protected:
-	/* constructor */
-	friend class BVH;
-	QBVH(const BVHParams& params, const vector<Object*>& objects);
-
-	/* pack */
-	void pack_nodes(const BVHNode *root);
-
-	void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf);
-	void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num);
-
-	void pack_aligned_inner(const BVHStackEntry& e,
-	                        const BVHStackEntry *en,
-	                        int num);
-	void pack_aligned_node(int idx,
-	                       const BoundBox *bounds,
-	                       const int *child,
-	                       const uint visibility,
-	                       const float time_from,
-	                       const float time_to,
-	                       const int num);
-
-	void pack_unaligned_inner(const BVHStackEntry& e,
-	                          const BVHStackEntry *en,
-	                          int num);
-	void pack_unaligned_node(int idx,
-	                         const Transform *aligned_space,
-	                         const BoundBox *bounds,
-	                         const int *child,
-	                         const uint visibility,
-	                         const float time_from,
-	                         const float time_to,
-	                         const int num);
-
-	/* refit */
-	void refit_nodes();
-	void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility);
+	BVHStackEntry(const BVHNode *n = 0, int i = 0);
+	int encodeIdx() const;
 };
 
 CCL_NAMESPACE_END
 
 #endif /* __BVH_H__ */
-
diff --git a/intern/cycles/bvh/bvh2.cpp b/intern/cycles/bvh/bvh2.cpp
new file mode 100644
index 00000000000..9d89d2b6afb
--- /dev/null
+++ b/intern/cycles/bvh/bvh2.cpp
@@ -0,0 +1,303 @@
+/*
+ * Adapted from code copyright 2009-2010 NVIDIA Corporation
+ * Modifications Copyright 2011, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "bvh/bvh2.h"
+
+#include "render/mesh.h"
+#include "render/object.h"
+
+#include "bvh/bvh_node.h"
+#include "bvh/bvh_unaligned.h"
+
+CCL_NAMESPACE_BEGIN
+
+static bool node_bvh_is_unaligned(const BVHNode *node)
+{
+	const BVHNode *node0 = node->get_child(0),
+	              *node1 = node->get_child(1);
+	return node0->is_unaligned || node1->is_unaligned;
+}
+
+BVH2::BVH2(const BVHParams& params_, const vector<Object*>& objects_)
+: BVH(params_, objects_)
+{
+}
+
+void BVH2::pack_leaf(const BVHStackEntry& e,
+                     const LeafNode *leaf)
+{
+	assert(e.idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size());
+	float4 data[BVH_NODE_LEAF_SIZE];
+	memset(data, 0, sizeof(data));
+	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
+		/* object */
+		data[0].x = __int_as_float(~(leaf->lo));
+		data[0].y = __int_as_float(0);
+	}
+	else {
+		/* triangle */
+		data[0].x = __int_as_float(leaf->lo);
+		data[0].y = __int_as_float(leaf->hi);
+	}
+	data[0].z = __uint_as_float(leaf->visibility);
+	if(leaf->num_triangles() != 0) {
+		data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
+	}
+
+	memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_NODE_LEAF_SIZE);
+}
+
+void BVH2::pack_inner(const BVHStackEntry& e,
+                      const BVHStackEntry& e0,
+                      const BVHStackEntry& e1)
+{
+	if(e0.node->is_unaligned || e1.node->is_unaligned) {
+		pack_unaligned_inner(e, e0, e1);
+	} else {
+		pack_aligned_inner(e, e0, e1);
+	}
+}
+
+void BVH2::pack_aligned_inner(const BVHStackEntry& e,
+                              const BVHStackEntry& e0,
+                              const BVHStackEntry& e1)
+{
+	pack_aligned_node(e.idx,
+	                  e0.node->bounds, e1.node->bounds,
+	                  e0.encodeIdx(), e1.encodeIdx(),
+	                  e0.node->visibility, e1.node->visibility);
+}
+
+void BVH2::pack_aligned_node(int idx,
+                             const BoundBox& b0,
+                             const BoundBox& b1,
+                             int c0, int c1,
+                             uint visibility0, uint visibility1)
+{
+	assert(idx + BVH_NODE_SIZE <= pack.nodes.size());
+	assert(c0 < 0 || c0 < pack.nodes.size());
+	assert(c1 < 0 || c1 < pack.nodes.size());
+
+	int4 data[BVH_NODE_SIZE] = {
+		make_int4(visibility0 & ~PATH_RAY_NODE_UNALIGNED,
+		          visibility1 & ~PATH_RAY_NODE_UNALIGNED,
+		          c0, c1),
+		make_int4(__float_as_int(b0.min.x),
+		          __float_as_int(b1.min.x),
+		          __float_as_int(b0.max.x),
+		          __float_as_int(b1.max.x)),
+		make_int4(__float_as_int(b0.min.y),
+		          __float_as_int(b1.min.y),
+		          __float_as_int(b0.max.y),
+		          __float_as_int(b1.max.y)),
+		make_int4(__float_as_int(b0.min.z),
+		          __float_as_int(b1.min.z),
+		          __float_as_int(b0.max.z),
+		          __float_as_int(b1.max.z)),
+	};
+
+	memcpy(&pack.nodes[idx], data, sizeof(int4)*BVH_NODE_SIZE);
+}
+
+void BVH2::pack_unaligned_inner(const BVHStackEntry& e,
+                                const BVHStackEntry& e0,
+                                const BVHStackEntry& e1)
+{
+	pack_unaligned_node(e.idx,
+	                    e0.node->get_aligned_space(),
+	                    e1.node->get_aligned_space(),
+	                    e0.node->bounds,
+	                    e1.node->bounds,
+	                    e0.encodeIdx(), e1.encodeIdx(),
+	                    e0.node->visibility, e1.node->visibility);
+}
+
+void BVH2::pack_unaligned_node(int idx,
+                               const Transform& aligned_space0,
+                               const Transform& aligned_space1,
+                               const BoundBox& bounds0,
+                               const BoundBox& bounds1,
+                               int c0, int c1,
+                               uint visibility0, uint visibility1)
+{
+	assert(idx + BVH_UNALIGNED_NODE_SIZE <= pack.nodes.size());
+	assert(c0 < 0 || c0 < pack.nodes.size());
+	assert(c1 < 0 || c1 < pack.nodes.size());
+
+	float4 data[BVH_UNALIGNED_NODE_SIZE];
+	Transform space0 = BVHUnaligned::compute_node_transform(bounds0,
+	                                                        aligned_space0);
+	Transform space1 = BVHUnaligned::compute_node_transform(bounds1,
+	                                                        aligned_space1);
+	data[0] = make_float4(__uint_as_float(visibility0 | PATH_RAY_NODE_UNALIGNED),
+	                      __uint_as_float(visibility1 | PATH_RAY_NODE_UNALIGNED),
+	                      __int_as_float(c0),
+	                      __int_as_float(c1));
+	/* Header is followed by the three rows of each child's transform. */
+	data[1] = space0.x;
+	data[2] = space0.y;
+	data[3] = space0.z;
+	data[4] = space1.x;
+	data[5] = space1.y;
+	data[6] = space1.z;
+
+	memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_NODE_SIZE);
+}
+
+void BVH2::pack_nodes(const BVHNode *root)
+{
+	const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT);
+	const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
+	assert(num_leaf_nodes <= num_nodes);
+	const size_t num_inner_nodes = num_nodes - num_leaf_nodes;
+	size_t node_size;
+	if(params.use_unaligned_nodes) {
+		const size_t num_unaligned_nodes =
+		        root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_COUNT);
+		node_size = (num_unaligned_nodes * BVH_UNALIGNED_NODE_SIZE) +
+		            (num_inner_nodes - num_unaligned_nodes) * BVH_NODE_SIZE;
+	}
+	else {
+		node_size = num_inner_nodes * BVH_NODE_SIZE;
+	}
+	/* Resize arrays */
+	pack.nodes.clear();
+	pack.leaf_nodes.clear();
+	/* For top level BVH, first merge existing BVH's so we know the offsets. */
+	if(params.top_level) {
+		pack_instances(node_size, num_leaf_nodes*BVH_NODE_LEAF_SIZE);
+	}
+	else {
+		pack.nodes.resize(node_size);
+		pack.leaf_nodes.resize(num_leaf_nodes*BVH_NODE_LEAF_SIZE);
+	}
+
+	int nextNodeIdx = 0, nextLeafNodeIdx = 0;
+
+	vector<BVHStackEntry> stack;
+	stack.reserve(BVHParams::MAX_DEPTH*2);
+	if(root->is_leaf()) {
+		stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
+	}
+	else {
+		stack.push_back(BVHStackEntry(root, nextNodeIdx));
+		nextNodeIdx += node_bvh_is_unaligned(root)
+		                       ? BVH_UNALIGNED_NODE_SIZE
+		                       : BVH_NODE_SIZE;
+	}
+
+	while(stack.size()) {
+		BVHStackEntry e = stack.back();
+		stack.pop_back();
+
+		if(e.node->is_leaf()) {
+			/* leaf node */
+			const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node);
+			pack_leaf(e, leaf);
+		}
+		else {
+			/* inner node */
+			int idx[2];
+			for(int i = 0; i < 2; ++i) {
+				if(e.node->get_child(i)->is_leaf()) {
+					idx[i] = nextLeafNodeIdx++;
+				}
+				else {
+					idx[i] = nextNodeIdx;
+					nextNodeIdx += node_bvh_is_unaligned(e.node->get_child(i))
+					                       ? BVH_UNALIGNED_NODE_SIZE
+					                       : BVH_NODE_SIZE;
+				}
+			}
+
+			stack.push_back(BVHStackEntry(e.node->get_child(0), idx[0]));
+			stack.push_back(BVHStackEntry(e.node->get_child(1), idx[1]));
+
+			pack_inner(e, stack[stack.size()-2], stack[stack.size()-1]);
+		}
+	}
+	assert(node_size == nextNodeIdx);
+	/* root index to start traversal at, to handle case of single leaf node */
+	pack.root_index = (root->is_leaf())? -1: 0;
+}
+
+void BVH2::refit_nodes()
+{
+	assert(!params.top_level);
+
+	BoundBox bbox = BoundBox::empty;
+	uint visibility = 0;
+	refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
+}
+
+void BVH2::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
+{
+	if(leaf) {
+		/* refit leaf node */
+		assert(idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size());
+		const int4 *data = &pack.leaf_nodes[idx];
+		const int c0 = data[0].x;
+		const int c1 = data[0].y;
+
+		BVH::refit_primitives(c0, c1, bbox, visibility);
+
+		/* TODO(sergey): De-duplicate with pack_leaf(). */
+		float4 leaf_data[BVH_NODE_LEAF_SIZE];
+		leaf_data[0].x = __int_as_float(c0);
+		leaf_data[0].y = __int_as_float(c1);
+		leaf_data[0].z = __uint_as_float(visibility);
+		leaf_data[0].w = __uint_as_float(data[0].w);
+		memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_NODE_LEAF_SIZE);
+	}
+	else {
+		assert(idx + BVH_NODE_SIZE <= pack.nodes.size());
+
+		const int4 *data = &pack.nodes[idx];
+		const bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0;
+		const int c0 = data[0].z;
+		const int c1 = data[0].w;
+		/* refit inner node, set bbox from children */
+		BoundBox bbox0 = BoundBox::empty, bbox1 = BoundBox::empty;
+		uint visibility0 = 0, visibility1 = 0;
+
+		refit_node((c0 < 0)? -c0-1: c0, (c0 < 0), bbox0, visibility0);
+		refit_node((c1 < 0)? -c1-1: c1, (c1 < 0), bbox1, visibility1);
+
+		if(is_unaligned) {
+			Transform aligned_space = transform_identity();
+			pack_unaligned_node(idx,
+			                    aligned_space, aligned_space,
+			                    bbox0, bbox1,
+			                    c0, c1,
+			                    visibility0,
+			                    visibility1);
+		}
+		else {
+			pack_aligned_node(idx,
+			                  bbox0, bbox1,
+			                  c0, c1,
+			                  visibility0,
+			                  visibility1);
+		}
+
+		bbox.grow(bbox0);
+		bbox.grow(bbox1);
+		visibility = visibility0|visibility1;
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh2.h b/intern/cycles/bvh/bvh2.h
new file mode 100644
index 00000000000..df65ddca5b7
--- /dev/null
+++ b/intern/cycles/bvh/bvh2.h
@@ -0,0 +1,87 @@
+/*
+ * Adapted from code copyright 2009-2010 NVIDIA Corporation
+ * Modifications Copyright 2011, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BVH2_H__
+#define __BVH2_H__
+
+#include "bvh/bvh.h"
+#include "bvh/bvh_params.h"
+
+#include "util/util_types.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BVHNode;
+struct BVHStackEntry;
+class BVHParams;
+class BoundBox;
+class LeafNode;
+class Object;
+class Progress;
+
+#define BVH_NODE_SIZE           4
+#define BVH_NODE_LEAF_SIZE      1
+#define BVH_UNALIGNED_NODE_SIZE 7
+
+/* BVH2
+ *
+ * Typical BVH with each node having two children.
+ */
+class BVH2 : public BVH {
+protected:
+	/* constructor */
+	friend class BVH;
+	BVH2(const BVHParams& params, const vector<Object*>& objects);
+
+	/* pack */
+	void pack_nodes(const BVHNode *root);
+
+	void pack_leaf(const BVHStackEntry& e,
+	               const LeafNode *leaf);
+	void pack_inner(const BVHStackEntry& e,
+	                const BVHStackEntry& e0,
+	                const BVHStackEntry& e1);
+
+	void pack_aligned_inner(const BVHStackEntry& e,
+	                        const BVHStackEntry& e0,
+	                        const BVHStackEntry& e1);
+	void pack_aligned_node(int idx,
+	                       const BoundBox& b0,
+	                       const BoundBox& b1,
+	                       int c0, int c1,
+	                       uint visibility0, uint visibility1);
+
+	void pack_unaligned_inner(const BVHStackEntry& e,
+	                          const BVHStackEntry& e0,
+	                          const BVHStackEntry& e1);
+	void pack_unaligned_node(int idx,
+	                         const Transform& aligned_space0,
+	                         const Transform& aligned_space1,
+	                         const BoundBox& b0,
+	                         const BoundBox& b1,
+	                         int c0, int c1,
+	                         uint visibility0, uint visibility1);
+
+	/* refit */
+	void refit_nodes();
+	void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __BVH2_H__ */
diff --git a/intern/cycles/bvh/bvh4.cpp b/intern/cycles/bvh/bvh4.cpp
new file mode 100644
index 00000000000..4faf47af7bb
--- /dev/null
+++ b/intern/cycles/bvh/bvh4.cpp
@@ -0,0 +1,455 @@
+/*
+ * Adapted from code copyright 2009-2010 NVIDIA Corporation
+ * Modifications Copyright 2011, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "bvh/bvh4.h"
+
+#include "render/mesh.h"
+#include "render/object.h"
+
+#include "bvh/bvh_node.h"
+#include "bvh/bvh_unaligned.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Can we avoid this somehow or make more generic?
+ *
+ * Perhaps we can merge nodes in actual tree and make our
+ * life easier all over the place.
+ */
+static bool node_qbvh_is_unaligned(const BVHNode *node)
+{
+	const BVHNode *node0 = node->get_child(0),
+	              *node1 = node->get_child(1);
+	bool has_unaligned = false;
+	if(node0->is_leaf()) {
+		has_unaligned |= node0->is_unaligned;
+	}
+	else {
+		has_unaligned |= node0->get_child(0)->is_unaligned;
+		has_unaligned |= node0->get_child(1)->is_unaligned;
+	}
+	if(node1->is_leaf()) {
+		has_unaligned |= node1->is_unaligned;
+	}
+	else {
+		has_unaligned |= node1->get_child(0)->is_unaligned;
+		has_unaligned |= node1->get_child(1)->is_unaligned;
+	}
+	return has_unaligned;
+}
+
+BVH4::BVH4(const BVHParams& params_, const vector<Object*>& objects_)
+: BVH(params_, objects_)
+{
+	params.bvh_layout = BVH_LAYOUT_BVH4;
+}
+
+void BVH4::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf)
+{
+	float4 data[BVH_QNODE_LEAF_SIZE];
+	memset(data, 0, sizeof(data));
+	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
+		/* object */
+		data[0].x = __int_as_float(~(leaf->lo));
+		data[0].y = __int_as_float(0);
+	}
+	else {
+		/* triangle */
+		data[0].x = __int_as_float(leaf->lo);
+		data[0].y = __int_as_float(leaf->hi);
+	}
+	data[0].z = __uint_as_float(leaf->visibility);
+	if(leaf->num_triangles() != 0) {
+		data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
+	}
+
+	memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE);
+}
+
+void BVH4::pack_inner(const BVHStackEntry& e,
+                      const BVHStackEntry *en,
+                      int num)
+{
+	bool has_unaligned = false;
+	/* Check whether we have to create unaligned node or all nodes are aligned
+	 * and we can cut some corner here.
+	 */
+	if(params.use_unaligned_nodes) {
+		for(int i = 0; i < num; i++) {
+			if(en[i].node->is_unaligned) {
+				has_unaligned = true;
+				break;
+			}
+		}
+	}
+	if(has_unaligned) {
+		/* Create unaligned node with orientation transform for each of the
+		 * children.
+		 */
+		pack_unaligned_inner(e, en, num);
+	}
+	else {
+		/* There's no unaligned children, pack into AABB node. */
+		pack_aligned_inner(e, en, num);
+	}
+}
+
+void BVH4::pack_aligned_inner(const BVHStackEntry& e,
+                              const BVHStackEntry *en,
+                              int num)
+{
+	BoundBox bounds[4];
+	int child[4];
+	for(int i = 0; i < num; ++i) {
+		bounds[i] = en[i].node->bounds;
+		child[i] = en[i].encodeIdx();
+	}
+	pack_aligned_node(e.idx,
+	                  bounds,
+	                  child,
+	                  e.node->visibility,
+	                  e.node->time_from,
+	                  e.node->time_to,
+	                  num);
+}
+
+void BVH4::pack_aligned_node(int idx,
+                             const BoundBox *bounds,
+                             const int *child,
+                             const uint visibility,
+                             const float time_from,
+                             const float time_to,
+                             const int num)
+{
+	float4 data[BVH_QNODE_SIZE];
+	memset(data, 0, sizeof(data));
+
+	data[0].x = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED);
+	data[0].y = time_from;
+	data[0].z = time_to;
+
+	for(int i = 0; i < num; i++) {
+		float3 bb_min = bounds[i].min;
+		float3 bb_max = bounds[i].max;
+
+		data[1][i] = bb_min.x;
+		data[2][i] = bb_max.x;
+		data[3][i] = bb_min.y;
+		data[4][i] = bb_max.y;
+		data[5][i] = bb_min.z;
+		data[6][i] = bb_max.z;
+
+		data[7][i] = __int_as_float(child[i]);
+	}
+
+	for(int i = num; i < 4; i++) {
+		/* We store BB which would never be recorded as intersection
+		 * so kernel might safely assume there are always 4 child nodes.
+		 */
+		data[1][i] = FLT_MAX;
+		data[2][i] = -FLT_MAX;
+
+		data[3][i] = FLT_MAX;
+		data[4][i] = -FLT_MAX;
+
+		data[5][i] = FLT_MAX;
+		data[6][i] = -FLT_MAX;
+
+		data[7][i] = __int_as_float(0);
+	}
+
+	memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_QNODE_SIZE);
+}
+
+void BVH4::pack_unaligned_inner(const BVHStackEntry& e,
+                                const BVHStackEntry *en,
+                                int num)
+{
+	Transform aligned_space[4];
+	BoundBox bounds[4];
+	int child[4];
+	for(int i = 0; i < num; ++i) {
+		aligned_space[i] = en[i].node->get_aligned_space();
+		bounds[i] = en[i].node->bounds;
+		child[i] = en[i].encodeIdx();
+	}
+	pack_unaligned_node(e.idx,
+	                    aligned_space,
+	                    bounds,
+	                    child,
+	                    e.node->visibility,
+	                    e.node->time_from,
+	                    e.node->time_to,
+	                    num);
+}
+
+void BVH4::pack_unaligned_node(int idx,
+                               const Transform *aligned_space,
+                               const BoundBox *bounds,
+                               const int *child,
+                               const uint visibility,
+                               const float time_from,
+                               const float time_to,
+                               const int num)
+{
+	float4 data[BVH_UNALIGNED_QNODE_SIZE];
+	memset(data, 0, sizeof(data));
+
+	data[0].x = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED);
+	data[0].y = time_from;
+	data[0].z = time_to;
+
+	for(int i = 0; i < num; i++) {
+		Transform space = BVHUnaligned::compute_node_transform(
+		        bounds[i],
+		        aligned_space[i]);
+
+		data[1][i] = space.x.x;
+		data[2][i] = space.x.y;
+		data[3][i] = space.x.z;
+
+		data[4][i] = space.y.x;
+		data[5][i] = space.y.y;
+		data[6][i] = space.y.z;
+
+		data[7][i] = space.z.x;
+		data[8][i] = space.z.y;
+		data[9][i] = space.z.z;
+
+		data[10][i] = space.x.w;
+		data[11][i] = space.y.w;
+		data[12][i] = space.z.w;
+
+		data[13][i] = __int_as_float(child[i]);
+	}
+
+	for(int i = num; i < 4; i++) {
+		/* We store BB which would never be recorded as intersection
+		 * so kernel might safely assume there are always 4 child nodes.
+		 */
+
+		data[1][i] = NAN;
+		data[2][i] = NAN;
+		data[3][i] = NAN;
+
+		data[4][i] = NAN;
+		data[5][i] = NAN;
+		data[6][i] = NAN;
+
+		data[7][i] = NAN;
+		data[8][i] = NAN;
+		data[9][i] = NAN;
+
+		data[10][i] = NAN;
+		data[11][i] = NAN;
+		data[12][i] = NAN;
+
+		data[13][i] = __int_as_float(0);
+	}
+
+	memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_QNODE_SIZE);
+}
+
+/* Quad SIMD Nodes */
+
+void BVH4::pack_nodes(const BVHNode *root)
+{
+	/* Calculate size of the arrays required. */
+	const size_t num_nodes = root->getSubtreeSize(BVH_STAT_QNODE_COUNT);
+	const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
+	assert(num_leaf_nodes <= num_nodes);
+	const size_t num_inner_nodes = num_nodes - num_leaf_nodes;
+	size_t node_size;
+	if(params.use_unaligned_nodes) {
+		const size_t num_unaligned_nodes =
+		        root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_QNODE_COUNT);
+		node_size = (num_unaligned_nodes * BVH_UNALIGNED_QNODE_SIZE) +
+		            (num_inner_nodes - num_unaligned_nodes) * BVH_QNODE_SIZE;
+	}
+	else {
+		node_size = num_inner_nodes * BVH_QNODE_SIZE;
+	}
+	/* Resize arrays. */
+	pack.nodes.clear();
+	pack.leaf_nodes.clear();
+	/* For top level BVH, first merge existing BVH's so we know the offsets. */
+	if(params.top_level) {
+		pack_instances(node_size, num_leaf_nodes*BVH_QNODE_LEAF_SIZE);
+	}
+	else {
+		pack.nodes.resize(node_size);
+		pack.leaf_nodes.resize(num_leaf_nodes*BVH_QNODE_LEAF_SIZE);
+	}
+
+	int nextNodeIdx = 0, nextLeafNodeIdx = 0;
+
+	vector<BVHStackEntry> stack;
+	stack.reserve(BVHParams::MAX_DEPTH*2);
+	if(root->is_leaf()) {
+		stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
+	}
+	else {
+		stack.push_back(BVHStackEntry(root, nextNodeIdx));
+		nextNodeIdx += node_qbvh_is_unaligned(root)
+		                       ? BVH_UNALIGNED_QNODE_SIZE
+		                       : BVH_QNODE_SIZE;
+	}
+
+	while(stack.size()) {
+		BVHStackEntry e = stack.back();
+		stack.pop_back();
+
+		if(e.node->is_leaf()) {
+			/* leaf node */
+			const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node);
+			pack_leaf(e, leaf);
+		}
+		else {
+			/* Inner node. */
+			const BVHNode *node = e.node;
+			const BVHNode *node0 = node->get_child(0);
+			const BVHNode *node1 = node->get_child(1);
+			/* Collect nodes. */
+			const BVHNode *nodes[4];
+			int numnodes = 0;
+			if(node0->is_leaf()) {
+				nodes[numnodes++] = node0;
+			}
+			else {
+				nodes[numnodes++] = node0->get_child(0);
+				nodes[numnodes++] = node0->get_child(1);
+			}
+			if(node1->is_leaf()) {
+				nodes[numnodes++] = node1;
+			}
+			else {
+				nodes[numnodes++] = node1->get_child(0);
+				nodes[numnodes++] = node1->get_child(1);
+			}
+			/* Push entries on the stack. */
+			for(int i = 0; i < numnodes; ++i) {
+				int idx;
+				if(nodes[i]->is_leaf()) {
+					idx = nextLeafNodeIdx++;
+				}
+				else {
+					idx = nextNodeIdx;
+					nextNodeIdx += node_qbvh_is_unaligned(nodes[i])
+					                       ? BVH_UNALIGNED_QNODE_SIZE
+					                       : BVH_QNODE_SIZE;
+				}
+				stack.push_back(BVHStackEntry(nodes[i], idx));
+			}
+			/* Set node. */
+			pack_inner(e, &stack[stack.size()-numnodes], numnodes);
+		}
+	}
+	assert(node_size == nextNodeIdx);
+	/* Root index to start traversal at, to handle case of single leaf node. */
+	pack.root_index = (root->is_leaf())? -1: 0;
+}
+
+void BVH4::refit_nodes()
+{
+	assert(!params.top_level);
+
+	BoundBox bbox = BoundBox::empty;
+	uint visibility = 0;
+	refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
+}
+
+/* Recompute the packed node at idx from its primitives or children, re-pack
+ * it in place and report the result through bbox and visibility. */
+void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
+{
+	if(leaf) {
+		/* Refit leaf node. */
+		int4 *data = &pack.leaf_nodes[idx];
+		int4 c = data[0];
+
+		BVH::refit_primitives(c.x, c.y, bbox, visibility);
+
+		/* TODO(sergey): This is actually a copy of pack_leaf(), but this
+		 * chunk of code only knows actual data and has no idea about
+		 * BVHNode. Would be nice to de-duplicate code, but trying to make
+		 * the code more general ends up in much nastier code in my opinion
+		 * so far. Same applies to the inner nodes case below.
+		 */
+		float4 leaf_data[BVH_QNODE_LEAF_SIZE];
+		leaf_data[0].x = __int_as_float(c.x);
+		leaf_data[0].y = __int_as_float(c.y);
+		leaf_data[0].z = __uint_as_float(visibility);
+		leaf_data[0].w = __uint_as_float(c.w);
+		memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_QNODE_LEAF_SIZE);
+	}
+	else {
+		int4 *data = &pack.nodes[idx];
+		bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0;
+		int4 c;
+		if(is_unaligned) {
+			c = data[13];
+		}
+		else {
+			c = data[7];
+		}
+		/* Refit inner node, set bbox from children. */
+		BoundBox child_bbox[4] = {BoundBox::empty,
+		                          BoundBox::empty,
+		                          BoundBox::empty,
+		                          BoundBox::empty};
+		uint child_visibility[4] = {0};
+		int num_nodes = 0;
+
+		/* Child index 0 marks an unused slot; the packers fill used ones first. */
+		for(int i = 0; i < 4; ++i) {
+			if(c[i] != 0) {
+				refit_node((c[i] < 0)? -c[i]-1: c[i], (c[i] < 0),
+				           child_bbox[i], child_visibility[i]);
+				++num_nodes;
+				bbox.grow(child_bbox[i]);
+				visibility |= child_visibility[i];
+			}
+		}
+
+		/* Re-pack used slots only, so the packers pad the remaining ones. */
+		if(is_unaligned) {
+			Transform aligned_space[4] = {transform_identity(),
+			                              transform_identity(),
+			                              transform_identity(),
+			                              transform_identity()};
+			pack_unaligned_node(idx,
+			                    aligned_space,
+			                    child_bbox,
+			                    &c[0],
+			                    visibility,
+			                    0.0f,
+			                    1.0f,
+			                    num_nodes);
+		}
+		else {
+			pack_aligned_node(idx,
+			                  child_bbox,
+			                  &c[0],
+			                  visibility,
+			                  0.0f,
+			                  1.0f,
+			                  num_nodes);
+		}
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh4.h b/intern/cycles/bvh/bvh4.h
new file mode 100644
index 00000000000..310909a37e1
--- /dev/null
+++ b/intern/cycles/bvh/bvh4.h
@@ -0,0 +1,87 @@
+/*
+ * Adapted from code copyright 2009-2010 NVIDIA Corporation
+ * Modifications Copyright 2011, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BVH4_H__
+#define __BVH4_H__
+
+#include "bvh/bvh.h"
+#include "bvh/bvh_params.h"
+
+#include "util/util_types.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BVHNode;
+struct BVHStackEntry;
+class BVHParams;
+class BoundBox;
+class LeafNode;
+class Object;
+class Progress;
+
+#define BVH_QNODE_SIZE           8
+#define BVH_QNODE_LEAF_SIZE      1
+#define BVH_UNALIGNED_QNODE_SIZE 14
+
+/* BVH4
+ *
+ * Quad BVH, with each node having four children, to use with SIMD instructions.
+ */
+class BVH4 : public BVH {
+protected:
+	/* constructor */
+	friend class BVH;
+	BVH4(const BVHParams& params, const vector<Object*>& objects);
+
+	/* pack */
+	void pack_nodes(const BVHNode *root);
+
+	void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf);
+	void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num);
+
+	void pack_aligned_inner(const BVHStackEntry& e,
+	                        const BVHStackEntry *en,
+	                        int num);
+	void pack_aligned_node(int idx,
+	                       const BoundBox *bounds,
+	                       const int *child,
+	                       const uint visibility,
+	                       const float time_from,
+	                       const float time_to,
+	                       const int num);
+
+	void pack_unaligned_inner(const BVHStackEntry& e,
+	                          const BVHStackEntry *en,
+	                          int num);
+	void pack_unaligned_node(int idx,
+	                         const Transform *aligned_space,
+	                         const BoundBox *bounds,
+	                         const int *child,
+	                         const uint visibility,
+	                         const float time_from,
+	                         const float time_to,
+	                         const int num);
+
+	/* refit */
+	void refit_nodes();
+	void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __BVH4_H__ */
diff --git a/intern/cycles/bvh/bvh_binning.cpp b/intern/cycles/bvh/bvh_binning.cpp
index 5ddd7349f7b..63a7fc11668 100644
--- a/intern/cycles/bvh/bvh_binning.cpp
+++ b/intern/cycles/bvh/bvh_binning.cpp
@@ -17,13 +17,13 @@
 
 //#define __KERNEL_SSE__
 
-#include <stdlib.h>
+#include "bvh/bvh_binning.h"
 
-#include "bvh_binning.h"
+#include <stdlib.h>
 
-#include "util_algorithm.h"
-#include "util_boundbox.h"
-#include "util_types.h"
+#include "util/util_algorithm.h"
+#include "util/util_boundbox.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/bvh/bvh_binning.h b/intern/cycles/bvh/bvh_binning.h
index 52955f70151..c2e259b1696 100644
--- a/intern/cycles/bvh/bvh_binning.h
+++ b/intern/cycles/bvh/bvh_binning.h
@@ -18,10 +18,10 @@
 #ifndef __BVH_BINNING_H__
 #define __BVH_BINNING_H__
 
-#include "bvh_params.h"
-#include "bvh_unaligned.h"
+#include "bvh/bvh_params.h"
+#include "bvh/bvh_unaligned.h"
 
-#include "util_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -111,5 +111,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif
-
+#endif  /* __BVH_BINNING_H__ */
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index fcbc50f4f6f..c0b3d683e37 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -15,26 +15,26 @@
  * limitations under the License.
  */
 
-#include "bvh_binning.h"
-#include "bvh_build.h"
-#include "bvh_node.h"
-#include "bvh_params.h"
+#include "bvh/bvh_build.h"
+
+#include "bvh/bvh_binning.h"
+#include "bvh/bvh_node.h"
+#include "bvh/bvh_params.h"
 #include "bvh_split.h"
 
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "curves.h"
-
-#include "util_algorithm.h"
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_progress.h"
-#include "util_stack_allocator.h"
-#include "util_simd.h"
-#include "util_time.h"
-#include "util_queue.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/curves.h"
+
+#include "util/util_algorithm.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_stack_allocator.h"
+#include "util/util_simd.h"
+#include "util/util_time.h"
+#include "util/util_queue.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -128,7 +128,7 @@ void BVHBuild::add_reference_triangles(BoundBox& root, BoundBox& center, Mesh *m
 		if(attr_mP == NULL) {
 			BoundBox bounds = BoundBox::empty;
 			t.bounds_grow(verts, bounds);
-			if(bounds.valid()) {
+			if(bounds.valid() && t.valid(verts)) {
 				references.push_back(BVHReference(bounds,
 				                                  j,
 				                                  i,
@@ -528,7 +528,9 @@ BVHNode* BVHBuild::run()
 			        << "  Allocation slop factor: "
 			               << ((prim_type.capacity() != 0)
 			                       ? (float)prim_type.size() / prim_type.capacity()
-			                       : 1.0f) << "\n";
+			                       : 1.0f) << "\n"
+			        << "  Maximum depth: "
+			        << string_human_readable_number(rootnode->getSubtreeSize(BVH_STAT_DEPTH))  << "\n";
 		}
 	}
 
@@ -670,7 +672,7 @@ BVHNode* BVHBuild::build_node(const BVHObjectBinning& range, int level)
 				return create_leaf_node(range, references);
 			}
 		}
-		/* Check whether unaligned split is better than the regulat one. */
+		/* Check whether unaligned split is better than the regular one. */
 		if(unalignedSplitSAH < splitSAH) {
 			do_unalinged_split = true;
 		}
@@ -864,10 +866,10 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start,
 			prim_time[start] = make_float2(ref->time_from(), ref->time_to());
 		}
 
-		uint visibility = objects[ref->prim_object()]->visibility;
+		const uint visibility = objects[ref->prim_object()]->visibility_for_tracing();
 		BVHNode *leaf_node =  new LeafNode(ref->bounds(), visibility, start, start+1);
-		leaf_node->m_time_from = ref->time_from();
-		leaf_node->m_time_to = ref->time_to();
+		leaf_node->time_from = ref->time_from();
+		leaf_node->time_to = ref->time_to();
 		return leaf_node;
 	}
 	else {
@@ -876,12 +878,12 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start,
 		BVHNode *leaf1 = create_object_leaf_nodes(ref+mid, start+mid, num-mid);
 
 		BoundBox bounds = BoundBox::empty;
-		bounds.grow(leaf0->m_bounds);
-		bounds.grow(leaf1->m_bounds);
+		bounds.grow(leaf0->bounds);
+		bounds.grow(leaf1->bounds);
 
 		BVHNode *inner_node = new InnerNode(bounds, leaf0, leaf1);
-		inner_node->m_time_from = min(leaf0->m_time_from, leaf1->m_time_from);
-		inner_node->m_time_to = max(leaf0->m_time_to, leaf1->m_time_to);
+		inner_node->time_from = min(leaf0->time_from, leaf1->time_from);
+		inner_node->time_to = max(leaf0->time_to, leaf1->time_to);
 		return inner_node;
 	}
 }
@@ -905,12 +907,13 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 	 *    can not control.
 	 */
 	typedef StackAllocator<256, int> LeafStackAllocator;
+	typedef StackAllocator<256, float2> LeafTimeStackAllocator;
 	typedef StackAllocator<256, BVHReference> LeafReferenceStackAllocator;
 
 	vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM_TOTAL];
 	vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM_TOTAL];
 	vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM_TOTAL];
-	vector<float2, LeafStackAllocator> p_time[PRIMITIVE_NUM_TOTAL];
+	vector<float2, LeafTimeStackAllocator> p_time[PRIMITIVE_NUM_TOTAL];
 	vector<BVHReference, LeafReferenceStackAllocator> p_ref[PRIMITIVE_NUM_TOTAL];
 
 	/* TODO(sergey): In theory we should be able to store references. */
@@ -937,7 +940,7 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 			                                         ref.time_to()));
 
 			bounds[type_index].grow(ref.bounds());
-			visibility[type_index] |= objects[ref.prim_object()]->visibility;
+			visibility[type_index] |= objects[ref.prim_object()]->visibility_for_tracing();
 			if(ref.prim_type() & PRIMITIVE_ALL_CURVE) {
 				visibility[type_index] |= PATH_RAY_CURVE;
 			}
@@ -964,7 +967,7 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 	vector<int, LeafStackAllocator> local_prim_type,
 	                                local_prim_index,
 	                                local_prim_object;
-	vector<float2, LeafStackAllocator> local_prim_time;
+	vector<float2, LeafTimeStackAllocator> local_prim_time;
 	local_prim_type.resize(num_new_prims);
 	local_prim_index.resize(num_new_prims);
 	local_prim_object.resize(num_new_prims);
@@ -1003,19 +1006,19 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 					time_from = min(time_from, ref.time_from());
 					time_to = max(time_to, ref.time_to());
 				}
-				leaf_node->m_time_from = time_from;
-				leaf_node->m_time_to = time_to;
+				leaf_node->time_from = time_from;
+				leaf_node->time_to = time_to;
 			}
 			if(alignment_found) {
 				/* Need to recalculate leaf bounds with new alignment. */
-				leaf_node->m_bounds = BoundBox::empty;
+				leaf_node->bounds = BoundBox::empty;
 				for(int j = 0; j < num; ++j) {
 					const BVHReference &ref = p_ref[i][j];
 					BoundBox ref_bounds =
 					        unaligned_heuristic.compute_aligned_prim_boundbox(
 					                ref,
 					                aligned_space);
-					leaf_node->m_bounds.grow(ref_bounds);
+					leaf_node->bounds.grow(ref_bounds);
 				}
 				/* Set alignment space. */
 				leaf_node->set_aligned_space(aligned_space);
@@ -1038,7 +1041,6 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 		 */
 		start_index = spatial_free_index;
 		spatial_free_index += range.size();
-
 		/* Extend an array when needed. */
 		const size_t range_end = start_index + range.size();
 		if(prim_type.size() < range_end) {
@@ -1064,8 +1066,6 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 				prim_time.resize(range_end);
 			}
 		}
-		spatial_spin_lock.unlock();
-
 		/* Perform actual data copy. */
 		if(new_leaf_data_size > 0) {
 			memcpy(&prim_type[start_index], &local_prim_type[0], new_leaf_data_size);
@@ -1075,6 +1075,7 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 				memcpy(&prim_time[start_index], &local_prim_time[0], sizeof(float2)*num_new_leaf_data);
 			}
 		}
+		spatial_spin_lock.unlock();
 	}
 	else {
 		/* For the regular BVH builder we simply copy new data starting at the
@@ -1098,8 +1099,8 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 	 */
 	for(int i = 0; i < num_leaves; ++i) {
 		LeafNode *leaf = (LeafNode *)leaves[i];
-		leaf->m_lo += start_index;
-		leaf->m_hi += start_index;
+		leaf->lo += start_index;
+		leaf->hi += start_index;
 	}
 
 	/* Create leaf node for object. */
@@ -1128,17 +1129,17 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 		return new InnerNode(range.bounds(), leaves[0], leaves[1]);
 	}
 	else if(num_leaves == 3) {
-		BoundBox inner_bounds = merge(leaves[1]->m_bounds, leaves[2]->m_bounds);
+		BoundBox inner_bounds = merge(leaves[1]->bounds, leaves[2]->bounds);
 		BVHNode *inner = new InnerNode(inner_bounds, leaves[1], leaves[2]);
 		return new InnerNode(range.bounds(), leaves[0], inner);
 	} else {
 		/* Should be doing more branches if more primitive types added. */
 		assert(num_leaves <= 5);
-		BoundBox inner_bounds_a = merge(leaves[0]->m_bounds, leaves[1]->m_bounds);
-		BoundBox inner_bounds_b = merge(leaves[2]->m_bounds, leaves[3]->m_bounds);
+		BoundBox inner_bounds_a = merge(leaves[0]->bounds, leaves[1]->bounds);
+		BoundBox inner_bounds_b = merge(leaves[2]->bounds, leaves[3]->bounds);
 		BVHNode *inner_a = new InnerNode(inner_bounds_a, leaves[0], leaves[1]);
 		BVHNode *inner_b = new InnerNode(inner_bounds_b, leaves[2], leaves[3]);
-		BoundBox inner_bounds_c = merge(inner_a->m_bounds, inner_b->m_bounds);
+		BoundBox inner_bounds_c = merge(inner_a->bounds, inner_b->bounds);
 		BVHNode *inner_c = new InnerNode(inner_bounds_c, inner_a, inner_b);
 		if(num_leaves == 5) {
 			return new InnerNode(range.bounds(), inner_c, leaves[4]);
@@ -1173,8 +1174,8 @@ void BVHBuild::rotate(BVHNode *node, int max_depth)
 		rotate(parent->children[c], max_depth-1);
 
 	/* compute current area of all children */
-	BoundBox bounds0 = parent->children[0]->m_bounds;
-	BoundBox bounds1 = parent->children[1]->m_bounds;
+	BoundBox bounds0 = parent->children[0]->bounds;
+	BoundBox bounds1 = parent->children[1]->bounds;
 
 	float area0 = bounds0.half_area();
 	float area1 = bounds1.half_area();
@@ -1194,8 +1195,8 @@ void BVHBuild::rotate(BVHNode *node, int max_depth)
 		BoundBox& other = (c == 0)? bounds1: bounds0;
 
 		/* transpose child bounds */
-		BoundBox target0 = child->children[0]->m_bounds;
-		BoundBox target1 = child->children[1]->m_bounds;
+		BoundBox target0 = child->children[0]->bounds;
+		BoundBox target1 = child->children[1]->bounds;
 
 		/* compute cost for both possible swaps */
 		float cost0 = merge(other, target1).half_area() - child_area[c];
@@ -1227,7 +1228,7 @@ void BVHBuild::rotate(BVHNode *node, int max_depth)
 	InnerNode *child = (InnerNode*)parent->children[best_child];
 
 	swap(parent->children[best_other], child->children[best_target]);
-	child->m_bounds = merge(child->children[0]->m_bounds, child->children[1]->m_bounds);
+	child->bounds = merge(child->children[0]->bounds, child->children[1]->bounds);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h
index 430efc3e0f6..7b245139819 100644
--- a/intern/cycles/bvh/bvh_build.h
+++ b/intern/cycles/bvh/bvh_build.h
@@ -20,17 +20,17 @@
 
 #include <float.h>
 
-#include "bvh.h"
-#include "bvh_binning.h"
-#include "bvh_unaligned.h"
+#include "bvh/bvh_params.h"
+#include "bvh/bvh_unaligned.h"
 
-#include "util_boundbox.h"
-#include "util_task.h"
-#include "util_vector.h"
+#include "util/util_task.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
+class BoundBox;
 class BVHBuildTask;
+class BVHNode;
 class BVHSpatialSplitBuildTask;
 class BVHParams;
 class InnerNode;
diff --git a/intern/cycles/bvh/bvh_node.cpp b/intern/cycles/bvh/bvh_node.cpp
index 67580e1bc7b..24af919ff46 100644
--- a/intern/cycles/bvh/bvh_node.cpp
+++ b/intern/cycles/bvh/bvh_node.cpp
@@ -15,12 +15,12 @@
  * limitations under the License.
  */
 
-#include "bvh.h"
-#include "bvh_build.h"
-#include "bvh_node.h"
+#include "bvh/bvh_node.h"
 
-#include "util_debug.h"
-#include "util_vector.h"
+#include "bvh/bvh.h"
+#include "bvh/bvh_build.h"
+
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -62,12 +62,12 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 			}
 			return cnt;
 		case BVH_STAT_ALIGNED_COUNT:
-			if(!is_unaligned()) {
+			if(!is_unaligned) {
 				cnt = 1;
 			}
 			break;
 		case BVH_STAT_UNALIGNED_COUNT:
-			if(is_unaligned()) {
+			if(is_unaligned) {
 				cnt = 1;
 			}
 			break;
@@ -75,7 +75,7 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 			if(!is_leaf()) {
 				bool has_unaligned = false;
 				for(int j = 0; j < num_children(); j++) {
-					has_unaligned |= get_child(j)->is_unaligned();
+					has_unaligned |= get_child(j)->is_unaligned;
 				}
 				cnt += has_unaligned? 0: 1;
 			}
@@ -84,7 +84,7 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 			if(!is_leaf()) {
 				bool has_unaligned = false;
 				for(int j = 0; j < num_children(); j++) {
-					has_unaligned |= get_child(j)->is_unaligned();
+					has_unaligned |= get_child(j)->is_unaligned;
 				}
 				cnt += has_unaligned? 1: 0;
 			}
@@ -95,12 +95,12 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 				for(int i = 0; i < num_children(); i++) {
 					BVHNode *node = get_child(i);
 					if(node->is_leaf()) {
-						has_unaligned |= node->is_unaligned();
+						has_unaligned |= node->is_unaligned;
 					}
 					else {
 						for(int j = 0; j < node->num_children(); j++) {
 							cnt += node->get_child(j)->getSubtreeSize(stat);
-							has_unaligned |= node->get_child(j)->is_unaligned();
+							has_unaligned |= node->get_child(j)->is_unaligned;
 						}
 					}
 				}
@@ -113,12 +113,12 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 				for(int i = 0; i < num_children(); i++) {
 					BVHNode *node = get_child(i);
 					if(node->is_leaf()) {
-						has_unaligned |= node->is_unaligned();
+						has_unaligned |= node->is_unaligned;
 					}
 					else {
 						for(int j = 0; j < node->num_children(); j++) {
 							cnt += node->get_child(j)->getSubtreeSize(stat);
-							has_unaligned |= node->get_child(j)->is_unaligned();
+							has_unaligned |= node->get_child(j)->is_unaligned;
 						}
 					}
 				}
@@ -126,11 +126,22 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 			}
 			return cnt;
 		case BVH_STAT_ALIGNED_LEAF_COUNT:
-			cnt = (is_leaf() && !is_unaligned()) ? 1 : 0;
+			cnt = (is_leaf() && !is_unaligned) ? 1 : 0;
 			break;
 		case BVH_STAT_UNALIGNED_LEAF_COUNT:
-			cnt = (is_leaf() && is_unaligned()) ? 1 : 0;
+			cnt = (is_leaf() && is_unaligned) ? 1 : 0;
 			break;
+		case BVH_STAT_DEPTH:
+			if(is_leaf()) {
+				cnt = 1;
+			}
+			else {
+				for(int i = 0; i < num_children(); i++) {
+					cnt = max(cnt, get_child(i)->getSubtreeSize(stat));
+				}
+				cnt += 1;
+			}
+			return cnt;
 		default:
 			assert(0); /* unknown mode */
 	}
@@ -157,7 +168,7 @@ float BVHNode::computeSubtreeSAHCost(const BVHParams& p, float probability) cons
 
 	for(int i = 0; i < num_children(); i++) {
 		BVHNode *child = get_child(i);
-		SAH += child->computeSubtreeSAHCost(p, probability * child->m_bounds.safe_area()/m_bounds.safe_area());
+		SAH += child->computeSubtreeSAHCost(p, probability * child->bounds.safe_area()/bounds.safe_area());
 	}
 
 	return SAH;
@@ -165,15 +176,15 @@ float BVHNode::computeSubtreeSAHCost(const BVHParams& p, float probability) cons
 
 uint BVHNode::update_visibility()
 {
-	if(!is_leaf() && m_visibility == 0) {
+	if(!is_leaf() && visibility == 0) {
 		InnerNode *inner = (InnerNode*)this;
 		BVHNode *child0 = inner->children[0];
 		BVHNode *child1 = inner->children[1];
 
-		m_visibility = child0->update_visibility()|child1->update_visibility();
+		visibility = child0->update_visibility()|child1->update_visibility();
 	}
 
-	return m_visibility;
+	return visibility;
 }
 
 void BVHNode::update_time()
@@ -184,8 +195,8 @@ void BVHNode::update_time()
 		BVHNode *child1 = inner->children[1];
 		child0->update_time();
 		child1->update_time();
-		m_time_from = min(child0->m_time_from, child1->m_time_from);
-		m_time_to =  max(child0->m_time_to, child1->m_time_to);
+		time_from = min(child0->time_from, child1->time_from);
+		time_to =  max(child0->time_to, child1->time_to);
 	}
 }
 
@@ -209,7 +220,7 @@ void LeafNode::print(int depth) const
 	for(int i = 0; i < depth; i++)
 		printf("  ");
 	
-	printf("leaf node %d to %d\n", m_lo, m_hi);
+	printf("leaf node %d to %d\n", lo, hi);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h
index 090c426de56..94cf5ab730c 100644
--- a/intern/cycles/bvh/bvh_node.h
+++ b/intern/cycles/bvh/bvh_node.h
@@ -18,9 +18,8 @@
 #ifndef __BVH_NODE_H__
 #define __BVH_NODE_H__
 
-#include "util_boundbox.h"
-#include "util_debug.h"
-#include "util_types.h"
+#include "util/util_boundbox.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -39,6 +38,7 @@ enum BVH_STAT {
 	BVH_STAT_UNALIGNED_INNER_QNODE_COUNT,
 	BVH_STAT_ALIGNED_LEAF_COUNT,
 	BVH_STAT_UNALIGNED_LEAF_COUNT,
+	BVH_STAT_DEPTH,
 };
 
 class BVHParams;
@@ -46,16 +46,16 @@ class BVHParams;
 class BVHNode
 {
 public:
-	BVHNode() : m_is_unaligned(false),
-	            m_aligned_space(NULL),
-	            m_time_from(0.0f),
-	            m_time_to(1.0f)
+	BVHNode() : is_unaligned(false),
+	            aligned_space(NULL),
+	            time_from(0.0f),
+	            time_to(1.0f)
 	{
 	}
 
 	virtual ~BVHNode()
 	{
-		delete m_aligned_space;
+		delete aligned_space;
 	}
 
 	virtual bool is_leaf() const = 0;
@@ -63,30 +63,26 @@ public:
 	virtual BVHNode *get_child(int i) const = 0;
 	virtual int num_triangles() const { return 0; }
 	virtual void print(int depth = 0) const = 0;
-	bool is_unaligned() const { return m_is_unaligned; }
 
 	inline void set_aligned_space(const Transform& aligned_space)
 	{
-		m_is_unaligned = true;
-		if(m_aligned_space == NULL) {
-			m_aligned_space = new Transform(aligned_space);
+		is_unaligned = true;
+		if(this->aligned_space == NULL) {
+			this->aligned_space = new Transform(aligned_space);
 		}
 		else {
-			*m_aligned_space = aligned_space;
+			*this->aligned_space = aligned_space;
 		}
 	}
 
 	inline Transform get_aligned_space() const
 	{
-		if(m_aligned_space == NULL) {
+		if(aligned_space == NULL) {
 			return transform_identity();
 		}
-		return *m_aligned_space;
+		return *aligned_space;
 	}
 
-	BoundBox m_bounds;
-	uint m_visibility;
-
 	// Subtree functions
 	int getSubtreeSize(BVH_STAT stat=BVH_STAT_NODE_COUNT) const;
 	float computeSubtreeSAHCost(const BVHParams& p, float probability = 1.0f) const;
@@ -95,13 +91,18 @@ public:
 	uint update_visibility();
 	void update_time();
 
-	bool m_is_unaligned;
+	// Properties.
+	BoundBox bounds;
+	uint visibility;
+
+	bool is_unaligned;
 
-	// TODO(sergey): Can be stored as 3x3 matrix, but better to have some
-	// utilities and type defines in util_transform first.
-	Transform *m_aligned_space;
+	/* TODO(sergey): Can be stored as 3x3 matrix, but better to have some
+	 * utilities and type defines in util_transform first.
+	 */
+	Transform *aligned_space;
 
-	float m_time_from, m_time_to;
+	float time_from, time_to;
 };
 
 class InnerNode : public BVHNode
@@ -111,20 +112,20 @@ public:
 	          BVHNode* child0,
 	          BVHNode* child1)
 	{
-		m_bounds = bounds;
+		this->bounds = bounds;
 		children[0] = child0;
 		children[1] = child1;
 
 		if(child0 && child1)
-			m_visibility = child0->m_visibility|child1->m_visibility;
+			visibility = child0->visibility|child1->visibility;
 		else
-			m_visibility = 0; /* happens on build cancel */
+			visibility = 0; /* happens on build cancel */
 	}
 
 	explicit InnerNode(const BoundBox& bounds)
 	{
-		m_bounds = bounds;
-		m_visibility = 0;
+		this->bounds = bounds;
+		visibility = 0;
 		children[0] = NULL;
 		children[1] = NULL;
 	}
@@ -140,12 +141,12 @@ public:
 class LeafNode : public BVHNode
 {
 public:
-	LeafNode(const BoundBox& bounds, uint visibility, int lo, int hi) 
+	LeafNode(const BoundBox& bounds, uint visibility, int lo, int hi)
+	: lo(lo),
+	  hi(hi)
 	{
-		m_bounds = bounds;
-		m_visibility = visibility;
-		m_lo = lo;
-		m_hi = hi;
+		this->bounds = bounds;
+		this->visibility = visibility;
 	}
 
 	LeafNode(const LeafNode& s)
@@ -157,14 +158,13 @@ public:
 	bool is_leaf() const { return true; }
 	int num_children() const { return 0; }
 	BVHNode *get_child(int) const { return NULL; }
-	int num_triangles() const { return m_hi - m_lo; }
+	int num_triangles() const { return hi - lo; }
 	void print(int depth) const;
 
-	int m_lo;
-	int m_hi;
+	int lo;
+	int hi;
 };
 
 CCL_NAMESPACE_END
 
 #endif /* __BVH_NODE_H__ */
-
diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h
index 7b309504728..89a379cf356 100644
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@@ -18,17 +18,35 @@
 #ifndef __BVH_PARAMS_H__
 #define __BVH_PARAMS_H__
 
-#include "util_boundbox.h"
+#include "util/util_boundbox.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
 CCL_NAMESPACE_BEGIN
 
+/* Layout of BVH tree.
+ *
+ * For example, how wide BVH tree is, in terms of number of children
+ * per node.
+ */
+typedef KernelBVHLayout BVHLayout;
+
+/* Named bitflag type to denote which BVH layouts are supported by a
+ * particular area.
+ *
+ * Bitflags are the BVH_LAYOUT_* values.
+ */
+typedef int BVHLayoutMask;
+
+/* Get human readable name of BVH layout. */
+const char *bvh_layout_name(BVHLayout layout);
+
 /* BVH Parameters */
 
 class BVHParams
 {
 public:
+
 	/* spatial split area threshold */
 	bool use_spatial_split;
 	float spatial_split_alpha;
@@ -50,8 +68,8 @@ public:
 	/* object or mesh level bvh */
 	bool top_level;
 
-	/* QBVH */
-	bool use_qbvh;
+	/* BVH layout to be built. */
+	BVHLayout bvh_layout;
 
 	/* Mask of primitives to be included into the BVH. */
 	int primitive_mask;
@@ -98,7 +116,7 @@ public:
 		max_motion_curve_leaf_size = 4;
 
 		top_level = false;
-		use_qbvh = false;
+		bvh_layout = BVH_LAYOUT_BVH2;
 		use_unaligned_nodes = false;
 
 		primitive_mask = PRIMITIVE_ALL;
@@ -119,6 +137,14 @@ public:
 
 	__forceinline bool small_enough_for_leaf(int size, int level)
 	{ return (size <= min_leaf_size || level >= MAX_DEPTH); }
+
+	/* Gets best matching BVH layout.
+	 *
+	 * If the requested layout is supported by the device, it will be used.
+	 * Otherwise, the widest supported layout below that will be used.
+	 */
+	static BVHLayout best_bvh_layout(BVHLayout requested_layout,
+	                                 BVHLayoutMask supported_layouts);
 };
 
 /* BVH Reference
@@ -246,4 +272,3 @@ struct BVHSpatialStorage {
 CCL_NAMESPACE_END
 
 #endif /* __BVH_PARAMS_H__ */
-
diff --git a/intern/cycles/bvh/bvh_sort.cpp b/intern/cycles/bvh/bvh_sort.cpp
index e5bcf9995bf..b40bf5bb21b 100644
--- a/intern/cycles/bvh/bvh_sort.cpp
+++ b/intern/cycles/bvh/bvh_sort.cpp
@@ -15,12 +15,12 @@
  * limitations under the License.
  */
 
-#include "bvh_build.h"
-#include "bvh_sort.h"
+#include "bvh/bvh_sort.h"
 
-#include "util_algorithm.h"
-#include "util_debug.h"
-#include "util_task.h"
+#include "bvh/bvh_build.h"
+
+#include "util/util_algorithm.h"
+#include "util/util_task.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/bvh/bvh_sort.h b/intern/cycles/bvh/bvh_sort.h
index b49ca02eb60..936401d8607 100644
--- a/intern/cycles/bvh/bvh_sort.h
+++ b/intern/cycles/bvh/bvh_sort.h
@@ -18,8 +18,11 @@
 #ifndef __BVH_SORT_H__
 #define __BVH_SORT_H__
 
+#include <cstddef>
+
 CCL_NAMESPACE_BEGIN
 
+class BVHReference;
 class BVHUnaligned;
 struct Transform;
 
@@ -33,4 +36,3 @@ void bvh_reference_sort(int start,
 CCL_NAMESPACE_END
 
 #endif /* __BVH_SORT_H__ */
-
diff --git a/intern/cycles/bvh/bvh_split.cpp b/intern/cycles/bvh/bvh_split.cpp
index d0d5fbe5a7a..c55ba40b565 100644
--- a/intern/cycles/bvh/bvh_split.cpp
+++ b/intern/cycles/bvh/bvh_split.cpp
@@ -15,14 +15,15 @@
  * limitations under the License.
  */
 
-#include "bvh_build.h"
-#include "bvh_split.h"
-#include "bvh_sort.h"
+#include "bvh/bvh_split.h"
 
-#include "mesh.h"
-#include "object.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_sort.h"
 
-#include "util_algorithm.h"
+#include "render/mesh.h"
+#include "render/object.h"
+
+#include "util/util_algorithm.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/bvh/bvh_split.h b/intern/cycles/bvh/bvh_split.h
index dbdb51f1a5b..a874a118b99 100644
--- a/intern/cycles/bvh/bvh_split.h
+++ b/intern/cycles/bvh/bvh_split.h
@@ -18,8 +18,8 @@
 #ifndef __BVH_SPLIT_H__
 #define __BVH_SPLIT_H__
 
-#include "bvh_build.h"
-#include "bvh_params.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_params.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/bvh/bvh_unaligned.cpp b/intern/cycles/bvh/bvh_unaligned.cpp
index a876c670914..910f82137c5 100644
--- a/intern/cycles/bvh/bvh_unaligned.cpp
+++ b/intern/cycles/bvh/bvh_unaligned.cpp
@@ -14,18 +14,16 @@
  * limitations under the License.
  */
 
+#include "bvh/bvh_unaligned.h"
 
-#include "bvh_unaligned.h"
+#include "render/mesh.h"
+#include "render/object.h"
 
-#include "mesh.h"
-#include "object.h"
-
-#include "bvh_binning.h"
+#include "bvh/bvh_binning.h"
 #include "bvh_params.h"
 
-#include "util_boundbox.h"
-#include "util_debug.h"
-#include "util_transform.h"
+#include "util/util_boundbox.h"
+#include "util/util_transform.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/bvh/bvh_unaligned.h b/intern/cycles/bvh/bvh_unaligned.h
index 4d0872f4a39..c3ece051cd5 100644
--- a/intern/cycles/bvh/bvh_unaligned.h
+++ b/intern/cycles/bvh/bvh_unaligned.h
@@ -17,7 +17,7 @@
 #ifndef __BVH_UNALIGNED_H__
 #define __BVH_UNALIGNED_H__
 
-#include "util_vector.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -78,4 +78,3 @@ protected:
 CCL_NAMESPACE_END
 
 #endif /* __BVH_UNALIGNED_H__ */
-
diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake
index 403a0540963..8d04025e6fd 100644
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -30,7 +30,7 @@ if(NOT CYCLES_STANDALONE_REPOSITORY)
 	set(GLEW_INCLUDE_DIR "${GLEW_INCLUDE_PATH}")
 endif()
 
-if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI)
+if(WITH_CYCLES_STANDALONE)
 	set(CYCLES_APP_GLEW_LIBRARY ${BLENDER_GLEW_LIBRARIES})
 endif()
 
@@ -135,13 +135,5 @@ if(CYCLES_STANDALONE_REPOSITORY)
 
 	unset(_lib_DIR)
 else()
-	if(WIN32)
-		set(GLOG_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/glog/src/windows)
-		set(GFLAGS_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/gflags/src)
-	else()
-		set(GLOG_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/glog/src)
-		set(GFLAGS_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/gflags/src)
-	endif()
-	set(GFLAGS_NAMESPACE "gflags")
 	set(LLVM_LIBRARIES ${LLVM_LIBRARY})
 endif()
diff --git a/intern/cycles/cmake/macros.cmake b/intern/cycles/cmake/macros.cmake
new file mode 100644
index 00000000000..f3ca06ac6b8
--- /dev/null
+++ b/intern/cycles/cmake/macros.cmake
@@ -0,0 +1,19 @@
+# Place `target` in an IDE folder named after the parent of the current
+# source directory, with the CMAKE_SOURCE_DIR prefix stripped.
+#
+# Does nothing unless WINDOWS_USE_VISUAL_STUDIO_FOLDERS is enabled.
+function(cycles_set_solution_folder target)
+	if(WINDOWS_USE_VISUAL_STUDIO_FOLDERS)
+		get_filename_component(folderdir "${CMAKE_CURRENT_SOURCE_DIR}" DIRECTORY)
+		# Quoted so that empty or list-like paths stay a single argument.
+		string(REPLACE "${CMAKE_SOURCE_DIR}" "" folderdir "${folderdir}")
+		set_target_properties(${target} PROPERTIES FOLDER "${folderdir}")
+	endif()
+endfunction()
+
+# Wrapper around add_library() which also assigns the solution folder.
+# All arguments after `target` are forwarded to add_library() unchanged.
+function(cycles_add_library target)
+	add_library(${target} ${ARGN})
+	cycles_set_solution_folder(${target})
+endfunction()
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 966ff5e52ba..75e78e038ea 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -1,12 +1,6 @@
 
 set(INC
-	.
-	../graph
-	../kernel
-	../kernel/svm
-	../kernel/osl
-	../util
-	../render
+	..
 	../../glew-mx
 )
 
@@ -31,18 +25,23 @@ set(SRC
 	device.cpp
 	device_cpu.cpp
 	device_cuda.cpp
+	device_denoising.cpp
+	device_memory.cpp
 	device_multi.cpp
 	device_opencl.cpp
+	device_split_kernel.cpp
 	device_task.cpp
 )
 
 set(SRC_OPENCL
 	opencl/opencl.h
+	opencl/memory_manager.h
 
 	opencl/opencl_base.cpp
 	opencl/opencl_mega.cpp
 	opencl/opencl_split.cpp
 	opencl/opencl_util.cpp
+	opencl/memory_manager.cpp
 )
 
 if(WITH_CYCLES_NETWORK)
@@ -53,9 +52,11 @@ endif()
 
 set(SRC_HEADERS
 	device.h
+	device_denoising.h
 	device_memory.h
 	device_intern.h
 	device_network.h
+	device_split_kernel.h
 	device_task.h
 )
 
@@ -76,4 +77,4 @@ endif()
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
-add_library(cycles_device ${SRC} ${SRC_OPENCL} ${SRC_HEADERS})
+cycles_add_library(cycles_device ${SRC} ${SRC_OPENCL} ${SRC_HEADERS})
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 31c99f49d6d..6959dd73c32 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -17,23 +17,25 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "device.h"
-#include "device_intern.h"
-
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_half.h"
-#include "util_math.h"
-#include "util_opengl.h"
-#include "util_time.h"
-#include "util_types.h"
-#include "util_vector.h"
-#include "util_string.h"
+#include "device/device.h"
+#include "device/device_intern.h"
+
+#include "util/util_foreach.h"
+#include "util/util_half.h"
+#include "util/util_logging.h"
+#include "util/util_math.h"
+#include "util/util_opengl.h"
+#include "util/util_time.h"
+#include "util/util_system.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
+#include "util/util_string.h"
 
 CCL_NAMESPACE_BEGIN
 
 bool Device::need_types_update = true;
 bool Device::need_devices_update = true;
+thread_mutex Device::device_mutex;
 vector<DeviceType> Device::types;
 vector<DeviceInfo> Device::devices;
 
@@ -44,15 +46,14 @@ std::ostream& operator <<(std::ostream &os,
 {
 	os << "Experimental features: "
 	   << (requested_features.experimental ? "On" : "Off") << std::endl;
-	os << "Max closure count: " << requested_features.max_closure << std::endl;
 	os << "Max nodes group: " << requested_features.max_nodes_group << std::endl;
 	/* TODO(sergey): Decode bitflag into list of names. */
 	os << "Nodes features: " << requested_features.nodes_features << std::endl;
-	os << "Use hair: "
+	os << "Use Hair: "
 	   << string_from_bool(requested_features.use_hair) << std::endl;
-	os << "Use object motion: "
+	os << "Use Object Motion: "
 	   << string_from_bool(requested_features.use_object_motion) << std::endl;
-	os << "Use camera motion: "
+	os << "Use Camera Motion: "
 	   << string_from_bool(requested_features.use_camera_motion) << std::endl;
 	os << "Use Baking: "
 	   << string_from_bool(requested_features.use_baking) << std::endl;
@@ -66,6 +67,10 @@ std::ostream& operator <<(std::ostream &os,
 	   << string_from_bool(requested_features.use_patch_evaluation) << std::endl;
 	os << "Use Transparent Shadows: "
 	   << string_from_bool(requested_features.use_transparent) << std::endl;
+	os << "Use Principled BSDF: "
+	   << string_from_bool(requested_features.use_principled) << std::endl;
+	os << "Use Denoising: "
+	   << string_from_bool(requested_features.use_denoising) << std::endl;
 	return os;
 }
 
@@ -78,28 +83,12 @@ Device::~Device()
 	}
 }
 
-void Device::pixels_alloc(device_memory& mem)
-{
-	mem_alloc(mem, MEM_READ_WRITE);
-}
-
-void Device::pixels_copy_from(device_memory& mem, int y, int w, int h)
-{
-	if(mem.data_type == TYPE_HALF)
-		mem_copy_from(mem, y, w, h, sizeof(half4));
-	else
-		mem_copy_from(mem, y, w, h, sizeof(uchar4));
-}
-
-void Device::pixels_free(device_memory& mem)
-{
-	mem_free(mem);
-}
-
 void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dx, int dy, int width, int height, bool transparent,
 	const DeviceDrawParams &draw_params)
 {
-	pixels_copy_from(rgba, y, w, h);
+	assert(rgba.type == MEM_PIXELS);
+
+	mem_copy_from(rgba, y, w, h, rgba.memory_elements_size(1));
 
 	if(transparent) {
 		glEnable(GL_BLEND);
@@ -111,17 +100,17 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dx, int d
 	if(rgba.data_type == TYPE_HALF) {
 		/* for multi devices, this assumes the inefficient method that we allocate
 		 * all pixels on the device even though we only render to a subset */
-		GLhalf *data_pointer = (GLhalf*)rgba.data_pointer;
+		GLhalf *host_pointer = (GLhalf*)rgba.host_pointer;
 		float vbuffer[16], *basep;
 		float *vp = NULL;
 
-		data_pointer += 4*y*w;
+		host_pointer += 4*y*w;
 
 		/* draw half float texture, GLSL shader for display transform assumed to be bound */
 		GLuint texid;
 		glGenTextures(1, &texid);
 		glBindTexture(GL_TEXTURE_2D, texid);
-		glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, w, h, 0, GL_RGBA, GL_HALF_FLOAT, data_pointer);
+		glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, w, h, 0, GL_RGBA, GL_HALF_FLOAT, host_pointer);
 		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
 		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
 
@@ -203,7 +192,7 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dx, int d
 		glPixelZoom((float)width/(float)w, (float)height/(float)h);
 		glRasterPos2f(dx, dy);
 
-		uint8_t *pixels = (uint8_t*)rgba.data_pointer;
+		uint8_t *pixels = (uint8_t*)rgba.host_pointer;
 
 		pixels += 4*y*w;
 
@@ -292,53 +281,49 @@ string Device::string_from_type(DeviceType type)
 
 vector<DeviceType>& Device::available_types()
 {
+	thread_scoped_lock lock(device_mutex);
 	if(need_types_update) {
 		types.clear();
 		types.push_back(DEVICE_CPU);
-
 #ifdef WITH_CUDA
-		if(device_cuda_init())
+		if(device_cuda_init()) {
 			types.push_back(DEVICE_CUDA);
+		}
 #endif
-
 #ifdef WITH_OPENCL
-		if(device_opencl_init())
+		if(device_opencl_init()) {
 			types.push_back(DEVICE_OPENCL);
+		}
 #endif
-
 #ifdef WITH_NETWORK
 		types.push_back(DEVICE_NETWORK);
 #endif
-
 		need_types_update = false;
 	}
-
 	return types;
 }
 
 vector<DeviceInfo>& Device::available_devices()
 {
+	thread_scoped_lock lock(device_mutex);
 	if(need_devices_update) {
 		devices.clear();
-#ifdef WITH_CUDA
-		if(device_cuda_init())
-			device_cuda_info(devices);
-#endif
-
 #ifdef WITH_OPENCL
-		if(device_opencl_init())
+		if(device_opencl_init()) {
 			device_opencl_info(devices);
+		}
+#endif
+#ifdef WITH_CUDA
+		if(device_cuda_init()) {
+			device_cuda_info(devices);
+		}
 #endif
-
 		device_cpu_info(devices);
-
 #ifdef WITH_NETWORK
 		device_network_info(devices);
 #endif
-
 		need_devices_update = false;
 	}
-
 	return devices;
 }
 
@@ -346,12 +331,6 @@ string Device::device_capabilities()
 {
 	string capabilities = "CPU device capabilities: ";
 	capabilities += device_cpu_capabilities() + "\n";
-#ifdef WITH_CUDA
-	if(device_cuda_init()) {
-		capabilities += "\nCUDA device capabilities:\n";
-		capabilities += device_cuda_capabilities();
-	}
-#endif
 
 #ifdef WITH_OPENCL
 	if(device_opencl_init()) {
@@ -360,10 +339,17 @@ string Device::device_capabilities()
 	}
 #endif
 
+#ifdef WITH_CUDA
+	if(device_cuda_init()) {
+		capabilities += "\nCUDA device capabilities:\n";
+		capabilities += device_cuda_capabilities();
+	}
+#endif
+
 	return capabilities;
 }
 
-DeviceInfo Device::get_multi_device(vector<DeviceInfo> subdevices)
+DeviceInfo Device::get_multi_device(const vector<DeviceInfo>& subdevices, int threads, bool background)
 {
 	assert(subdevices.size() > 1);
 
@@ -371,16 +357,47 @@ DeviceInfo Device::get_multi_device(vector<DeviceInfo> subdevices)
 	info.type = DEVICE_MULTI;
 	info.id = "MULTI";
 	info.description = "Multi Device";
-	info.multi_devices = subdevices;
 	info.num = 0;
 
-	info.has_bindless_textures = true;
-	info.pack_images = false;
-	foreach(DeviceInfo &device, subdevices) {
-		assert(device.type == info.multi_devices[0].type);
+	info.has_half_images = true;
+	info.has_volume_decoupled = true;
+	info.bvh_layout_mask = BVH_LAYOUT_ALL;
+	info.has_osl = true;
+
+	foreach(const DeviceInfo &device, subdevices) {
+		/* Ensure CPU device does not slow down GPU: leave one CPU thread free per other sub-device. */
+		if(device.type == DEVICE_CPU && subdevices.size() > 1) {
+			if(background) {
+				int orig_cpu_threads = (threads)? threads: system_cpu_thread_count();
+				int cpu_threads = max(orig_cpu_threads - ((int)subdevices.size() - 1), 0);  /* Signed math, so the clamp to 0 is well defined. */
+
+				VLOG(1) << "CPU render threads reduced from "
+						<< orig_cpu_threads << " to " << cpu_threads
+						<< ", to dedicate to GPU.";
+
+				if(cpu_threads >= 1) {
+					DeviceInfo cpu_device = device;
+					cpu_device.cpu_threads = cpu_threads;
+					info.multi_devices.push_back(cpu_device);
+				}
+				else {
+					continue;
+				}
+			}
+			else {
+				VLOG(1) << "CPU render threads disabled for interactive render.";
+				continue;
+			}
+		}
+		else {
+			info.multi_devices.push_back(device);
+		}
 
-		info.pack_images |= device.pack_images;
-		info.has_bindless_textures &= device.has_bindless_textures;
+		/* Accumulate device info. */
+		info.has_half_images &= device.has_half_images;
+		info.has_volume_decoupled &= device.has_volume_decoupled;
+		info.bvh_layout_mask = device.bvh_layout_mask & info.bvh_layout_mask;
+		info.has_osl &= device.has_osl;
 	}
 
 	return info;
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index ccee25ae34e..b856bdd9d01 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -19,15 +19,18 @@
 
 #include <stdlib.h>
 
-#include "device_memory.h"
-#include "device_task.h"
+#include "bvh/bvh_params.h"
 
-#include "util_list.h"
-#include "util_stats.h"
-#include "util_string.h"
-#include "util_thread.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "device/device_memory.h"
+#include "device/device_task.h"
+
+#include "util/util_list.h"
+#include "util/util_stats.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
+#include "util/util_texture.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -51,11 +54,14 @@ public:
 	string description;
 	string id; /* used for user preferences, should stay fixed with changing hardware config */
 	int num;
-	bool display_device;
-	bool advanced_shading;
-	bool pack_images;
-	bool has_bindless_textures; /* flag for GPU and Multi device */
-	bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */
+	bool display_device;            /* GPU is used as a display device. */
+	bool advanced_shading;          /* Supports full shading system. */
+	bool has_half_images;           /* Support half-float textures. */
+	bool has_volume_decoupled;      /* Decoupled volume shading. */
+	BVHLayoutMask bvh_layout_mask;  /* Bitmask of supported BVH layouts. */
+	bool has_osl;                   /* Support Open Shading Language. */
+	bool use_split_kernel;          /* Use split or mega kernel. */
+	int cpu_threads;                /* CPU thread count; 0 = CPU device uses TaskScheduler::num_threads(). */
 	vector<DeviceInfo> multi_devices;
 
 	DeviceInfo()
@@ -63,10 +69,13 @@ public:
 		type = DEVICE_CPU;
 		id = "CPU";
 		num = 0;
+		cpu_threads = 0;
 		display_device = false;
 		advanced_shading = true;
-		pack_images = false;
-		has_bindless_textures = false;
+		has_half_images = false;
+		has_volume_decoupled = false;
+		bvh_layout_mask = BVH_LAYOUT_NONE;
+		has_osl = false;
 		use_split_kernel = false;
 	}
 
@@ -82,9 +91,6 @@ public:
 	/* Use experimental feature set. */
 	bool experimental;
 
-	/* Maximum number of closures in shader trees. */
-	int max_closure;
-
 	/* Selective nodes compilation. */
 
 	/* Identifier of a node group up to which all the nodes needs to be
@@ -121,11 +127,22 @@ public:
 	/* Use Transparent shadows */
 	bool use_transparent;
 
+	/* Use various shadow tricks, such as shadow catcher. */
+	bool use_shadow_tricks;
+
+	/* Per-uber shader usage flags. */
+	bool use_principled;
+
+	/* Denoising features. */
+	bool use_denoising;
+
+	/* Use raytracing in shaders. */
+	bool use_shader_raytrace;
+
 	DeviceRequestedFeatures()
 	{
 		/* TODO(sergey): Find more meaningful defaults. */
 		experimental = false;
-		max_closure = 0;
 		max_nodes_group = 0;
 		nodes_features = 0;
 		use_hair = false;
@@ -137,12 +154,15 @@ public:
 		use_integrator_branched = false;
 		use_patch_evaluation = false;
 		use_transparent = false;
+		use_shadow_tricks = false;
+		use_principled = false;
+		use_denoising = false;
+		use_shader_raytrace = false;
 	}
 
 	bool modified(const DeviceRequestedFeatures& requested_features)
 	{
 		return !(experimental == requested_features.experimental &&
-		         max_closure == requested_features.max_closure &&
 		         max_nodes_group == requested_features.max_nodes_group &&
 		         nodes_features == requested_features.nodes_features &&
 		         use_hair == requested_features.use_hair &&
@@ -153,7 +173,11 @@ public:
 		         use_volume == requested_features.use_volume &&
 		         use_integrator_branched == requested_features.use_integrator_branched &&
 		         use_patch_evaluation == requested_features.use_patch_evaluation &&
-		         use_transparent == requested_features.use_transparent);
+		         use_transparent == requested_features.use_transparent &&
+		         use_shadow_tricks == requested_features.use_shadow_tricks &&
+		         use_principled == requested_features.use_principled &&
+		         use_denoising == requested_features.use_denoising &&
+		         use_shader_raytrace == requested_features.use_shader_raytrace);
 	}
 
 	/* Convert the requested features structure to a build options,
@@ -169,7 +193,6 @@ public:
 			string_printf("%d", max_nodes_group);
 		build_options += " -D__NODES_FEATURES__=" +
 			string_printf("%d", nodes_features);
-		build_options += string_printf(" -D__MAX_CLOSURE__=%d", max_closure);
 		if(!use_hair) {
 			build_options += " -D__NO_HAIR__";
 		}
@@ -194,9 +217,21 @@ public:
 		if(!use_patch_evaluation) {
 			build_options += " -D__NO_PATCH_EVAL__";
 		}
-		if(!use_transparent) {
+		if(!use_transparent && !use_volume) {
 			build_options += " -D__NO_TRANSPARENT__";
 		}
+		if(!use_shadow_tricks) {
+			build_options += " -D__NO_SHADOW_TRICKS__";
+		}
+		if(!use_principled) {
+			build_options += " -D__NO_PRINCIPLED__";
+		}
+		if(!use_denoising) {
+			build_options += " -D__NO_DENOISING__";
+		}
+		if(!use_shader_raytrace) {
+			build_options += " -D__NO_SHADER_RAYTRACE__";
+		}
 		return build_options;
 	}
 };
@@ -212,6 +247,7 @@ struct DeviceDrawParams {
 };
 
 class Device {
+	friend class device_sub_ptr;
 protected:
 	Device(DeviceInfo& info_, Stats &stats_, bool background) : background(background), vertex_buffer(0), info(info_), stats(stats_) {}
 
@@ -221,6 +257,14 @@ protected:
 	/* used for real time display */
 	unsigned int vertex_buffer;
 
+	virtual device_ptr mem_alloc_sub_ptr(device_memory& /*mem*/, int /*offset*/, int /*size*/)
+	{
+		/* Only required for devices that implement denoising. */
+		assert(false);
+		return (device_ptr) 0;
+	}
+	virtual void mem_free_sub_ptr(device_ptr /*ptr*/) {};
+
 public:
 	virtual ~Device();
 
@@ -228,39 +272,25 @@ public:
 	DeviceInfo info;
 	virtual const string& error_message() { return error_msg; }
 	bool have_error() { return !error_message().empty(); }
+	virtual void set_error(const string& error)  /* Record a device error and print it to stderr. */
+	{
+		if(!have_error()) {  /* Only the first error is stored; later ones are printed but not kept. */
+			error_msg = error;
+		}
+		fprintf(stderr, "%s\n", error.c_str());
+		fflush(stderr);  /* Flush right away rather than leaving the message buffered. */
+	}
 	virtual bool show_samples() const { return false; }
 
 	/* statistics */
 	Stats &stats;
 
-	/* regular memory */
-	virtual void mem_alloc(device_memory& mem, MemoryType type) = 0;
-	virtual void mem_copy_to(device_memory& mem) = 0;
-	virtual void mem_copy_from(device_memory& mem,
-		int y, int w, int h, int elem) = 0;
-	virtual void mem_zero(device_memory& mem) = 0;
-	virtual void mem_free(device_memory& mem) = 0;
+	/* memory alignment */
+	virtual int mem_sub_ptr_alignment() { return MIN_ALIGNMENT_CPU_DATA_TYPES; }
 
 	/* constant memory */
 	virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
 
-	/* texture memory */
-	virtual void tex_alloc(const char * /*name*/,
-	                       device_memory& /*mem*/,
-	                       InterpolationType interpolation = INTERPOLATION_NONE,
-	                       ExtensionType extension = EXTENSION_REPEAT)
-	{
-		(void)interpolation;  /* Ignored. */
-		(void)extension;  /* Ignored. */
-	};
-
-	virtual void tex_free(device_memory& /*mem*/) {};
-
-	/* pixel memory */
-	virtual void pixels_alloc(device_memory& mem);
-	virtual void pixels_copy_from(device_memory& mem, int y, int w, int h);
-	virtual void pixels_free(device_memory& mem);
-
 	/* open shading language, only for CPU device */
 	virtual void *osl_memory() { return NULL; }
 
@@ -288,6 +318,8 @@ public:
 	/* multi device */
 	virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {}
 	virtual int device_number(Device * /*sub_device*/) { return 0; }
+	virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
+	virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
 
 	/* static */
 	static Device *create(DeviceInfo& info, Stats &stats, bool background = true);
@@ -297,15 +329,32 @@ public:
 	static vector<DeviceType>& available_types();
 	static vector<DeviceInfo>& available_devices();
 	static string device_capabilities();
-	static DeviceInfo get_multi_device(vector<DeviceInfo> subdevices);
+	static DeviceInfo get_multi_device(const vector<DeviceInfo>& subdevices,
+	                                   int threads,
+	                                   bool background);
 
 	/* Tag devices lists for update. */
 	static void tag_update();
 
 	static void free_memory();
+
+protected:
+	/* Memory allocation, only accessed through device_memory. */
+	friend class MultiDevice;
+	friend class DeviceServer;
+	friend class device_memory;
+
+	virtual void mem_alloc(device_memory& mem) = 0;
+	virtual void mem_copy_to(device_memory& mem) = 0;
+	virtual void mem_copy_from(device_memory& mem,
+		int y, int w, int h, int elem) = 0;
+	virtual void mem_zero(device_memory& mem) = 0;
+	virtual void mem_free(device_memory& mem) = 0;
+
 private:
 	/* Indicted whether device types and devices lists were initialized. */
 	static bool need_types_update, need_devices_update;
+	static thread_mutex device_mutex;
 	static vector<DeviceType> types;
 	static vector<DeviceInfo> devices;
 };
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index c8e001ec2fd..6be60f8bbb6 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -20,113 +20,314 @@
 /* So ImathMath is included before our kernel_cpu_compat. */
 #ifdef WITH_OSL
 /* So no context pollution happens from indirectly included windows.h */
-#  include "util_windows.h"
+#  include "util/util_windows.h"
 #  include <OSL/oslexec.h>
 #endif
 
-#include "device.h"
-#include "device_intern.h"
+#include "device/device.h"
+#include "device/device_denoising.h"
+#include "device/device_intern.h"
+#include "device/device_split_kernel.h"
 
-#include "kernel.h"
-#include "kernel_compat_cpu.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
+#include "kernel/kernel.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/kernel_types.h"
+#include "kernel/split/kernel_split_data.h"
+#include "kernel/kernel_globals.h"
 
-#include "osl_shader.h"
-#include "osl_globals.h"
+#include "kernel/filter/filter.h"
 
-#include "buffers.h"
+#include "kernel/osl/osl_shader.h"
+#include "kernel/osl/osl_globals.h"
 
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_function.h"
-#include "util_logging.h"
-#include "util_opengl.h"
-#include "util_progress.h"
-#include "util_system.h"
-#include "util_thread.h"
+#include "render/buffers.h"
+
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_opengl.h"
+#include "util/util_optimization.h"
+#include "util/util_progress.h"
+#include "util/util_system.h"
+#include "util/util_thread.h"
 
 CCL_NAMESPACE_BEGIN
 
-class CPUDevice : public Device
-{
-public:
-	TaskPool task_pool;
-	KernelGlobals kernel_globals;
+class CPUDevice;
 
-#ifdef WITH_OSL
-	OSLGlobals osl_globals;
-#endif
-	
-	CPUDevice(DeviceInfo& info, Stats &stats, bool background)
-	: Device(info, stats, background)
+/* Has to be outside of the class to be shared across template instantiations. */
+static const char *logged_architecture = "";
+
+template<typename F>
+class KernelFunctions {
+public:
+	KernelFunctions()
 	{
-#ifdef WITH_OSL
-		kernel_globals.osl = &osl_globals;
-#endif
+		kernel = (F)NULL;
+	}
 
-		/* do now to avoid thread issues */
-		system_cpu_support_sse2();
-		system_cpu_support_sse3();
-		system_cpu_support_sse41();
-		system_cpu_support_avx();
-		system_cpu_support_avx2();
+	KernelFunctions(F kernel_default,
+	                F kernel_sse2,
+	                F kernel_sse3,
+	                F kernel_sse41,
+	                F kernel_avx,
+	                F kernel_avx2)
+	{
+		const char *architecture_name = "default";
+		kernel = kernel_default;
 
+		/* Silence potential warnings about unused variables
+		 * when compiling without some architectures. */
+		(void)kernel_sse2;
+		(void)kernel_sse3;
+		(void)kernel_sse41;
+		(void)kernel_avx;
+		(void)kernel_avx2;
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-		if(system_cpu_support_avx2()) {
-			VLOG(1) << "Will be using AVX2 kernels.";
+		if(DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
+			architecture_name = "AVX2";
+			kernel = kernel_avx2;
 		}
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-		if(system_cpu_support_avx()) {
-			VLOG(1) << "Will be using AVX kernels.";
+		if(DebugFlags().cpu.has_avx() && system_cpu_support_avx()) {
+			architecture_name = "AVX";
+			kernel = kernel_avx;
 		}
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-		if(system_cpu_support_sse41()) {
-			VLOG(1) << "Will be using SSE4.1 kernels.";
+		if(DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
+			architecture_name = "SSE4.1";
+			kernel = kernel_sse41;
 		}
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-		if(system_cpu_support_sse3()) {
-			VLOG(1) << "Will be using SSE3kernels.";
+		if(DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) {
+			architecture_name = "SSE3";
+			kernel = kernel_sse3;
 		}
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-		if(system_cpu_support_sse2()) {
-			VLOG(1) << "Will be using SSE2 kernels.";
+		if(DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
+			architecture_name = "SSE2";
+			kernel = kernel_sse2;
 		}
-		else
 #endif
-		{
-			VLOG(1) << "Will be using regular kernels.";
+
+		if(strcmp(architecture_name, logged_architecture) != 0) {
+			VLOG(1) << "Will be using " << architecture_name << " kernels.";
+			logged_architecture = architecture_name;
 		}
 	}
 
+	inline F operator()() const {
+		assert(kernel);
+		return kernel;
+	}
+protected:
+	F kernel;
+};
+
+class CPUSplitKernel : public DeviceSplitKernel {
+	CPUDevice *device;
+public:
+	explicit CPUSplitKernel(CPUDevice *device);
+
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& kernel_data_,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flag,
+	                                            device_memory& work_pool_wgs);
+
+	virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
+	                                                       const DeviceRequestedFeatures&);
+	virtual int2 split_kernel_local_size();
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+};
+
+class CPUDevice : public Device
+{
+public:
+	TaskPool task_pool;
+	KernelGlobals kernel_globals;
+
+	device_vector<TextureInfo> texture_info;
+	bool need_texture_info;
+
+#ifdef WITH_OSL
+	OSLGlobals osl_globals;
+#endif
+
+	bool use_split_kernel;
+
+	DeviceRequestedFeatures requested_features;
+
+	KernelFunctions<void(*)(KernelGlobals *, float *, int, int, int, int, int)>             path_trace_kernel;
+	KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel;
+	KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
+	KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)>   shader_kernel;
+
+	KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel;
+	KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int)>               filter_get_feature_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_detect_outliers_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_combine_halves_kernel;
+
+	KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
+	KernelFunctions<void(*)(float*, float*, int*, int, int)>                                 filter_nlm_blur_kernel;
+	KernelFunctions<void(*)(float*, float*, int*, int, int)>                                 filter_nlm_calc_weight_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)>       filter_nlm_update_output_kernel;
+	KernelFunctions<void(*)(float*, float*, int*, int)>                                      filter_nlm_normalize_kernel;
+
+	KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)>                         filter_construct_transform_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
+	KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)>                            filter_finalize_kernel;
+
+	KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
+	                       int, int, int, int, int, int, int, int, ccl_global int*, int,
+	                       ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)>        data_init_kernel;
+	unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels;
+
+#define KERNEL_FUNCTIONS(name) \
+	      KERNEL_NAME_EVAL(cpu, name), \
+	      KERNEL_NAME_EVAL(cpu_sse2, name), \
+	      KERNEL_NAME_EVAL(cpu_sse3, name), \
+	      KERNEL_NAME_EVAL(cpu_sse41, name), \
+	      KERNEL_NAME_EVAL(cpu_avx, name), \
+	      KERNEL_NAME_EVAL(cpu_avx2, name)
+
+	CPUDevice(DeviceInfo& info_, Stats &stats_, bool background_)
+	: Device(info_, stats_, background_),
+	  texture_info(this, "__texture_info", MEM_TEXTURE),
+#define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name))
+	  REGISTER_KERNEL(path_trace),
+	  REGISTER_KERNEL(convert_to_half_float),
+	  REGISTER_KERNEL(convert_to_byte),
+	  REGISTER_KERNEL(shader),
+	  REGISTER_KERNEL(filter_divide_shadow),
+	  REGISTER_KERNEL(filter_get_feature),
+	  REGISTER_KERNEL(filter_detect_outliers),
+	  REGISTER_KERNEL(filter_combine_halves),
+	  REGISTER_KERNEL(filter_nlm_calc_difference),
+	  REGISTER_KERNEL(filter_nlm_blur),
+	  REGISTER_KERNEL(filter_nlm_calc_weight),
+	  REGISTER_KERNEL(filter_nlm_update_output),
+	  REGISTER_KERNEL(filter_nlm_normalize),
+	  REGISTER_KERNEL(filter_construct_transform),
+	  REGISTER_KERNEL(filter_nlm_construct_gramian),
+	  REGISTER_KERNEL(filter_finalize),
+	  REGISTER_KERNEL(data_init)
+#undef REGISTER_KERNEL
+	{
+		if(info.cpu_threads == 0) {
+			info.cpu_threads = TaskScheduler::num_threads();
+		}
+
+#ifdef WITH_OSL
+		kernel_globals.osl = &osl_globals;
+#endif
+		use_split_kernel = DebugFlags().cpu.split_kernel;
+		if(use_split_kernel) {
+			VLOG(1) << "Will be using split kernel.";
+		}
+		need_texture_info = false;
+
+#define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name))
+		REGISTER_SPLIT_KERNEL(path_init);
+		REGISTER_SPLIT_KERNEL(scene_intersect);
+		REGISTER_SPLIT_KERNEL(lamp_emission);
+		REGISTER_SPLIT_KERNEL(do_volume);
+		REGISTER_SPLIT_KERNEL(queue_enqueue);
+		REGISTER_SPLIT_KERNEL(indirect_background);
+		REGISTER_SPLIT_KERNEL(shader_setup);
+		REGISTER_SPLIT_KERNEL(shader_sort);
+		REGISTER_SPLIT_KERNEL(shader_eval);
+		REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
+		REGISTER_SPLIT_KERNEL(subsurface_scatter);
+		REGISTER_SPLIT_KERNEL(direct_lighting);
+		REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
+		REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
+		REGISTER_SPLIT_KERNEL(enqueue_inactive);
+		REGISTER_SPLIT_KERNEL(next_iteration_setup);
+		REGISTER_SPLIT_KERNEL(indirect_subsurface);
+		REGISTER_SPLIT_KERNEL(buffer_update);
+#undef REGISTER_SPLIT_KERNEL
+#undef KERNEL_FUNCTIONS
+	}
+
 	~CPUDevice()
 	{
 		task_pool.stop();
+		texture_info.free();
 	}
 
 	virtual bool show_samples() const
 	{
-		return (TaskScheduler::num_threads() == 1);
+		return (info.cpu_threads == 1);
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType /*type*/)
+	void load_texture_info()
 	{
-		mem.device_pointer = mem.data_pointer;
-		mem.device_size = mem.memory_size();
-		stats.mem_alloc(mem.device_size);
+		if(need_texture_info) {
+			texture_info.copy_to_device();
+			need_texture_info = false;
+		}
 	}
 
-	void mem_copy_to(device_memory& /*mem*/)
+	void mem_alloc(device_memory& mem)
 	{
-		/* no-op */
+		if(mem.type == MEM_TEXTURE) {
+			assert(!"mem_alloc not supported for textures.");
+		}
+		else {
+			if(mem.name) {
+				VLOG(1) << "Buffer allocate: " << mem.name << ", "
+						<< string_human_readable_number(mem.memory_size()) << " bytes. ("
+						<< string_human_readable_size(mem.memory_size()) << ")";
+			}
+
+			if(mem.type == MEM_DEVICE_ONLY) {
+				assert(!mem.host_pointer);
+				size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
+				void *data = util_aligned_malloc(mem.memory_size(), alignment);
+				mem.device_pointer = (device_ptr)data;
+			}
+			else {
+				mem.device_pointer = (device_ptr)mem.host_pointer;
+			}
+
+			mem.device_size = mem.memory_size();
+			stats.mem_alloc(mem.device_size);
+		}
+	}
+
+	void mem_copy_to(device_memory& mem)
+	{
+		if(mem.type == MEM_TEXTURE) {
+			tex_free(mem);
+			tex_alloc(mem);
+		}
+		else if(mem.type == MEM_PIXELS) {
+			assert(!"mem_copy_to not supported for pixels.");
+		}
+		else {
+			if(!mem.device_pointer) {
+				mem_alloc(mem);
+			}
+
+			/* copy is no-op */
+		}
 	}
 
 	void mem_copy_from(device_memory& /*mem*/,
@@ -138,40 +339,83 @@ public:
 
 	void mem_zero(device_memory& mem)
 	{
-		memset((void*)mem.device_pointer, 0, mem.memory_size());
+		if(!mem.device_pointer) {
+			mem_alloc(mem);
+		}
+
+		if(mem.device_pointer) {
+			memset((void*)mem.device_pointer, 0, mem.memory_size());
+		}
 	}
 
 	void mem_free(device_memory& mem)
 	{
-		if(mem.device_pointer) {
+		if(mem.type == MEM_TEXTURE) {
+			tex_free(mem);
+		}
+		else if(mem.device_pointer) {
+			if(mem.type == MEM_DEVICE_ONLY) {
+				util_aligned_free((void*)mem.device_pointer);
+			}
 			mem.device_pointer = 0;
 			stats.mem_free(mem.device_size);
 			mem.device_size = 0;
 		}
 	}
 
+	virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/)
+	{  /* Nothing is allocated: the result points into mem's buffer, memory_elements_size(offset) bytes past its start. */
+		return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
+	}
+
 	void const_copy_to(const char *name, void *host, size_t size)
 	{
 		kernel_const_copy(&kernel_globals, name, host, size);
 	}
 
-	void tex_alloc(const char *name,
-	               device_memory& mem,
-	               InterpolationType interpolation,
-	               ExtensionType extension)
+	void tex_alloc(device_memory& mem)
 	{
-		VLOG(1) << "Texture allocate: " << name << ", "
+		VLOG(1) << "Texture allocate: " << mem.name << ", "
 		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
 		        << string_human_readable_size(mem.memory_size()) << ")";
-		kernel_tex_copy(&kernel_globals,
-		                name,
-		                mem.data_pointer,
-		                mem.data_width,
-		                mem.data_height,
-		                mem.data_depth,
-		                interpolation,
-		                extension);
-		mem.device_pointer = mem.data_pointer;
+
+		if(mem.interpolation == INTERPOLATION_NONE) {
+			/* Data texture. */
+			kernel_tex_copy(&kernel_globals,
+							mem.name,
+							mem.host_pointer,
+							mem.data_size);
+		}
+		else {
+			/* Image Texture. */
+			int flat_slot = 0;
+			if(string_startswith(mem.name, "__tex_image")) {
+				int pos =  string(mem.name).rfind("_");
+				flat_slot = atoi(mem.name + pos + 1);
+			}
+			else {
+				assert(0);
+			}
+
+			if(flat_slot >= texture_info.size()) {
+				/* Allocate some slots in advance, to reduce amount
+				 * of re-allocations. */
+				texture_info.resize(flat_slot + 128);
+			}
+
+			TextureInfo& info = texture_info[flat_slot];
+			info.data = (uint64_t)mem.host_pointer;
+			info.cl_buffer = 0;
+			info.interpolation = mem.interpolation;
+			info.extension = mem.extension;
+			info.width = mem.data_width;
+			info.height = mem.data_height;
+			info.depth = mem.data_depth;
+
+			need_texture_info = true;
+		}
+
+		mem.device_pointer = (device_ptr)mem.host_pointer;
 		mem.device_size = mem.memory_size();
 		stats.mem_alloc(mem.device_size);
 	}
@@ -182,6 +426,7 @@ public:
 			mem.device_pointer = 0;
 			stats.mem_free(mem.device_size);
 			mem.device_size = 0;
+			need_texture_info = true;
 		}
 	}
 
@@ -196,8 +441,9 @@ public:
 
 	void thread_run(DeviceTask *task)
 	{
-		if(task->type == DeviceTask::PATH_TRACE)
-			thread_path_trace(*task);
+		if(task->type == DeviceTask::RENDER) {
+			thread_render(*task);
+		}
 		else if(task->type == DeviceTask::FILM_CONVERT)
 			thread_film_convert(*task);
 		else if(task->type == DeviceTask::SHADER)
@@ -213,74 +459,327 @@ public:
 		}
 	};
 
-	void thread_path_trace(DeviceTask& task)
+	bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
 	{
-		if(task_pool.canceled()) {
-			if(task.need_finish_queue == false)
-				return;
+		TilesInfo *tiles = (TilesInfo*) task->tiles_mem.host_pointer;
+		for(int i = 0; i < 9; i++) {
+			tiles->buffers[i] = buffers[i];
 		}
 
-		KernelGlobals kg = thread_kernel_globals_init();
-		RenderTile tile;
+		task->tiles_mem.copy_to_device();
 
-		void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);
+		return true;
+	}
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-		if(system_cpu_support_avx2()) {
-			path_trace_kernel = kernel_cpu_avx2_path_trace;
+	bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
+	                               DenoisingTask *task)
+	{
+		int4 rect = task->rect;
+		int   r   = task->nlm_state.r;
+		int   f   = task->nlm_state.f;
+		float a   = task->nlm_state.a;
+		float k_2 = task->nlm_state.k_2;
+
+		int w = align_up(rect.z-rect.x, 4);
+		int h = rect.w-rect.y;
+
+		float *blurDifference = (float*) task->nlm_state.temporary_1_ptr;
+		float *difference     = (float*) task->nlm_state.temporary_2_ptr;
+		float *weightAccum    = (float*) task->nlm_state.temporary_3_ptr;
+
+		memset(weightAccum, 0, sizeof(float)*w*h);
+		memset((float*) out_ptr, 0, sizeof(float)*w*h);
+
+		for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+			int dy = i / (2*r+1) - r;
+			int dx = i % (2*r+1) - r;
+
+			int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)};
+			filter_nlm_calc_difference_kernel()(dx, dy,
+			                                    (float*) guide_ptr,
+			                                    (float*) variance_ptr,
+			                                    difference,
+			                                    local_rect,
+			                                    w, 0,
+			                                    a, k_2);
+
+			filter_nlm_blur_kernel()       (difference, blurDifference, local_rect, w, f);
+			filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
+			filter_nlm_blur_kernel()       (difference, blurDifference, local_rect, w, f);
+
+			filter_nlm_update_output_kernel()(dx, dy,
+			                                  blurDifference,
+			                                  (float*) image_ptr,
+			                                  (float*) out_ptr,
+			                                  weightAccum,
+			                                  local_rect,
+			                                  w, f);
 		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-		if(system_cpu_support_avx()) {
-			path_trace_kernel = kernel_cpu_avx_path_trace;
+
+		int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
+		filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w);
+
+		return true;
+	}
+
+	bool denoising_construct_transform(DenoisingTask *task)
+	{
+		for(int y = 0; y < task->filter_area.w; y++) {
+			for(int x = 0; x < task->filter_area.z; x++) {
+				filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
+				                                    x + task->filter_area.x,
+				                                    y + task->filter_area.y,
+				                                    y*task->filter_area.z + x,
+				                                    (float*) task->storage.transform.device_pointer,
+				                                    (int*)   task->storage.rank.device_pointer,
+				                                    &task->rect.x,
+				                                    task->buffer.pass_stride,
+				                                    task->radius,
+				                                    task->pca_threshold);
+			}
 		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-		if(system_cpu_support_sse41()) {
-			path_trace_kernel = kernel_cpu_sse41_path_trace;
+		return true;
+	}
+
+	bool denoising_reconstruct(device_ptr color_ptr,
+	                           device_ptr color_variance_ptr,
+	                           device_ptr output_ptr,
+	                           DenoisingTask *task)
+	{
+		mem_zero(task->storage.XtWX);
+		mem_zero(task->storage.XtWY);
+
+		float *difference     = (float*) task->reconstruction_state.temporary_1_ptr;
+		float *blurDifference = (float*) task->reconstruction_state.temporary_2_ptr;
+
+		int r = task->radius;
+		for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+			int dy = i / (2*r+1) - r;
+			int dx = i % (2*r+1) - r;
+
+			int local_rect[4] = {max(0, -dx), max(0, -dy),
+			                     task->reconstruction_state.source_w - max(0, dx),
+			                     task->reconstruction_state.source_h - max(0, dy)};
+			filter_nlm_calc_difference_kernel()(dx, dy,
+			                                    (float*) color_ptr,
+			                                    (float*) color_variance_ptr,
+			                                    difference,
+			                                    local_rect,
+			                                    task->buffer.stride,
+			                                    task->buffer.pass_stride,
+			                                    1.0f,
+			                                    task->nlm_k_2);
+			filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
+			filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4);
+			filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
+			filter_nlm_construct_gramian_kernel()(dx, dy,
+			                                      blurDifference,
+			                                      (float*)  task->buffer.mem.device_pointer,
+			                                      (float*)  task->storage.transform.device_pointer,
+			                                      (int*)    task->storage.rank.device_pointer,
+			                                      (float*)  task->storage.XtWX.device_pointer,
+			                                      (float3*) task->storage.XtWY.device_pointer,
+			                                      local_rect,
+			                                      &task->reconstruction_state.filter_window.x,
+			                                      task->buffer.stride,
+			                                      4,
+			                                      task->buffer.pass_stride);
 		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-		if(system_cpu_support_sse3()) {
-			path_trace_kernel = kernel_cpu_sse3_path_trace;
+		for(int y = 0; y < task->filter_area.w; y++) {
+			for(int x = 0; x < task->filter_area.z; x++) {
+				filter_finalize_kernel()(x,
+				                         y,
+				                         y*task->filter_area.z + x,
+				                         (float*)  output_ptr,
+				                         (int*)    task->storage.rank.device_pointer,
+				                         (float*)  task->storage.XtWX.device_pointer,
+				                         (float3*) task->storage.XtWY.device_pointer,
+				                         &task->reconstruction_state.buffer_params.x,
+				                         task->render_buffer.samples);
+			}
 		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-		if(system_cpu_support_sse2()) {
-			path_trace_kernel = kernel_cpu_sse2_path_trace;
+		return true;
+	}
+
+	bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
+	                              device_ptr mean_ptr, device_ptr variance_ptr,
+	                              int r, int4 rect, DenoisingTask * /*task*/)
+	{
+		for(int y = rect.y; y < rect.w; y++) {
+			for(int x = rect.x; x < rect.z; x++) {
+				filter_combine_halves_kernel()(x, y,
+				                               (float*) mean_ptr,
+				                               (float*) variance_ptr,
+				                               (float*) a_ptr,
+				                               (float*) b_ptr,
+				                               &rect.x,
+				                               r);
+			}
 		}
-		else
-#endif
-		{
-			path_trace_kernel = kernel_cpu_path_trace;
+		return true;
+	}
+
+	bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
+	                             device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
+	                             device_ptr buffer_variance_ptr, DenoisingTask *task)
+	{
+		for(int y = task->rect.y; y < task->rect.w; y++) {
+			for(int x = task->rect.x; x < task->rect.z; x++) {
+				filter_divide_shadow_kernel()(task->render_buffer.samples,
+				                              task->tiles,
+				                              x, y,
+				                              (float*) a_ptr,
+				                              (float*) b_ptr,
+				                              (float*) sample_variance_ptr,
+				                              (float*) sv_variance_ptr,
+				                              (float*) buffer_variance_ptr,
+				                              &task->rect.x,
+				                              task->render_buffer.pass_stride,
+				                              task->render_buffer.denoising_data_offset);
+			}
 		}
-		
-		while(task.acquire_tile(this, tile)) {
-			float *render_buffer = (float*)tile.buffer;
-			uint *rng_state = (uint*)tile.rng_state;
-			int start_sample = tile.start_sample;
-			int end_sample = tile.start_sample + tile.num_samples;
-
-			for(int sample = start_sample; sample < end_sample; sample++) {
-				if(task.get_cancel() || task_pool.canceled()) {
-					if(task.need_finish_queue == false)
-						break;
-				}
+		return true;
+	}
+
+	bool denoising_get_feature(int mean_offset,
+	                           int variance_offset,
+	                           device_ptr mean_ptr,
+	                           device_ptr variance_ptr,
+	                           DenoisingTask *task)
+	{  /* Runs the get-feature filter kernel once for every pixel of task->rect. */
+		for(int y = task->rect.y; y < task->rect.w; y++) {  /* rect.w is the exclusive upper bound for y */
+			for(int x = task->rect.x; x < task->rect.z; x++) {  /* rect.z is the exclusive upper bound for x */
+				filter_get_feature_kernel()(task->render_buffer.samples,
+				                            task->tiles,
+				                            mean_offset,
+				                            variance_offset,
+				                            x, y,
+				                            (float*) mean_ptr,
+				                            (float*) variance_ptr,
+				                            &task->rect.x,  /* the rect is handed over as a pointer to its first component */
+				                            task->render_buffer.pass_stride,
+				                            task->render_buffer.denoising_data_offset);
+			}
+		}
+		return true;  /* unconditional: this function has no failure path */
+	}
+
+	bool denoising_detect_outliers(device_ptr image_ptr,
+	                               device_ptr variance_ptr,
+	                               device_ptr depth_ptr,
+	                               device_ptr output_ptr,
+	                               DenoisingTask *task)
+	{  /* Runs the detect-outliers filter kernel once for every pixel of task->rect. */
+		for(int y = task->rect.y; y < task->rect.w; y++) {  /* rect.w is the exclusive upper bound for y */
+			for(int x = task->rect.x; x < task->rect.z; x++) {  /* rect.z is the exclusive upper bound for x */
+				filter_detect_outliers_kernel()(x, y,
+				                                (float*) image_ptr,
+				                                (float*) variance_ptr,
+				                                (float*) depth_ptr,
+				                                (float*) output_ptr,
+				                                &task->rect.x,  /* the rect is handed over as a pointer to its first component */
+				                                task->buffer.pass_stride);
+			}
+		}
+		return true;  /* unconditional: this function has no failure path */
+	}
+
+	void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
+	{
+		scoped_timer timer(&tile.buffers->render_time);
+
+		float *render_buffer = (float*)tile.buffer;
+		int start_sample = tile.start_sample;
+		int end_sample = tile.start_sample + tile.num_samples;
+
+		for(int sample = start_sample; sample < end_sample; sample++) {
+			if(task.get_cancel() || task_pool.canceled()) {
+				if(task.need_finish_queue == false)
+					break;
+			}
 
-				for(int y = tile.y; y < tile.y + tile.h; y++) {
-					for(int x = tile.x; x < tile.x + tile.w; x++) {
-						path_trace_kernel(&kg, render_buffer, rng_state,
-						                  sample, x, y, tile.offset, tile.stride);
-					}
+			for(int y = tile.y; y < tile.y + tile.h; y++) {
+				for(int x = tile.x; x < tile.x + tile.w; x++) {
+					path_trace_kernel()(kg, render_buffer,
+					                    sample, x, y, tile.offset, tile.stride);
 				}
+			}
+
+			tile.sample = sample + 1;
+
+			task.update_progress(&tile, tile.w*tile.h);
+		}
+	}
+
+	void denoise(DeviceTask &task, DenoisingTask& denoising, RenderTile &tile)
+	{
+		tile.sample = tile.start_sample + tile.num_samples;
+
+		denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
+		denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
+		denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+		denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+		denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+		denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+		denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
+		denoising.functions.set_tiles = function_bind(&CPUDevice::denoising_set_tiles, this, _1, &denoising);
+
+		denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
+		denoising.render_buffer.samples = tile.sample;
 
-				tile.sample = sample + 1;
+		RenderTile rtiles[9];
+		rtiles[4] = tile;
+		task.map_neighbor_tiles(rtiles, this);
+		denoising.tiles_from_rendertiles(rtiles);
 
-				task.update_progress(&tile, tile.w*tile.h);
+		denoising.init_from_devicetask(task);
+
+		denoising.run_denoising();
+
+		task.unmap_neighbor_tiles(rtiles, this);
+
+		task.update_progress(&tile, tile.w*tile.h);
+	}
+
+	void thread_render(DeviceTask& task)
+	{  /* Handler for DeviceTask::RENDER tasks (dispatched from thread_run). */
+		if(task_pool.canceled()) {
+			if(task.need_finish_queue == false)
+				return;
+		}
+
+		/* Allocate the buffer that holds the kernel globals used by this call. */
+		device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals");
+		kgbuffer.alloc_to_device(1);
+
+		KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init());
+		CPUSplitKernel *split_kernel = NULL;
+		if(use_split_kernel) {
+			split_kernel = new CPUSplitKernel(this);
+			if(!split_kernel->load_kernels(requested_features)) {
+				thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
+				kg->~KernelGlobals();  /* kg was placement-new'd: destroy it before its storage is freed, as the normal exit path does */
+				kgbuffer.free();
+				delete split_kernel;
+				return;
+			}
+		}
+
+		RenderTile tile;
+		DenoisingTask denoising(this);
+
+		while(task.acquire_tile(this, tile)) {  /* one acquired tile per iteration; tile.task selects the work */
+			if(tile.task == RenderTile::PATH_TRACE) {
+				if(use_split_kernel) {
+					device_only_memory<uchar> void_buffer(this, "void_buffer");
+					split_kernel->path_trace(&task, tile, kgbuffer, void_buffer);
+				}
+				else {
+					path_trace(task, tile, kg);
+				}
+			}
+			else if(tile.task == RenderTile::DENOISE) {
+				denoise(task, denoising, tile);
 			}
 
 			task.release_tile(tile);
@@ -291,7 +790,10 @@ public:
 			}
 		}
 
-		thread_kernel_globals_free(&kg);
+		thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
+		kg->~KernelGlobals();
+		kgbuffer.free();
+		delete split_kernel;
 	}
 
 	void thread_film_convert(DeviceTask& task)
@@ -299,86 +801,16 @@ public:
 		float sample_scale = 1.0f/(task.sample + 1);
 
 		if(task.rgba_half) {
-			void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-			if(system_cpu_support_avx2()) {
-				convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float;
-			}
-			else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-			if(system_cpu_support_avx()) {
-				convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float;
-			}
-			else
-#endif	
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
-			if(system_cpu_support_sse41()) {
-				convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float;
-			}
-			else
-#endif		
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3		
-			if(system_cpu_support_sse3()) {
-				convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float;
-			}
-			else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-			if(system_cpu_support_sse2()) {
-				convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float;
-			}
-			else
-#endif
-			{
-				convert_to_half_float_kernel = kernel_cpu_convert_to_half_float;
-			}
-
 			for(int y = task.y; y < task.y + task.h; y++)
 				for(int x = task.x; x < task.x + task.w; x++)
-					convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
-						sample_scale, x, y, task.offset, task.stride);
+					convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+					                               sample_scale, x, y, task.offset, task.stride);
 		}
 		else {
-			void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-			if(system_cpu_support_avx2()) {
-				convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte;
-			}
-			else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-			if(system_cpu_support_avx()) {
-				convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte;
-			}
-			else
-#endif		
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
-			if(system_cpu_support_sse41()) {
-				convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte;
-			}
-			else
-#endif			
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-			if(system_cpu_support_sse3()) {
-				convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte;
-			}
-			else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-			if(system_cpu_support_sse2()) {
-				convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte;
-			}
-			else
-#endif
-			{
-				convert_to_byte_kernel = kernel_cpu_convert_to_byte;
-			}
-
 			for(int y = task.y; y < task.y + task.h; y++)
 				for(int x = task.x; x < task.x + task.w; x++)
-					convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
-						sample_scale, x, y, task.offset, task.stride);
+					convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+					                         sample_scale, x, y, task.offset, task.stride);
 
 		}
 	}
@@ -390,53 +822,16 @@ public:
 #ifdef WITH_OSL
 		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
 #endif
-		void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int);
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-		if(system_cpu_support_avx2()) {
-			shader_kernel = kernel_cpu_avx2_shader;
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-		if(system_cpu_support_avx()) {
-			shader_kernel = kernel_cpu_avx_shader;
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
-		if(system_cpu_support_sse41()) {
-			shader_kernel = kernel_cpu_sse41_shader;
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-		if(system_cpu_support_sse3()) {
-			shader_kernel = kernel_cpu_sse3_shader;
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-		if(system_cpu_support_sse2()) {
-			shader_kernel = kernel_cpu_sse2_shader;
-		}
-		else
-#endif
-		{
-			shader_kernel = kernel_cpu_shader;
-		}
-
 		for(int sample = 0; sample < task.num_samples; sample++) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-				shader_kernel(&kg,
-				              (uint4*)task.shader_input,
-				              (float4*)task.shader_output,
-				              (float*)task.shader_output_luma,
-				              task.shader_eval_type,
-				              task.shader_filter,
-				              x,
-				              task.offset,
-				              sample);
+				shader_kernel()(&kg,
+				                (uint4*)task.shader_input,
+				                (float4*)task.shader_output,
+				                task.shader_eval_type,
+				                task.shader_filter,
+				                x,
+				                task.offset,
+				                sample);
 
 			if(task.get_cancel() || task_pool.canceled())
 				break;
@@ -453,20 +848,23 @@ public:
 	int get_split_task_count(DeviceTask& task)
 	{
 		if(task.type == DeviceTask::SHADER)
-			return task.get_subtask_count(TaskScheduler::num_threads(), 256);
+			return task.get_subtask_count(info.cpu_threads, 256);
 		else
-			return task.get_subtask_count(TaskScheduler::num_threads());
+			return task.get_subtask_count(info.cpu_threads);
 	}
 
 	void task_add(DeviceTask& task)
 	{
+		/* Load texture info. */
+		load_texture_info();
+
 		/* split task into smaller ones */
 		list<DeviceTask> tasks;
 
 		if(task.type == DeviceTask::SHADER)
-			task.split(tasks, TaskScheduler::num_threads(), 256);
+			task.split(tasks, info.cpu_threads, 256);
 		else
-			task.split(tasks, TaskScheduler::num_threads());
+			task.split(tasks, info.cpu_threads);
 
 		foreach(DeviceTask& task, tasks)
 			task_pool.push(new CPUDeviceTask(this, task));
@@ -501,6 +899,10 @@ protected:
 
 	inline void thread_kernel_globals_free(KernelGlobals *kg)
 	{
+		if(kg == NULL) {
+			return;
+		}
+
 		if(kg->transparent_shadow_intersections != NULL) {
 			free(kg->transparent_shadow_intersections);
 		}
@@ -515,8 +917,121 @@ protected:
 		OSLShader::thread_free(kg);
 #endif
 	}
+
+	virtual bool load_kernels(const DeviceRequestedFeatures& requested_features_) {
+		requested_features = requested_features_;  /* only remembered here; thread_render() later passes it to CPUSplitKernel::load_kernels() */
+
+		return true;  /* nothing in this function can fail */
+	}
 };
 
+/* split kernel */
+
+class CPUSplitKernelFunction : public SplitKernelFunction {  /* Wraps one CPU split-kernel function pointer (func). */
+public:
+	CPUDevice* device;
+	void (*func)(KernelGlobals *kg, KernelData *data);  /* NULL after construction; assigned from outside the class */
+
+	CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
+	~CPUSplitKernelFunction() {}
+
+	virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
+	{
+		if(!func) {
+			return false;  /* nothing to run: no function has been assigned */
+		}
+
+		KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
+		kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
+		/* Call func once per (x, y) of dim.global_size, publishing the current id in kg->global_id. */
+		for(int y = 0; y < dim.global_size[1]; y++) {
+			for(int x = 0; x < dim.global_size[0]; x++) {
+				kg->global_id = make_int2(x, y);
+
+				func(kg, (KernelData*)data.device_pointer);
+			}
+		}
+
+		return true;
+	}
+};
+
+CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
+{  /* Nothing beyond the initializer list: the same pointer goes to the base class and to the device member. */
+}
+
+bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
+                                                    RenderTile& rtile,
+                                                    int num_global_elements,
+                                                    device_memory& kernel_globals,
+                                                    device_memory& data,
+                                                    device_memory& split_data,
+                                                    device_memory& ray_state,
+                                                    device_memory& queue_index,
+                                                    device_memory& use_queues_flags,
+                                                    device_memory& work_pool_wgs)
+{
+	KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
+	kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
+	/* Call the device's data-init kernel once per (x, y) of dim.global_size, publishing the id in kg->global_id. */
+	for(int y = 0; y < dim.global_size[1]; y++) {
+		for(int x = 0; x < dim.global_size[0]; x++) {
+			kg->global_id = make_int2(x, y);
+
+			device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer,  /* same address as kg above */
+			                           (KernelData*)data.device_pointer,
+			                           (void*)split_data.device_pointer,
+			                           num_global_elements,
+			                           (char*)ray_state.device_pointer,
+			                           rtile.start_sample,
+			                           rtile.start_sample + rtile.num_samples,
+			                           rtile.x,
+			                           rtile.y,
+			                           rtile.w,
+			                           rtile.h,
+			                           rtile.offset,
+			                           rtile.stride,
+			                           (int*)queue_index.device_pointer,
+			                           dim.global_size[0] * dim.global_size[1],
+			                           (char*)use_queues_flags.device_pointer,
+			                           (uint*)work_pool_wgs.device_pointer,
+			                           rtile.num_samples,
+			                           (float*)rtile.buffer);
+		}
+	}
+
+	return true;  /* unconditional: this function has no failure path */
+}
+
+SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(const string& kernel_name,
+                                                               const DeviceRequestedFeatures&)
+{  /* Returns a newly allocated wrapper for the named kernel, or NULL when no function is found. */
+	CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
+
+	kernel->func = device->split_kernels[kernel_name]();  /* NOTE(review): operator[] presumably default-inserts unknown names — confirm the container type */
+	if(!kernel->func) {
+		delete kernel;  /* do not hand out a wrapper without a function */
+		return NULL;
+	}
+
+	return kernel;  /* allocated with new; the caller receives the raw pointer */
+}
+
+int2 CPUSplitKernel::split_kernel_local_size()
+{
+	return make_int2(1, 1);  /* the local size is fixed at 1x1 on the CPU */
+}
+
+int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/) {
+	return make_int2(1, 1);  /* all arguments are ignored; the global size is fixed at 1x1 */
+}
+
+uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
+	KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;  /* the buffer's device pointer is used directly as a KernelGlobals* */
+
+	return split_data_buffer_size(kg, num_threads);  /* the size computation is delegated; data is ignored */
+}
+
 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
 {
 	return new CPUDevice(info, stats, background);
@@ -531,7 +1046,13 @@ void device_cpu_info(vector<DeviceInfo>& devices)
 	info.id = "CPU";
 	info.num = 0;
 	info.advanced_shading = true;
-	info.pack_images = false;
+	info.bvh_layout_mask = BVH_LAYOUT_BVH2;
+	if (system_cpu_support_sse2()) {
+		info.bvh_layout_mask |= BVH_LAYOUT_BVH4;
+	}
+	info.has_volume_decoupled = true;
+	info.has_osl = true;
+	info.has_half_images = true;
 
 	devices.insert(devices.begin(), info);
 }
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index dafac6dfcb3..b4529feffa7 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -15,32 +15,40 @@
  */
 
 #include <climits>
+#include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include "device.h"
-#include "device_intern.h"
+#include "device/device.h"
+#include "device/device_denoising.h"
+#include "device/device_intern.h"
+#include "device/device_split_kernel.h"
 
-#include "buffers.h"
+#include "render/buffers.h"
+
+#include "kernel/filter/filter_defines.h"
 
 #ifdef WITH_CUDA_DYNLOAD
 #  include "cuew.h"
 #else
-#  include "util_opengl.h"
+#  include "util/util_opengl.h"
 #  include <cuda.h>
 #  include <cudaGL.h>
 #endif
-#include "util_debug.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_md5.h"
-#include "util_opengl.h"
-#include "util_path.h"
-#include "util_string.h"
-#include "util_system.h"
-#include "util_types.h"
-#include "util_time.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_md5.h"
+#include "util/util_opengl.h"
+#include "util/util_path.h"
+#include "util/util_string.h"
+#include "util/util_system.h"
+#include "util/util_types.h"
+#include "util/util_time.h"
+
+#include "kernel/split/kernel_split_data_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -78,18 +86,71 @@ int cuewCompilerVersion(void)
 }  /* namespace */
 #endif  /* WITH_CUDA_DYNLOAD */
 
+class CUDADevice;
+
+class CUDASplitKernel : public DeviceSplitKernel {  /* CUDA implementation of the DeviceSplitKernel interface; declarations only. */
+	CUDADevice *device;  /* private: declared before the first access specifier */
+public:
+	explicit CUDASplitKernel(CUDADevice *device);
+
+	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& kernel_data_,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flag,
+	                                            device_memory& work_pool_wgs);
+
+	virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
+	                                                       const DeviceRequestedFeatures&);
+	virtual int2 split_kernel_local_size();
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+};
+
+/* Scoped utility to push/pop the CUDA context of a device; constructor and destructor are defined out of view. */
+class CUDAContextScope {
+public:
+	CUDAContextScope(CUDADevice *device);  /* NOTE(review): presumably pushes the device's context — confirm in the definition */
+	~CUDAContextScope();  /* NOTE(review): presumably pops it again — confirm in the definition */
+
+private:
+	CUDADevice *device;
+};
+
 class CUDADevice : public Device
 {
 public:
 	DedicatedTaskPool task_pool;
 	CUdevice cuDevice;
 	CUcontext cuContext;
-	CUmodule cuModule;
-	map<device_ptr, bool> tex_interp_map;
-	map<device_ptr, uint> tex_bindless_map;
+	CUmodule cuModule, cuFilterModule;
+	size_t device_texture_headroom;
+	size_t device_working_headroom;
+	bool move_texture_to_host;
+	size_t map_host_used;
+	size_t map_host_limit;
+	int can_map_host;
 	int cuDevId;
 	int cuDevArchitecture;
 	bool first_error;
+	CUDASplitKernel *split_kernel;
+
+	struct CUDAMem {
+		CUDAMem()
+		: texobject(0), array(0), map_host_pointer(0), free_map_host(false) {}
+
+		CUtexObject texobject;
+		CUarray array;
+		void *map_host_pointer;
+		bool free_map_host;
+	};
+	typedef map<device_memory*, CUDAMem> CUDAMemMap;
+	CUDAMemMap cuda_mem_map;
 
 	struct PixelMem {
 		GLuint cuPBO;
@@ -97,12 +158,11 @@ public:
 		GLuint cuTexId;
 		int w, h;
 	};
-
 	map<device_ptr, PixelMem> pixel_mem_map;
 
 	/* Bindless Textures */
-	device_vector<uint> bindless_mapping;
-	bool need_bindless_mapping;
+	device_vector<TextureInfo> texture_info;
+	bool need_texture_info;
 
 	CUdeviceptr cuda_device_ptr(device_ptr mem)
 	{
@@ -140,7 +200,7 @@ public:
 		CUresult result = stmt; \
 		\
 		if(result != CUDA_SUCCESS) { \
-			string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+			string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
 			if(error_msg == "") \
 				error_msg = message; \
 			fprintf(stderr, "%s\n", message.c_str()); \
@@ -172,18 +232,9 @@ public:
 		cuda_error_documentation();
 	}
 
-	void cuda_push_context()
-	{
-		cuda_assert(cuCtxSetCurrent(cuContext));
-	}
-
-	void cuda_pop_context()
-	{
-		cuda_assert(cuCtxSetCurrent(NULL));
-	}
-
 	CUDADevice(DeviceInfo& info, Stats &stats, bool background_)
-	: Device(info, stats, background_)
+	: Device(info, stats, background_),
+	  texture_info(this, "__texture_info", MEM_TEXTURE)
 	{
 		first_error = true;
 		background = background_;
@@ -192,26 +243,50 @@ public:
 		cuDevice = 0;
 		cuContext = 0;
 
-		need_bindless_mapping = false;
+		cuModule = 0;
+		cuFilterModule = 0;
+
+		split_kernel = NULL;
+
+		need_texture_info = false;
+
+		device_texture_headroom = 0;
+		device_working_headroom = 0;
+		move_texture_to_host = false;
+		map_host_limit = 0;
+		map_host_used = 0;
+		can_map_host = 0;
 
-		/* intialize */
+		/* Initialize CUDA. */
 		if(cuda_error(cuInit(0)))
 			return;
 
-		/* setup device and context */
+		/* Setup device and context. */
 		if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
 			return;
 
+		/* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
+		 * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
+		 * so we can predict which memory to map to host. */
+		cuda_assert(cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+
+		unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
+		if(can_map_host) {
+			ctx_flags |= CU_CTX_MAP_HOST;
+			init_host_memory();
+		}
+
+		/* Create context. */
 		CUresult result;
 
 		if(background) {
-			result = cuCtxCreate(&cuContext, 0, cuDevice);
+			result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
 		}
 		else {
-			result = cuGLCtxCreate(&cuContext, 0, cuDevice);
+			result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
 
 			if(result != CUDA_SUCCESS) {
-				result = cuCtxCreate(&cuContext, 0, cuDevice);
+				result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
 				background = true;
 			}
 		}
@@ -224,16 +299,17 @@ public:
 		cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
 		cuDevArchitecture = major*100 + minor*10;
 
-		cuda_pop_context();
+		/* Pop context set by cuCtxCreate. */
+		cuCtxPopCurrent(NULL);
 	}
 
 	~CUDADevice()
 	{
 		task_pool.stop();
 
-		if(info.has_bindless_textures) {
-			tex_free(bindless_mapping);
-		}
+		delete split_kernel;
+
+		texture_info.free();
 
 		cuda_assert(cuCtxDestroy(cuContext));
 	}
@@ -244,9 +320,9 @@ public:
 		cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
 		cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
 
-		/* We only support sm_20 and above */
-		if(major < 2) {
-			cuda_error_message(string_printf("CUDA device supported only with compute capability 2.0 or up, found %d.%d.", major, minor));
+		/* We only support sm_30 and above */
+		if(major < 3) {
+			cuda_error_message(string_printf("CUDA device supported only with compute capability 3.0 or up, found %d.%d.", major, minor));
 			return false;
 		}
 
@@ -258,26 +334,29 @@ public:
 		return DebugFlags().cuda.adaptive_compile;
 	}
 
+	bool use_split_kernel()
+	{
+		return DebugFlags().cuda.split_kernel;
+	}
+
 	/* Common NVCC flags which stays the same regardless of shading model,
 	 * kernel sources md5 and only depends on compiler or compilation settings.
 	 */
 	string compile_kernel_get_common_cflags(
-	        const DeviceRequestedFeatures& requested_features)
+	        const DeviceRequestedFeatures& requested_features,
+	        bool filter=false, bool split=false)
 	{
-		const int cuda_version = cuewCompilerVersion();
 		const int machine = system_cpu_bits();
-		const string kernel_path = path_get("kernel");
-		const string include = kernel_path;
+		const string source_path = path_get("source");
+		const string include_path = source_path;
 		string cflags = string_printf("-m%d "
 		                              "--ptxas-options=\"-v\" "
 		                              "--use_fast_math "
 		                              "-DNVCC "
-		                              "-D__KERNEL_CUDA_VERSION__=%d "
 		                               "-I\"%s\"",
 		                              machine,
-		                              cuda_version,
-		                              include.c_str());
-		if(use_adaptive_compilation()) {
+		                              include_path.c_str());
+		if(!filter && use_adaptive_compilation()) {
 			cflags += " " + requested_features.get_build_options();
 		}
 		const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
@@ -287,6 +366,11 @@ public:
 #ifdef WITH_CYCLES_DEBUG
 		cflags += " -D__KERNEL_DEBUG__";
 #endif
+
+		if(split) {
+			cflags += " -D__SPLIT__";
+		}
+
 		return cflags;
 	}
 
@@ -306,22 +390,36 @@ public:
 			cuda_error_message("CUDA nvcc compiler version could not be parsed.");
 			return false;
 		}
-		if(cuda_version < 75) {
+		if(cuda_version < 80) {
 			printf("Unsupported CUDA version %d.%d detected, "
-			       "you need CUDA 7.5 or newer.\n",
+			       "you need CUDA 8.0 or newer.\n",
 			       major, minor);
 			return false;
 		}
-		else if(cuda_version != 75 && cuda_version != 80) {
+		else if(cuda_version != 80) {
 			printf("CUDA version %d.%d detected, build may succeed but only "
-			       "CUDA 7.5 and 8.0 are officially supported.\n",
+			       "CUDA 8.0 is officially supported.\n",
 			       major, minor);
 		}
 		return true;
 	}
 
-	string compile_kernel(const DeviceRequestedFeatures& requested_features)
+	string compile_kernel(const DeviceRequestedFeatures& requested_features,
+	                      bool filter=false, bool split=false)
 	{
+		const char *name, *source;
+		if(filter) {
+			name = "filter";
+			source = "filter.cu";
+		}
+		else if(split) {
+			name = "kernel_split";
+			source = "kernel_split.cu";
+		}
+		else {
+			name = "kernel";
+			source = "kernel.cu";
+		}
 		/* Compute cubin name. */
 		int major, minor;
 		cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
@@ -329,8 +427,8 @@ public:
 
 		/* Attempt to use kernel provided with Blender. */
 		if(!use_adaptive_compilation()) {
-			const string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin",
-			                                            major, minor));
+			const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin",
+			                                            name, major, minor));
 			VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
 			if(path_exists(cubin)) {
 				VLOG(1) << "Using precompiled kernel.";
@@ -339,19 +437,19 @@ public:
 		}
 
 		const string common_cflags =
-		        compile_kernel_get_common_cflags(requested_features);
+		        compile_kernel_get_common_cflags(requested_features, filter, split);
 
 		/* Try to use locally compiled kernel. */
-		const string kernel_path = path_get("kernel");
-		const string kernel_md5 = path_files_md5_hash(kernel_path);
+		const string source_path = path_get("source");
+		const string kernel_md5 = path_files_md5_hash(source_path);
 
 		/* We include cflags into md5 so changing cuda toolkit or changing other
 		 * compiler command line arguments makes sure cubin gets re-built.
 		 */
 		const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
 
-		const string cubin_file = string_printf("cycles_kernel_sm%d%d_%s.cubin",
-		                                        major, minor,
+		const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin",
+		                                        name, major, minor,
 		                                        cubin_md5.c_str());
 		const string cubin = path_cache_get(path_join("kernels", cubin_file));
 		VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
@@ -362,9 +460,9 @@ public:
 
 #ifdef _WIN32
 		if(have_precompiled_kernels()) {
-			if(major < 2) {
+			if(major < 3) {
 				cuda_error_message(string_printf(
-				        "CUDA device requires compute capability 2.0 or up, "
+				        "CUDA device requires compute capability 3.0 or up, "
 				        "found %d.%d. Your GPU is not supported.",
 				        major, minor));
 			}
@@ -383,9 +481,10 @@ public:
 			return "";
 		}
 		const char *nvcc = cuewCompilerPath();
-		const string kernel = path_join(kernel_path,
-		                          path_join("kernels",
-		                                    path_join("cuda", "kernel.cu")));
+		const string kernel = path_join(
+		        path_join(source_path, "kernel"),
+		        path_join("kernels",
+		                  path_join("cuda", source)));
 		double starttime = time_dt();
 		printf("Compiling CUDA kernel ...\n");
 
@@ -424,6 +523,16 @@ public:
 
 	bool load_kernels(const DeviceRequestedFeatures& requested_features)
 	{
+		/* TODO(sergey): Support kernels re-load for CUDA devices.
+		 *
+		 * Currently re-loading kernel will invalidate memory pointers,
+		 * causing problems in cuCtxSynchronize.
+		 */
+		if(cuFilterModule && cuModule) {
+			VLOG(1) << "Skipping kernel reload, not currently supported.";
+			return true;
+		}
+
 		/* check if cuda init succeeded */
 		if(cuContext == 0)
 			return false;
@@ -433,13 +542,16 @@ public:
 			return false;
 
 		/* get kernel */
-		string cubin = compile_kernel(requested_features);
-
+		string cubin = compile_kernel(requested_features, false, use_split_kernel());
 		if(cubin == "")
 			return false;
 
+		string filter_cubin = compile_kernel(requested_features, true, false);
+		if(filter_cubin == "")
+			return false;
+
 		/* open module */
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		string cubin_data;
 		CUresult result;
@@ -452,112 +564,465 @@ public:
 		if(cuda_error_(result, "cuModuleLoad"))
 			cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
 
-		cuda_pop_context();
+		if(path_read_text(filter_cubin, cubin_data))
+			result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
+		else
+			result = CUDA_ERROR_FILE_NOT_FOUND;
+
+		if(cuda_error_(result, "cuModuleLoad"))
+			cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
+
+		if(result == CUDA_SUCCESS) {
+			reserve_local_memory(requested_features);
+		}
 
 		return (result == CUDA_SUCCESS);
 	}
 
-	void load_bindless_mapping()
+	void reserve_local_memory(const DeviceRequestedFeatures& requested_features)
+	{
+		if(use_split_kernel()) {
+			/* Split kernel mostly uses global memory and adaptive compilation,
+			 * difficult to predict how much is needed currently. */
+			return;
+		}
+
+		/* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
+		 * needed for kernel launches, so that we can reliably figure out when
+		 * to allocate scene data in mapped host memory. */
+		CUDAContextScope scope(this);
+
+		size_t total = 0, free_before = 0, free_after = 0;
+		cuMemGetInfo(&free_before, &total);
+
+		/* Get kernel function. */
+		CUfunction cuPathTrace;
+
+		if(requested_features.use_integrator_branched) {
+			cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
+		}
+		else {
+			cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
+		}
+
+		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+
+		int min_blocks, num_threads_per_block;
+		cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+
+		/* Launch kernel, using just 1 block appears sufficient to reserve
+		 * memory for all multiprocessors. For the multi GPU case it would
+		 * still be good to do this in parallel to make it faster. */
+		CUdeviceptr d_work_tiles = 0;
+		uint total_work_size = 0;
+
+		void *args[] = {&d_work_tiles,
+		                &total_work_size};
+
+		cuda_assert(cuLaunchKernel(cuPathTrace,
+		                           1, 1, 1,
+		                           num_threads_per_block, 1, 1,
+		                           0, 0, args, 0));
+
+		cuda_assert(cuCtxSynchronize());
+
+		cuMemGetInfo(&free_after, &total);
+		VLOG(1) << "Local memory reserved "
+		        << string_human_readable_number(free_before - free_after) << " bytes. ("
+		        << string_human_readable_size(free_before - free_after) << ")";
+
+#if 0
+		/* For testing mapped host memory, fill up device memory. */
+		const size_t keep_mb = 1024;
+
+		while(free_after > keep_mb * 1024 * 1024LL) {
+			CUdeviceptr tmp;
+			cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
+			cuMemGetInfo(&free_after, &total);
+		}
+#endif
+	}
+
+	void init_host_memory()
 	{
-		if(info.has_bindless_textures && need_bindless_mapping) {
-			tex_free(bindless_mapping);
-			tex_alloc("__bindless_mapping", bindless_mapping, INTERPOLATION_NONE, EXTENSION_REPEAT);
-			need_bindless_mapping = false;
+		/* Limit amount of host mapped memory, because allocating too much can
+		 * cause system instability. Leave at least half or 4 GB of system
+		 * memory free, whichever is smaller. */
+		size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+		size_t system_ram = system_physical_ram();
+
+		if(system_ram > 0) {
+			if(system_ram / 2 > default_limit) {
+				map_host_limit = system_ram - default_limit;
+			}
+			else {
+				map_host_limit = system_ram / 2;
+			}
+		}
+		else {
+			VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+			map_host_limit = 0;
 		}
+
+		/* Amount of device memory to keep free after texture memory
+		 * and working memory allocations respectively. We set the working
+		 * memory limit headroom lower so that some space is left after all
+		 * texture memory allocations. */
+		device_working_headroom = 32 * 1024 * 1024LL; // 32MB
+		device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
+
+		VLOG(1) << "Mapped host memory limit set to "
+		        << string_human_readable_number(map_host_limit) << " bytes. ("
+		        << string_human_readable_size(map_host_limit) << ")";
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType /*type*/)
+	void load_texture_info()
 	{
-		cuda_push_context();
-		CUdeviceptr device_pointer;
-		size_t size = mem.memory_size();
-		cuda_assert(cuMemAlloc(&device_pointer, size));
+		if(need_texture_info) {
+			texture_info.copy_to_device();
+			need_texture_info = false;
+		}
+	}
+
+	void move_textures_to_host(size_t size, bool for_texture)
+	{
+		/* Signal to reallocate textures in host memory only. */
+		move_texture_to_host = true;
+
+		while(size > 0) {
+			/* Find suitable memory allocation to move. */
+			device_memory *max_mem = NULL;
+			size_t max_size = 0;
+			bool max_is_image = false;
+
+			foreach(CUDAMemMap::value_type& pair, cuda_mem_map) {
+				device_memory& mem = *pair.first;
+				CUDAMem *cmem = &pair.second;
+
+				bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
+				bool is_image = is_texture && (mem.data_height > 1);
+
+				/* Can't move this type of memory. */
+				if(!is_texture || cmem->array) {
+					continue;
+				}
+
+				/* Already in host memory. */
+				if(cmem->map_host_pointer) {
+					continue;
+				}
+
+				/* When making room for a texture, only move image textures. */
+				if(for_texture && !is_image) {
+					continue;
+				}
+
+				/* Try to move largest allocation, prefer moving images. */
+				if(is_image > max_is_image ||
+				   (is_image == max_is_image && mem.device_size > max_size)) {
+					max_is_image = is_image;
+					max_size = mem.device_size;
+					max_mem = &mem;
+				}
+			}
+
+			/* Move to host memory. This part is mutex protected since
+			 * multiple CUDA devices could be moving the memory. The
+			 * first one will do it, and the rest will adopt the pointer. */
+			if(max_mem) {
+				VLOG(1) << "Move memory from device to host: " << max_mem->name;
+
+				static thread_mutex move_mutex;
+				thread_scoped_lock lock(move_mutex);
+
+				/* Preserve the original device pointer, in case of multi device
+				 * we can't change it because the pointer mapping would break. */
+				device_ptr prev_pointer = max_mem->device_pointer;
+				size_t prev_size = max_mem->device_size;
+
+				tex_free(*max_mem);
+				tex_alloc(*max_mem);
+				size = (max_size >= size)? 0: size - max_size;
+
+				max_mem->device_pointer = prev_pointer;
+				max_mem->device_size = prev_size;
+			}
+			else {
+				break;
+			}
+		}
+
+		/* Update texture info array with new pointers. */
+		load_texture_info();
+
+		move_texture_to_host = false;
+	}
+
+	CUDAMem *generic_alloc(device_memory& mem, size_t pitch_padding = 0)
+	{
+		CUDAContextScope scope(this);
+
+		CUdeviceptr device_pointer = 0;
+		size_t size = mem.memory_size() + pitch_padding;
+
+		CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
+		const char *status = "";
+
+		/* First try allocating in device memory, respecting headroom. We make
+		 * an exception for texture info. It is small and frequently accessed,
+		 * so treat it as working memory.
+		 *
+		 * If there is not enough room for working memory, we will try to move
+		 * textures to host memory, assuming the performance impact would have
+		 * been worse for working memory. */
+		bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
+		bool is_image = is_texture && (mem.data_height > 1);
+
+		size_t headroom = (is_texture)? device_texture_headroom:
+		                                device_working_headroom;
+
+		size_t total = 0, free = 0;
+		cuMemGetInfo(&free, &total);
+
+		/* Move textures to host memory if needed. */
+		if(!move_texture_to_host && !is_image && (size + headroom) >= free) {
+			move_textures_to_host(size + headroom - free, is_texture);
+			cuMemGetInfo(&free, &total);
+		}
+
+		/* Allocate in device memory. */
+		if(!move_texture_to_host && (size + headroom) < free) {
+			mem_alloc_result = cuMemAlloc(&device_pointer, size);
+			if(mem_alloc_result == CUDA_SUCCESS) {
+				status = " in device memory";
+			}
+		}
+
+		/* Fall back to mapped host memory if needed and possible. */
+		void *map_host_pointer = 0;
+		bool free_map_host = false;
+
+		if(mem_alloc_result != CUDA_SUCCESS && can_map_host &&
+		   map_host_used + size < map_host_limit) {
+			if(mem.shared_pointer) {
+				/* Another device already allocated host memory. */
+				mem_alloc_result = CUDA_SUCCESS;
+				map_host_pointer = mem.shared_pointer;
+			}
+			else {
+				/* Allocate host memory ourselves. */
+				mem_alloc_result = cuMemHostAlloc(&map_host_pointer, size,
+				                                  CU_MEMHOSTALLOC_DEVICEMAP |
+				                                  CU_MEMHOSTALLOC_WRITECOMBINED);
+				mem.shared_pointer = map_host_pointer;
+				free_map_host = true;
+			}
+
+			if(mem_alloc_result == CUDA_SUCCESS) {
+				cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0));
+				map_host_used += size;
+				status = " in host memory";
+
+				/* Replace host pointer with our host allocation. Only works if
+				 * CUDA memory layout is the same and has no pitch padding. Also
+				 * does not work if we move textures to host during a render,
+				 * since other devices might be using the memory. */
+				if(!move_texture_to_host && pitch_padding == 0 &&
+				   mem.host_pointer && mem.host_pointer != mem.shared_pointer) {
+					memcpy(mem.shared_pointer, mem.host_pointer, size);
+					mem.host_free();
+					mem.host_pointer = mem.shared_pointer;
+				}
+			}
+			else {
+				status = " failed, out of host memory";
+			}
+		}
+		else if(mem_alloc_result != CUDA_SUCCESS) {
+			status = " failed, out of device and host memory";
+		}
+
+		if(mem_alloc_result != CUDA_SUCCESS) {
+			cuda_assert(mem_alloc_result);
+		}
+
+		if(mem.name) {
+			VLOG(1) << "Buffer allocate: " << mem.name << ", "
+					<< string_human_readable_number(mem.memory_size()) << " bytes. ("
+					<< string_human_readable_size(mem.memory_size()) << ")"
+					<< status;
+		}
+
 		mem.device_pointer = (device_ptr)device_pointer;
 		mem.device_size = size;
 		stats.mem_alloc(size);
-		cuda_pop_context();
+
+		if(!mem.device_pointer) {
+			return NULL;
+		}
+
+		/* Insert into map of allocations. */
+		CUDAMem *cmem = &cuda_mem_map[&mem];
+		cmem->map_host_pointer = map_host_pointer;
+		cmem->free_map_host = free_map_host;
+		return cmem;
+	}
+
+	void generic_copy_to(device_memory& mem)
+	{
+		if(mem.host_pointer && mem.device_pointer) {
+			CUDAContextScope scope(this);
+
+			if(mem.host_pointer != mem.shared_pointer) {
+				cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer),
+				                         mem.host_pointer,
+				                         mem.memory_size()));
+			}
+		}
+	}
+
+	void generic_free(device_memory& mem)
+	{
+		if(mem.device_pointer) {
+			CUDAContextScope scope(this);
+			const CUDAMem& cmem = cuda_mem_map[&mem];
+
+			if(cmem.map_host_pointer) {
+				/* Free host memory. */
+				if(cmem.free_map_host) {
+					cuMemFreeHost(cmem.map_host_pointer);
+					if(mem.host_pointer == mem.shared_pointer) {
+						mem.host_pointer = 0;
+					}
+					mem.shared_pointer = 0;
+				}
+
+				map_host_used -= mem.device_size;
+			}
+			else {
+				/* Free device memory. */
+				cuMemFree(mem.device_pointer);
+			}
+
+			stats.mem_free(mem.device_size);
+			mem.device_pointer = 0;
+			mem.device_size = 0;
+
+			cuda_mem_map.erase(cuda_mem_map.find(&mem));
+		}
+	}
+
+	void mem_alloc(device_memory& mem)
+	{
+		if(mem.type == MEM_PIXELS && !background) {
+			pixels_alloc(mem);
+		}
+		else if(mem.type == MEM_TEXTURE) {
+			assert(!"mem_alloc not supported for textures.");
+		}
+		else {
+			generic_alloc(mem);
+		}
 	}
 
 	void mem_copy_to(device_memory& mem)
 	{
-		cuda_push_context();
-		if(mem.device_pointer)
-			cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()));
-		cuda_pop_context();
+		if(mem.type == MEM_PIXELS) {
+			assert(!"mem_copy_to not supported for pixels.");
+		}
+		else if(mem.type == MEM_TEXTURE) {
+			tex_free(mem);
+			tex_alloc(mem);
+		}
+		else {
+			if(!mem.device_pointer) {
+				generic_alloc(mem);
+			}
+
+			generic_copy_to(mem);
+		}
 	}
 
 	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
 	{
-		size_t offset = elem*y*w;
-		size_t size = elem*w*h;
-
-		cuda_push_context();
-		if(mem.device_pointer) {
-			cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
-			                         (CUdeviceptr)(mem.device_pointer + offset), size));
+		if(mem.type == MEM_PIXELS && !background) {
+			pixels_copy_from(mem, y, w, h);
+		}
+		else if(mem.type == MEM_TEXTURE) {
+			assert(!"mem_copy_from not supported for textures.");
 		}
 		else {
-			memset((char*)mem.data_pointer + offset, 0, size);
+			CUDAContextScope scope(this);
+			size_t offset = elem*y*w;
+			size_t size = elem*w*h;
+
+			if(mem.host_pointer && mem.device_pointer) {
+				cuda_assert(cuMemcpyDtoH((uchar*)mem.host_pointer + offset,
+										 (CUdeviceptr)(mem.device_pointer + offset), size));
+			}
+			else if(mem.host_pointer) {
+				memset((char*)mem.host_pointer + offset, 0, size);
+			}
 		}
-		cuda_pop_context();
 	}
 
 	void mem_zero(device_memory& mem)
 	{
-		memset((void*)mem.data_pointer, 0, mem.memory_size());
+		if(!mem.device_pointer) {
+			mem_alloc(mem);
+		}
 
-		cuda_push_context();
-		if(mem.device_pointer)
+		if(mem.host_pointer) {
+			memset(mem.host_pointer, 0, mem.memory_size());
+		}
+
+		if(mem.device_pointer &&
+		   (!mem.host_pointer || mem.host_pointer != mem.shared_pointer)) {
+			CUDAContextScope scope(this);
 			cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
-		cuda_pop_context();
+		}
 	}
 
 	void mem_free(device_memory& mem)
 	{
-		if(mem.device_pointer) {
-			cuda_push_context();
-			cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)));
-			cuda_pop_context();
-
-			mem.device_pointer = 0;
-
-			stats.mem_free(mem.device_size);
-			mem.device_size = 0;
+		if(mem.type == MEM_PIXELS && !background) {
+			pixels_free(mem);
+		}
+		else if(mem.type == MEM_TEXTURE) {
+			tex_free(mem);
+		}
+		else {
+			generic_free(mem);
 		}
 	}
 
+	virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/)
+	{
+		return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
+	}
+
 	void const_copy_to(const char *name, void *host, size_t size)
 	{
+		CUDAContextScope scope(this);
 		CUdeviceptr mem;
 		size_t bytes;
 
-		cuda_push_context();
 		cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
 		//assert(bytes == size);
 		cuda_assert(cuMemcpyHtoD(mem, host, size));
-		cuda_pop_context();
 	}
 
-	void tex_alloc(const char *name,
-	               device_memory& mem,
-	               InterpolationType interpolation,
-	               ExtensionType extension)
+	void tex_alloc(device_memory& mem)
 	{
-		VLOG(1) << "Texture allocate: " << name << ", "
-		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
-		        << string_human_readable_size(mem.memory_size()) << ")";
-
-		/* Check if we are on sm_30 or above.
-		 * We use arrays and bindles textures for storage there */
-		bool has_bindless_textures = info.has_bindless_textures;
+		CUDAContextScope scope(this);
 
 		/* General variables for both architectures */
-		string bind_name = name;
+		string bind_name = mem.name;
 		size_t dsize = datatype_size(mem.data_type);
 		size_t size = mem.memory_size();
 
 		CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
-		switch(extension) {
+		switch(mem.extension) {
 			case EXTENSION_REPEAT:
 				address_mode = CU_TR_ADDRESS_MODE_WRAP;
 				break;
@@ -573,13 +1038,37 @@ public:
 		}
 
 		CUfilter_mode filter_mode;
-		if(interpolation == INTERPOLATION_CLOSEST) {
+		if(mem.interpolation == INTERPOLATION_CLOSEST) {
 			filter_mode = CU_TR_FILTER_MODE_POINT;
 		}
 		else {
 			filter_mode = CU_TR_FILTER_MODE_LINEAR;
 		}
 
+		/* Data Storage */
+		if(mem.interpolation == INTERPOLATION_NONE) {
+			generic_alloc(mem);
+			generic_copy_to(mem);
+
+			CUdeviceptr cumem;
+			size_t cubytes;
+
+			cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
+
+			if(cubytes == 8) {
+				/* 64 bit device pointer */
+				uint64_t ptr = mem.device_pointer;
+				cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
+			}
+			else {
+				/* 32 bit device pointer */
+				uint32_t ptr = (uint32_t)mem.device_pointer;
+				cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
+			}
+			return;
+		}
+
+		/* Image Texture Storage */
 		CUarray_format_enum format;
 		switch(mem.data_type) {
 			case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
@@ -590,300 +1079,647 @@ public:
 			default: assert(0); return;
 		}
 
-		/* General variables for Fermi */
-		CUtexref texref = NULL;
+		CUDAMem *cmem = NULL;
+		CUarray array_3d = NULL;
+		size_t src_pitch = mem.data_width * dsize * mem.data_elements;
+		size_t dst_pitch = src_pitch;
 
-		if(!has_bindless_textures) {
-			if(mem.data_depth > 1) {
-				/* Kernel uses different bind names for 2d and 3d float textures,
-				 * so we have to adjust couple of things here.
-				 */
-				vector<string> tokens;
-				string_split(tokens, name, "_");
-				bind_name = string_printf("__tex_image_%s_3d_%s",
-				                          tokens[2].c_str(),
-				                          tokens[3].c_str());
-			}
+		if(mem.data_depth > 1) {
+			/* 3D texture using array, there is no API for linear memory. */
+			CUDA_ARRAY3D_DESCRIPTOR desc;
 
-			cuda_push_context();
-			cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str()));
-			cuda_pop_context();
+			desc.Width = mem.data_width;
+			desc.Height = mem.data_height;
+			desc.Depth = mem.data_depth;
+			desc.Format = format;
+			desc.NumChannels = mem.data_elements;
+			desc.Flags = 0;
 
-			if(!texref) {
-				return;
-			}
-		}
+			VLOG(1) << "Array 3D allocate: " << mem.name << ", "
+			        << string_human_readable_number(mem.memory_size()) << " bytes. ("
+			        << string_human_readable_size(mem.memory_size()) << ")";
 
-		/* Data Storage */
-		if(interpolation == INTERPOLATION_NONE) {
-			if(has_bindless_textures) {
-				mem_alloc(mem, MEM_READ_ONLY);
-				mem_copy_to(mem);
+			cuda_assert(cuArray3DCreate(&array_3d, &desc));
 
-				cuda_push_context();
+			if(!array_3d) {
+				return;
+			}
 
-				CUdeviceptr cumem;
-				size_t cubytes;
+			CUDA_MEMCPY3D param;
+			memset(&param, 0, sizeof(param));
+			param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+			param.dstArray = array_3d;
+			param.srcMemoryType = CU_MEMORYTYPE_HOST;
+			param.srcHost = mem.host_pointer;
+			param.srcPitch = src_pitch;
+			param.WidthInBytes = param.srcPitch;
+			param.Height = mem.data_height;
+			param.Depth = mem.data_depth;
 
-				cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
+			cuda_assert(cuMemcpy3D(&param));
 
-				if(cubytes == 8) {
-					/* 64 bit device pointer */
-					uint64_t ptr = mem.device_pointer;
-					cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
-				}
-				else {
-					/* 32 bit device pointer */
-					uint32_t ptr = (uint32_t)mem.device_pointer;
-					cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
-				}
+			mem.device_pointer = (device_ptr)array_3d;
+			mem.device_size = size;
+			stats.mem_alloc(size);
 
-				cuda_pop_context();
+			cmem = &cuda_mem_map[&mem];
+			cmem->texobject = 0;
+			cmem->array = array_3d;
+		}
+		else if(mem.data_height > 0) {
+			/* 2D texture, using pitch aligned linear memory. */
+			int alignment = 0;
+			cuda_assert(cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
+			dst_pitch = align_up(src_pitch, alignment);
+			size_t dst_size = dst_pitch * mem.data_height;
+
+			cmem = generic_alloc(mem, dst_size - mem.memory_size());
+			if(!cmem) {
+				return;
 			}
-			else {
-				mem_alloc(mem, MEM_READ_ONLY);
-				mem_copy_to(mem);
 
-				cuda_push_context();
+			CUDA_MEMCPY2D param;
+			memset(&param, 0, sizeof(param));
+			param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+			param.dstDevice = mem.device_pointer;
+			param.dstPitch = dst_pitch;
+			param.srcMemoryType = CU_MEMORYTYPE_HOST;
+			param.srcHost = mem.host_pointer;
+			param.srcPitch = src_pitch;
+			param.WidthInBytes = param.srcPitch;
+			param.Height = mem.data_height;
+
+			cuda_assert(cuMemcpy2DUnaligned(&param));
+		}
+		else {
+			/* 1D texture, using linear memory. */
+			cmem = generic_alloc(mem);
+			if(!cmem) {
+				return;
+			}
 
-				cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size));
-				cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
-				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
+			cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
+		}
 
-				cuda_pop_context();
-			}
+		/* Kepler+, bindless textures. */
+		int flat_slot = 0;
+		if(string_startswith(mem.name, "__tex_image")) {
+			int pos =  string(mem.name).rfind("_");
+			flat_slot = atoi(mem.name + pos + 1);
 		}
-		/* Texture Storage */
 		else {
-			CUarray handle = NULL;
+			assert(0);
+		}
 
-			cuda_push_context();
+		CUDA_RESOURCE_DESC resDesc;
+		memset(&resDesc, 0, sizeof(resDesc));
 
-			if(mem.data_depth > 1) {
-				CUDA_ARRAY3D_DESCRIPTOR desc;
+		if(array_3d) {
+			resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+			resDesc.res.array.hArray = array_3d;
+			resDesc.flags = 0;
+		}
+		else if(mem.data_height > 0) {
+			resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
+			resDesc.res.pitch2D.devPtr = mem.device_pointer;
+			resDesc.res.pitch2D.format = format;
+			resDesc.res.pitch2D.numChannels = mem.data_elements;
+			resDesc.res.pitch2D.height = mem.data_height;
+			resDesc.res.pitch2D.width = mem.data_width;
+			resDesc.res.pitch2D.pitchInBytes = dst_pitch;
+		}
+		else {
+			resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
+			resDesc.res.linear.devPtr = mem.device_pointer;
+			resDesc.res.linear.format = format;
+			resDesc.res.linear.numChannels = mem.data_elements;
+			resDesc.res.linear.sizeInBytes = mem.device_size;
+		}
 
-				desc.Width = mem.data_width;
-				desc.Height = mem.data_height;
-				desc.Depth = mem.data_depth;
-				desc.Format = format;
-				desc.NumChannels = mem.data_elements;
-				desc.Flags = 0;
+		CUDA_TEXTURE_DESC texDesc;
+		memset(&texDesc, 0, sizeof(texDesc));
+		texDesc.addressMode[0] = address_mode;
+		texDesc.addressMode[1] = address_mode;
+		texDesc.addressMode[2] = address_mode;
+		texDesc.filterMode = filter_mode;
+		texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+		cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
+
+		/* Resize once */
+		if(flat_slot >= texture_info.size()) {
+			/* Allocate some slots in advance, to reduce amount
+			 * of re-allocations. */
+			texture_info.resize(flat_slot + 128);
+		}
 
-				cuda_assert(cuArray3DCreate(&handle, &desc));
-			}
-			else {
-				CUDA_ARRAY_DESCRIPTOR desc;
+		/* Set Mapping and tag that we need to (re-)upload to device */
+		TextureInfo& info = texture_info[flat_slot];
+		info.data = (uint64_t)cmem->texobject;
+		info.cl_buffer = 0;
+		info.interpolation = mem.interpolation;
+		info.extension = mem.extension;
+		info.width = mem.data_width;
+		info.height = mem.data_height;
+		info.depth = mem.data_depth;
+		need_texture_info = true;
+	}
 
-				desc.Width = mem.data_width;
-				desc.Height = mem.data_height;
-				desc.Format = format;
-				desc.NumChannels = mem.data_elements;
+	void tex_free(device_memory& mem)
+	{ /* Release the device-side resources tracked for mem in cuda_mem_map. */
+		if(mem.device_pointer) { /* Nothing to do when there is no device pointer. */
+			CUDAContextScope scope(this);
+			const CUDAMem& cmem = cuda_mem_map[&mem]; /* Record holding the texture object and/or array for mem. */
 
-				cuda_assert(cuArrayCreate(&handle, &desc));
+			if(cmem.texobject) {
+				/* Destroy the bindless texture object stored for this memory. */
+				cuTexObjectDestroy(cmem.texobject);
 			}
 
-			if(!handle) {
-				cuda_pop_context();
-				return;
-			}
+			if(cmem.array) {
+				/* Destroy the CUDA array, update the memory stats and reset mem. */
+				cuArrayDestroy(cmem.array);
+				stats.mem_free(mem.device_size);
+				mem.device_pointer = 0;
+				mem.device_size = 0;
 
-			/* Allocate 3D, 2D or 1D memory */
-			if(mem.data_depth > 1) {
-				CUDA_MEMCPY3D param;
-				memset(&param, 0, sizeof(param));
-				param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-				param.dstArray = handle;
-				param.srcMemoryType = CU_MEMORYTYPE_HOST;
-				param.srcHost = (void*)mem.data_pointer;
-				param.srcPitch = mem.data_width*dsize*mem.data_elements;
-				param.WidthInBytes = param.srcPitch;
-				param.Height = mem.data_height;
-				param.Depth = mem.data_depth;
-
-				cuda_assert(cuMemcpy3D(&param));
+				cuda_mem_map.erase(cuda_mem_map.find(&mem)); /* cmem must not be used after this line. */
 			}
-			else if(mem.data_height > 1) {
-				CUDA_MEMCPY2D param;
-				memset(&param, 0, sizeof(param));
-				param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-				param.dstArray = handle;
-				param.srcMemoryType = CU_MEMORYTYPE_HOST;
-				param.srcHost = (void*)mem.data_pointer;
-				param.srcPitch = mem.data_width*dsize*mem.data_elements;
-				param.WidthInBytes = param.srcPitch;
-				param.Height = mem.data_height;
-
-				cuda_assert(cuMemcpy2D(&param));
+			else {
+				generic_free(mem); /* No CUDA array recorded: hand the memory to generic_free(). */
 			}
-			else
-				cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size));
+		}
+	}
 
-			/* Fermi and Kepler */
-			mem.device_pointer = (device_ptr)handle;
-			mem.device_size = size;
+	bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
+	{ /* Store the tile buffer pointers in the task's TilesInfo and upload it; returns false on device error. */
+		TilesInfo *tiles = (TilesInfo*) task->tiles_mem.host_pointer; /* Host-side copy, written below. */
+		for(int i = 0; i < 9; i++) { /* buffers must hold at least 9 entries (presumably a 3x3 tile neighborhood - confirm). */
+			tiles->buffers[i] = buffers[i];
+		}
 
-			stats.mem_alloc(size);
+		task->tiles_mem.copy_to_device();
 
-			/* Bindless Textures - Kepler */
-			if(has_bindless_textures) {
-				int flat_slot = 0;
-				if(string_startswith(name, "__tex_image")) {
-					int pos =  string(name).rfind("_");
-					flat_slot = atoi(name + pos + 1);
-				}
-				else {
-					assert(0);
-				}
+		return !have_error();
+	}
 
-				CUDA_RESOURCE_DESC resDesc;
-				memset(&resDesc, 0, sizeof(resDesc));
-				resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
-				resDesc.res.array.hArray = handle;
-				resDesc.flags = 0;
-
-				CUDA_TEXTURE_DESC texDesc;
-				memset(&texDesc, 0, sizeof(texDesc));
-				texDesc.addressMode[0] = address_mode;
-				texDesc.addressMode[1] = address_mode;
-				texDesc.addressMode[2] = address_mode;
-				texDesc.filterMode = filter_mode;
-				texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-
-				CUtexObject tex = 0;
-				cuda_assert(cuTexObjectCreate(&tex, &resDesc, &texDesc, NULL));
-
-				/* Safety check */
-				if((uint)tex > UINT_MAX) {
-					assert(0);
-				}
+#define CUDA_GET_BLOCKSIZE(func, w, h)                                                                          \
+			int threads_per_block;                                                                              \
+			cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+			int threads = (int)sqrt((float)threads_per_block);                                                  \
+			int xblocks = ((w) + threads - 1)/threads;                                                          \
+			int yblocks = ((h) + threads - 1)/threads; /* Declares threads_per_block, threads, xblocks and yblocks in the caller's scope: square blocks, grid rounded up to cover w x h. */
+
+#define CUDA_LAUNCH_KERNEL(func, args)                      \
+			cuda_assert(cuLaunchKernel(func,                \
+			                           xblocks, yblocks, 1, \
+			                           threads, threads, 1, \
+			                           0, 0, args, 0)); /* Relies on xblocks/yblocks/threads declared by CUDA_GET_BLOCKSIZE in the same scope. */
+
+/* Same as above, but for 1-dimensional blocks: threads_per_block threads each, in a grid of ceil(w / threads_per_block) by h blocks. */
+#define CUDA_GET_BLOCKSIZE_1D(func, w, h)                                                                       \
+			int threads_per_block;                                                                              \
+			cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+			int xblocks = ((w) + threads_per_block - 1)/threads_per_block;                                      \
+			int yblocks = h; /* One grid row per unit of h; h is not parenthesized, so pass a simple expression. */
+
+#define CUDA_LAUNCH_KERNEL_1D(func, args)                       \
+			cuda_assert(cuLaunchKernel(func,                    \
+			                           xblocks, yblocks, 1,     \
+			                           threads_per_block, 1, 1, \
+			                           0, 0, args, 0)); /* Relies on xblocks/yblocks/threads_per_block declared by CUDA_GET_BLOCKSIZE_1D. */
+
+	bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
+	                               DenoisingTask *task)
+	{ /* Runs the NLM filter kernels from cuFilterModule: guide_ptr/variance_ptr feed the difference pass, image_ptr is filtered into out_ptr. */
+		if(have_error())
+			return false;
 
-				/* Resize once */
-				if(flat_slot >= bindless_mapping.size()) {
-					/* Allocate some slots in advance, to reduce amount
-					 * of re-allocations.
-					 */
-					bindless_mapping.resize(flat_slot + 128);
-				}
+		CUDAContextScope scope(this);
 
-				/* Set Mapping and tag that we need to (re-)upload to device */
-				bindless_mapping.get_data()[flat_slot] = (uint)tex;
-				tex_bindless_map[mem.device_pointer] = (uint)tex;
-				need_bindless_mapping = true;
-			}
-			/* Regular Textures - Fermi */
-			else {
-				cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
-				cuda_assert(cuTexRefSetFilterMode(texref, filter_mode));
-				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
-			}
+		int stride = task->buffer.stride;
+		int w = task->buffer.width;
+		int h = task->buffer.h;
+		int r = task->nlm_state.r;
+		int f = task->nlm_state.f;
+		float a = task->nlm_state.a;
+		float k_2 = task->nlm_state.k_2;
 
-			cuda_pop_context();
-		}
+		int shift_stride = stride*h; /* Elements in one per-shift slice. */
+		int num_shifts = (2*r+1)*(2*r+1); /* (2r+1)^2 shifts. */
+		int mem_size = sizeof(float)*shift_stride*num_shifts; /* Bytes for one float buffer covering all shifts. */
+		int channel_offset = 0;
 
-		/* Fermi, Data and Image Textures */
-		if(!has_bindless_textures) {
-			cuda_push_context();
+		device_only_memory<uchar> temporary_mem(this, "Denoising temporary_mem");
+		temporary_mem.alloc_to_device(2*mem_size); /* Two buffers back to back: difference and blurDifference. */
 
-			cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
-			cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
-			if(mem.data_depth > 1) {
-				cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode));
-			}
+		if(have_error())
+			return false;
+
+		CUdeviceptr difference     = cuda_device_ptr(temporary_mem.device_pointer);
+		CUdeviceptr blurDifference = difference + mem_size; /* Second half of temporary_mem. */
 
-			cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
+		CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr;
+		cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*shift_stride)); /* Zero the weight accumulator... */
+		cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*shift_stride)); /* ...and the output buffer. */
 
-			cuda_pop_context();
+		{
+			CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
+			cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+			cuda_assert(cuModuleGetFunction(&cuNLMBlur,           cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+			cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,     cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+			cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput,   cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
+
+			cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+			cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,           CU_FUNC_CACHE_PREFER_L1));
+			cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,     CU_FUNC_CACHE_PREFER_L1));
+			cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput,   CU_FUNC_CACHE_PREFER_L1));
+
+			CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts); /* Launch sizes reused by all five 1D launches below. */
+
+			void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &channel_offset, &a, &k_2};
+			void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f};
+			void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f};
+			void *update_output_args[]   = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &shift_stride, &r, &f};
+
+			CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+			CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+			CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+			CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); /* Second blur launch with the same arguments. */
+			CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
 		}
 
-		/* Fermi and Kepler */
-		tex_interp_map[mem.device_pointer] = (interpolation != INTERPOLATION_NONE);
+		temporary_mem.free();
+
+		{
+			CUfunction cuNLMNormalize;
+			cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+			cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+			void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
+			CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h); /* 2D launch over w x h; own scope, so the names do not clash with the 1D ones above. */
+			CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
+			cuda_assert(cuCtxSynchronize()); /* Wait for the launched kernels to finish. */
+		}
+
+		return !have_error();
 	}
 
-	void tex_free(device_memory& mem)
+	bool denoising_construct_transform(DenoisingTask *task)
 	{
-		if(mem.device_pointer) {
-			if(tex_interp_map[mem.device_pointer]) {
-				cuda_push_context();
-				cuArrayDestroy((CUarray)mem.device_pointer);
-				cuda_pop_context();
-
-				/* Free CUtexObject (Bindless Textures) */
-				if(info.has_bindless_textures && tex_bindless_map[mem.device_pointer]) {
-					uint flat_slot = tex_bindless_map[mem.device_pointer];
-					cuTexObjectDestroy(flat_slot);
-				}
+		if(have_error()) /* Do nothing if the device is already in an error state. */
+			return false;
 
-				tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
-				mem.device_pointer = 0;
+		CUDAContextScope scope(this);
+
+		CUfunction cuFilterConstructTransform; /* Launches "kernel_cuda_filter_construct_transform" over task->storage.w x task->storage.h. */
+		cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
+		CUDA_GET_BLOCKSIZE(cuFilterConstructTransform,
+		                   task->storage.w,
+		                   task->storage.h);
+
+		void *args[] = {&task->buffer.mem.device_pointer,
+		                &task->storage.transform.device_pointer,
+		                &task->storage.rank.device_pointer,
+		                &task->filter_area,
+		                &task->rect,
+		                &task->radius,
+		                &task->pca_threshold,
+		                &task->buffer.pass_stride};
+		CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
+		cuda_assert(cuCtxSynchronize()); /* Wait for the kernel to finish before reporting the result. */
 
-				stats.mem_free(mem.device_size);
-				mem.device_size = 0;
-			}
-			else {
-				tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
-				mem_free(mem);
-			}
+		return !have_error();
+	}
+
+	bool denoising_reconstruct(device_ptr color_ptr,
+	                           device_ptr color_variance_ptr,
+	                           device_ptr output_ptr,
+	                           DenoisingTask *task)
+	{ /* Runs the NLM difference/blur/weight kernels, then the gramian and finalize kernels, writing to output_ptr. */
+		if(have_error())
+			return false;
+
+		CUDAContextScope scope(this);
+
+		mem_zero(task->storage.XtWX); /* Cleared before the gramian kernel (presumably it accumulates into them - confirm). */
+		mem_zero(task->storage.XtWY);
+
+		int r = task->radius;
+		int f = 4; /* f and a are fixed here, unlike denoising_non_local_means() which reads them from task->nlm_state. */
+		float a = 1.0f;
+		float k_2 = task->nlm_k_2;
+
+		int w = task->reconstruction_state.source_w;
+		int h = task->reconstruction_state.source_h;
+		int stride = task->buffer.stride;
+
+		int shift_stride = stride*h; /* Elements in one per-shift slice. */
+		int num_shifts = (2*r+1)*(2*r+1); /* (2r+1)^2 shifts. */
+		int mem_size = sizeof(float)*shift_stride*num_shifts; /* Bytes for one float buffer covering all shifts. */
+
+		device_only_memory<uchar> temporary_mem(this, "Denoising temporary_mem");
+		temporary_mem.alloc_to_device(2*mem_size); /* Two buffers back to back: difference and blurDifference. */
+
+		if(have_error())
+			return false;
+
+		CUdeviceptr difference     = cuda_device_ptr(temporary_mem.device_pointer);
+		CUdeviceptr blurDifference = difference + mem_size; /* Second half of temporary_mem. */
+
+		{
+			CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
+			cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference,   cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+			cuda_assert(cuModuleGetFunction(&cuNLMBlur,             cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+			cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,       cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+			cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+
+			cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference,   CU_FUNC_CACHE_PREFER_L1));
+			cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,             CU_FUNC_CACHE_PREFER_L1));
+			cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,       CU_FUNC_CACHE_PREFER_L1));
+			cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+
+			CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
+			                     task->reconstruction_state.source_w * task->reconstruction_state.source_h,
+			                     num_shifts);
+
+			void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &task->buffer.pass_stride, &a, &k_2};
+			void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f};
+			void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f};
+			void *construct_gramian_args[] = {&blurDifference,
+			                                  &task->buffer.mem.device_pointer,
+			                                  &task->storage.transform.device_pointer,
+			                                  &task->storage.rank.device_pointer,
+			                                  &task->storage.XtWX.device_pointer,
+			                                  &task->storage.XtWY.device_pointer,
+			                                  &task->reconstruction_state.filter_window,
+			                                  &w, &h, &stride,
+			                                  &shift_stride, &r,
+			                                  &f,
+		                                      &task->buffer.pass_stride};
+
+			CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+			CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+			CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+			CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); /* Second blur launch with the same arguments. */
+			CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
 		}
+
+		temporary_mem.free();
+
+		{
+			CUfunction cuFinalize;
+			cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+			cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+			void *finalize_args[] = {&output_ptr,
+					                 &task->storage.rank.device_pointer,
+					                 &task->storage.XtWX.device_pointer,
+					                 &task->storage.XtWY.device_pointer,
+					                 &task->filter_area,
+					                 &task->reconstruction_state.buffer_params.x,
+					                 &task->render_buffer.samples};
+			CUDA_GET_BLOCKSIZE(cuFinalize,
+			                   task->reconstruction_state.source_w,
+			                   task->reconstruction_state.source_h);
+			CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+		}
+
+		cuda_assert(cuCtxSynchronize()); /* Wait for all launched kernels to finish. */
+
+		return !have_error();
 	}
 
-	void path_trace(RenderTile& rtile, int sample, bool branched)
+	bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
+	                              device_ptr mean_ptr, device_ptr variance_ptr,
+	                              int r, int4 rect, DenoisingTask *task)
 	{
 		if(have_error())
-			return;
+			return false;
+
+		CUDAContextScope scope(this);
+
+		CUfunction cuFilterCombineHalves; /* Launches "kernel_cuda_filter_combine_halves" with a_ptr/b_ptr as inputs and mean_ptr/variance_ptr as outputs (per the argument order below). */
+		cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuFilterCombineHalves,
+		                   task->rect.z-task->rect.x,
+		                   task->rect.w-task->rect.y);
+
+		void *args[] = {&mean_ptr,
+		                &variance_ptr,
+		                &a_ptr,
+		                &b_ptr,
+		                &rect, /* NOTE(review): the grid is sized from task->rect but the kernel receives this rect argument - confirm rect never exceeds task->rect. */
+		                &r};
+		CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
+		cuda_assert(cuCtxSynchronize()); /* Wait for the kernel to finish before reporting the result. */
+
+		return !have_error();
+	}
+
+	bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
+	                             device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
+	                             device_ptr buffer_variance_ptr, DenoisingTask *task)
+	{ /* Launches "kernel_cuda_filter_divide_shadow" on a 2D grid covering task->rect; returns false on device error. */
+		if(have_error())
+			return false;
+
+		CUDAContextScope scope(this);
+
+		CUfunction cuFilterDivideShadow;
+		cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuFilterDivideShadow,
+		                   task->rect.z-task->rect.x,
+		                   task->rect.w-task->rect.y);
+
+		void *args[] = {&task->render_buffer.samples,
+		                &task->tiles_mem.device_pointer, /* Tile info uploaded by denoising_set_tiles(). */
+		                &a_ptr,
+		                &b_ptr,
+		                &sample_variance_ptr,
+		                &sv_variance_ptr,
+		                &buffer_variance_ptr,
+		                &task->rect,
+		                &task->render_buffer.pass_stride,
+		                &task->render_buffer.denoising_data_offset};
+		CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
+		cuda_assert(cuCtxSynchronize()); /* Wait for the kernel to finish before reporting the result. */
 
-		cuda_push_context();
+		return !have_error();
+	}
+
+	bool denoising_get_feature(int mean_offset,
+	                           int variance_offset,
+	                           device_ptr mean_ptr,
+	                           device_ptr variance_ptr,
+	                           DenoisingTask *task)
+	{ /* Launches "kernel_cuda_filter_get_feature" on a 2D grid covering task->rect; returns false on device error. */
+		if(have_error())
+			return false;
+
+		CUDAContextScope scope(this);
+
+		CUfunction cuFilterGetFeature;
+		cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuFilterGetFeature,
+		                   task->rect.z-task->rect.x,
+		                   task->rect.w-task->rect.y);
+
+		void *args[] = {&task->render_buffer.samples,
+		                &task->tiles_mem.device_pointer, /* Tile info uploaded by denoising_set_tiles(). */
+		                &mean_offset,
+		                &variance_offset,
+		                &mean_ptr,
+		                &variance_ptr,
+		                &task->rect,
+		                &task->render_buffer.pass_stride,
+		                &task->render_buffer.denoising_data_offset};
+		CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
+		cuda_assert(cuCtxSynchronize()); /* Wait for the kernel to finish before reporting the result. */
+
+		return !have_error();
+	}
 
+	bool denoising_detect_outliers(device_ptr image_ptr,
+	                               device_ptr variance_ptr,
+	                               device_ptr depth_ptr,
+	                               device_ptr output_ptr,
+	                               DenoisingTask *task)
+	{ /* Launches "kernel_cuda_filter_detect_outliers" on a 2D grid covering task->rect; returns false on device error. */
+		if(have_error())
+			return false;
+
+		CUDAContextScope scope(this);
+
+		CUfunction cuFilterDetectOutliers;
+		cuda_assert(cuModuleGetFunction(&cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuFilterDetectOutliers,
+		                   task->rect.z-task->rect.x,
+		                   task->rect.w-task->rect.y);
+
+		void *args[] = {&image_ptr,
+		                &variance_ptr,
+		                &depth_ptr,
+		                &output_ptr,
+		                &task->rect,
+		                &task->buffer.pass_stride};
+
+		CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
+		cuda_assert(cuCtxSynchronize()); /* Wait for the kernel to finish before reporting the result. */
+
+		return !have_error();
+	}
+
+	void denoise(RenderTile &rtile, DenoisingTask& denoising, const DeviceTask &task)
+	{ /* Binds this device's denoising_* methods into the task, maps the neighbor tiles, then runs the denoiser. */
+		denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
+		denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
+		denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+		denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+		denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+		denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+		denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
+		denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising);
+
+		denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); /* Note: stores width/height in the last two components, not max x/y. */
+		denoising.render_buffer.samples = rtile.sample;
+
+		RenderTile rtiles[9]; /* Slot 4 is the tile itself; the others are presumably filled by map_neighbor_tiles() - confirm. */
+		rtiles[4] = rtile;
+		task.map_neighbor_tiles(rtiles, this);
+		denoising.tiles_from_rendertiles(rtiles);
+
+		denoising.init_from_devicetask(task);
+
+		denoising.run_denoising();
+
+		task.unmap_neighbor_tiles(rtiles, this); /* Pairs with map_neighbor_tiles() above. */
+	}
+
+	void path_trace(DeviceTask& task, RenderTile& rtile, device_vector<WorkTile>& work_tiles)
+	{ /* Renders all samples of rtile in batches of step_samples, using work_tiles as the upload buffer for the single WorkTile. */
+		scoped_timer timer(&rtile.buffers->render_time); /* NOTE(review): presumably records this call's duration into render_time - confirm scoped_timer. */
+
+		if(have_error())
+			return;
+
+		CUDAContextScope scope(this);
 		CUfunction cuPathTrace;
-		CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
-		CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state);
 
-		/* get kernel function */
-		if(branched) {
+		/* Get kernel function. */
+		if(task.integrator_branched) {
 			cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
 		}
 		else {
 			cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
 		}
 
-		if(have_error())
+		if(have_error()) {
 			return;
+		}
 
-		/* pass in parameters */
-		void *args[] = {&d_buffer,
-		                &d_rng_state,
-		                &sample,
-		                &rtile.x,
-		                &rtile.y,
-		                &rtile.w,
-		                &rtile.h,
-		                &rtile.offset,
-		                &rtile.stride};
+		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
 
-		/* launch kernel */
-		int threads_per_block;
-		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace));
+		/* Allocate work tile. */
+		work_tiles.alloc(1);
+
+		WorkTile *wtile = work_tiles.data(); /* Host-side tile description, uploaded before every launch. */
+		wtile->x = rtile.x;
+		wtile->y = rtile.y;
+		wtile->w = rtile.w;
+		wtile->h = rtile.h;
+		wtile->offset = rtile.offset;
+		wtile->stride = rtile.stride;
+		wtile->buffer = (float*)cuda_device_ptr(rtile.buffer);
+
+		/* Prepare work size. More step samples render faster, but for now we
+		 * remain conservative for GPUs connected to a display to avoid driver
+		 * timeouts and display freezing. */
+		int min_blocks, num_threads_per_block;
+		cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+		if(!info.display_device) {
+			min_blocks *= 8; /* Larger batches when the GPU does not drive a display. */
+		}
 
-		/*int num_registers;
-		cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace));
+		uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); /* Samples per launch; NOTE(review): divides by w*h, so an empty tile must not reach here - confirm. */
 
-		printf("threads_per_block %d\n", threads_per_block);
-		printf("num_registers %d\n", num_registers);*/
+		/* Render all samples. */
+		int start_sample = rtile.start_sample;
+		int end_sample = rtile.start_sample + rtile.num_samples;
 
-		int xthreads = (int)sqrt(threads_per_block);
-		int ythreads = (int)sqrt(threads_per_block);
-		int xblocks = (rtile.w + xthreads - 1)/xthreads;
-		int yblocks = (rtile.h + ythreads - 1)/ythreads;
+		for(int sample = start_sample; sample < end_sample; sample += step_samples) {
+			/* Setup and copy work tile to device. */
+			wtile->start_sample = sample;
+			wtile->num_samples = min(step_samples, end_sample - sample);; /* The last batch is clamped to the remaining samples. */
+			work_tiles.copy_to_device();
 
-		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+			CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
+			uint total_work_size = wtile->w * wtile->h * wtile->num_samples; /* One work item per pixel per sample. */
+			uint num_blocks = divide_up(total_work_size, num_threads_per_block);
 
-		cuda_assert(cuLaunchKernel(cuPathTrace,
-		                           xblocks , yblocks, 1, /* blocks */
-		                           xthreads, ythreads, 1, /* threads */
-		                           0, 0, args, 0));
+			/* Launch kernel. */
+			void *args[] = {&d_work_tiles,
+			                &total_work_size};
 
-		cuda_assert(cuCtxSynchronize());
+			cuda_assert(cuLaunchKernel(cuPathTrace,
+			                           num_blocks, 1, 1,
+			                           num_threads_per_block, 1, 1,
+			                           0, 0, args, 0));
+
+			cuda_assert(cuCtxSynchronize()); /* Wait for this batch before reporting progress. */
 
-		cuda_pop_context();
+			/* Update progress. */
+			rtile.sample = sample + wtile->num_samples;
+			task.update_progress(&rtile, rtile.w*rtile.h*wtile->num_samples);
+
+			if(task.get_cancel()) { /* Stop between batches on cancel, unless the queue must be finished. */
+				if(task.need_finish_queue == false)
+					break;
+			}
+		}
 	}
 
 	void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
@@ -891,7 +1727,7 @@ public:
 		if(have_error())
 			return;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuFilmConvert;
 		CUdeviceptr d_rgba = map_pixels((rgba_byte)? rgba_byte: rgba_half);
@@ -937,7 +1773,7 @@ public:
 
 		unmap_pixels((rgba_byte)? rgba_byte: rgba_half);
 
-		cuda_pop_context();
+		cuda_assert(cuCtxSynchronize());
 	}
 
 	void shader(DeviceTask& task)
@@ -945,19 +1781,21 @@ public:
 		if(have_error())
 			return;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuShader;
 		CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
 		CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
-		CUdeviceptr d_output_luma = cuda_device_ptr(task.shader_output_luma);
 
 		/* get kernel function */
 		if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
 			cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
 		}
+		else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) {
+			cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
+		}
 		else {
-			cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader"));
+			cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
 		}
 
 		/* do tasks in smaller chunks, so we can cancel it */
@@ -976,9 +1814,6 @@ public:
 				int arg = 0;
 				args[arg++] = &d_input;
 				args[arg++] = &d_output;
-				if(task.shader_eval_type < SHADER_EVAL_BAKE) {
-					args[arg++] = &d_output_luma;
-				}
 				args[arg++] = &task.shader_eval_type;
 				if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
 					args[arg++] = &task.shader_filter;
@@ -1010,8 +1845,6 @@ public:
 
 			task.update_progress(NULL);
 		}
-
-		cuda_pop_context();
 	}
 
 	CUdeviceptr map_pixels(device_ptr mem)
@@ -1041,117 +1874,95 @@ public:
 
 	void pixels_alloc(device_memory& mem)
 	{
-		if(!background) {
-			PixelMem pmem;
+		PixelMem pmem; /* GL buffer/texture ids and the CUDA graphics resource for this pixel buffer. */
 
-			pmem.w = mem.data_width;
-			pmem.h = mem.data_height;
+		pmem.w = mem.data_width;
+		pmem.h = mem.data_height;
 
-			cuda_push_context();
-
-			glGenBuffers(1, &pmem.cuPBO);
-			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
-			if(mem.data_type == TYPE_HALF)
-				glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW);
-			else
-				glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW);
-
-			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
-			glGenTextures(1, &pmem.cuTexId);
-			glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
-			if(mem.data_type == TYPE_HALF)
-				glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
-			else
-				glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
-			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-			glBindTexture(GL_TEXTURE_2D, 0);
+		CUDAContextScope scope(this);
 
-			CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
+		glGenBuffers(1, &pmem.cuPBO); /* Pixel buffer object sized for w*h RGBA pixels (half or 8-bit per channel). */
+		glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+		if(mem.data_type == TYPE_HALF)
+			glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW);
+		else
+			glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW);
 
-			if(result == CUDA_SUCCESS) {
-				cuda_pop_context();
+		glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
 
-				mem.device_pointer = pmem.cuTexId;
-				pixel_mem_map[mem.device_pointer] = pmem;
+		glGenTextures(1, &pmem.cuTexId); /* Matching 2D texture with nearest filtering. */
+		glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
+		if(mem.data_type == TYPE_HALF)
+			glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
+		else
+			glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+		glBindTexture(GL_TEXTURE_2D, 0);
 
-				mem.device_size = mem.memory_size();
-				stats.mem_alloc(mem.device_size);
+		CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
 
-				return;
-			}
-			else {
-				/* failed to register buffer, fallback to no interop */
-				glDeleteBuffers(1, &pmem.cuPBO);
-				glDeleteTextures(1, &pmem.cuTexId);
+		if(result == CUDA_SUCCESS) {
+			mem.device_pointer = pmem.cuTexId; /* The GL texture id doubles as the device pointer and as the pixel_mem_map key. */
+			pixel_mem_map[mem.device_pointer] = pmem;
 
-				cuda_pop_context();
+			mem.device_size = mem.memory_size();
+			stats.mem_alloc(mem.device_size);
 
-				background = true;
-			}
+			return;
 		}
+		else {
+			/* Failed to register the buffer: delete the GL objects and fall back to no interop. */
+			glDeleteBuffers(1, &pmem.cuPBO);
+			glDeleteTextures(1, &pmem.cuTexId);
 
-		Device::pixels_alloc(mem);
+			background = true; /* NOTE(review): mem is left unallocated on this path - confirm the caller provides the fallback. */
+		}
 	}
 
 	void pixels_copy_from(device_memory& mem, int y, int w, int h)
 	{
-		if(!background) {
-			PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
-			cuda_push_context();
-
-			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
-			uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
-			size_t offset = sizeof(uchar)*4*y*w;
-			memcpy((uchar*)mem.data_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h);
-			glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
-			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
-			cuda_pop_context();
+		PixelMem pmem = pixel_mem_map[mem.device_pointer]; /* Copies h rows starting at row y from the mapped GL pixel buffer into mem.host_pointer. */
 
-			return;
-		}
+		CUDAContextScope scope(this);
 
-		Device::pixels_copy_from(mem, y, w, h);
+		glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+		uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY); /* NOTE(review): the result is not checked for NULL before the memcpy. */
+		size_t offset = sizeof(uchar)*4*y*w; /* Byte offset of row y; NOTE(review): assumes 4 x uchar pixels although pixels_alloc() also creates TYPE_HALF buffers - confirm. */
+		memcpy((uchar*)mem.host_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h);
+		glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
+		glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
 	}
 
 	void pixels_free(device_memory& mem)
 	{
 		if(mem.device_pointer) {
-			if(!background) {
-				PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
-				cuda_push_context();
-
-				cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
-				glDeleteBuffers(1, &pmem.cuPBO);
-				glDeleteTextures(1, &pmem.cuTexId);
-
-				cuda_pop_context();
+			PixelMem pmem = pixel_mem_map[mem.device_pointer]; /* Copy of the entry, so it stays valid after the erase below. */
 
-				pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
-				mem.device_pointer = 0;
+			CUDAContextScope scope(this);
 
-				stats.mem_free(mem.device_size);
-				mem.device_size = 0;
+			cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource)); /* Unregister from CUDA before deleting the GL objects. */
+			glDeleteBuffers(1, &pmem.cuPBO);
+			glDeleteTextures(1, &pmem.cuTexId);
 
-				return;
-			}
+			pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
+			mem.device_pointer = 0;
 
-			Device::pixels_free(mem);
+			stats.mem_free(mem.device_size); /* Mirrors the stats.mem_alloc() done in pixels_alloc(). */
+			mem.device_size = 0;
 		}
 	}
 
 	void draw_pixels(device_memory& mem, int y, int w, int h, int dx, int dy, int width, int height, bool transparent,
 		const DeviceDrawParams &draw_params)
 	{
+		assert(mem.type == MEM_PIXELS);
+
 		if(!background) {
 			PixelMem pmem = pixel_mem_map[mem.device_pointer];
 			float *vpointer;
 
-			cuda_push_context();
+			CUDAContextScope scope(this);
 
 			/* for multi devices, this assumes the inefficient method that we allocate
 			 * all pixels on the device even though we only render to a subset */
@@ -1240,8 +2051,6 @@ public:
 			glBindTexture(GL_TEXTURE_2D, 0);
 			glDisable(GL_TEXTURE_2D);
 
-			cuda_pop_context();
-
 			return;
 		}
 
@@ -1250,44 +2059,55 @@ public:
 
 	void thread_run(DeviceTask *task)
 	{
-		if(task->type == DeviceTask::PATH_TRACE) {
-			RenderTile tile;
-
-			bool branched = task->integrator_branched;
+		CUDAContextScope scope(this);
+
+		if(task->type == DeviceTask::RENDER) {
+			DeviceRequestedFeatures requested_features;
+			if(use_split_kernel()) {
+				if(split_kernel == NULL) {
+					split_kernel = new CUDASplitKernel(this);
+					split_kernel->load_kernels(requested_features);
+				}
+			}
 
-			/* Upload Bindless Mapping */
-			load_bindless_mapping();
+			device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
 
 			/* keep rendering tiles until done */
-			while(task->acquire_tile(this, tile)) {
-				int start_sample = tile.start_sample;
-				int end_sample = tile.start_sample + tile.num_samples;
+			RenderTile tile;
+			DenoisingTask denoising(this);
 
-				for(int sample = start_sample; sample < end_sample; sample++) {
-					if(task->get_cancel()) {
-						if(task->need_finish_queue == false)
-							break;
+			while(task->acquire_tile(this, tile)) {
+				if(tile.task == RenderTile::PATH_TRACE) {
+					if(use_split_kernel()) {
+						device_only_memory<uchar> void_buffer(this, "void_buffer");
+						split_kernel->path_trace(task, tile, void_buffer, void_buffer);
 					}
+					else {
+						path_trace(*task, tile, work_tiles);
+					}
+				}
+				else if(tile.task == RenderTile::DENOISE) {
+					tile.sample = tile.start_sample + tile.num_samples;
 
-					path_trace(tile, sample, branched);
-
-					tile.sample = sample + 1;
+					denoise(tile, denoising, *task);
 
 					task->update_progress(&tile, tile.w*tile.h);
 				}
 
 				task->release_tile(tile);
+
+				if(task->get_cancel()) {
+					if(task->need_finish_queue == false)
+						break;
+				}
 			}
+
+			work_tiles.free();
 		}
 		else if(task->type == DeviceTask::SHADER) {
-			/* Upload Bindless Mapping */
-			load_bindless_mapping();
-
 			shader(*task);
 
-			cuda_push_context();
 			cuda_assert(cuCtxSynchronize());
-			cuda_pop_context();
 		}
 	}
 
@@ -1307,13 +2127,17 @@ public:
 
 	void task_add(DeviceTask& task)
 	{
+		CUDAContextScope scope(this);
+
+		/* Load texture info. */
+		load_texture_info();
+
+		/* Synchronize all memory copies before executing task. */
+		cuda_assert(cuCtxSynchronize());
+
 		if(task.type == DeviceTask::FILM_CONVERT) {
 			/* must be done in main thread due to opengl access */
 			film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
-
-			cuda_push_context();
-			cuda_assert(cuCtxSynchronize());
-			cuda_pop_context();
 		}
 		else {
 			task_pool.push(new CUDADeviceTask(this, task));
@@ -1329,8 +2153,237 @@ public:
 	{
 		task_pool.cancel();
 	}
+
+	friend class CUDASplitKernelFunction;
+	friend class CUDASplitKernel;
+	friend class CUDAContextScope;
 };
 
+/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
+ * now that the definition of that class is complete
+ */
+#undef cuda_assert
+#define cuda_assert(stmt) \
+	{ \
+		CUresult result = stmt; \
+		\
+		if(result != CUDA_SUCCESS) { \
+			string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+			if(device->error_msg == "") \
+				device->error_msg = message; \
+			fprintf(stderr, "%s\n", message.c_str()); \
+			/*cuda_abort();*/ \
+			device->cuda_error_documentation(); \
+		} \
+	} (void)0
+
+
+/* CUDA context scope. */
+
+CUDAContextScope::CUDAContextScope(CUDADevice *device)
+: device(device)
+{
+	cuda_assert(cuCtxPushCurrent(device->cuContext));
+}
+
+CUDAContextScope::~CUDAContextScope()
+{
+	cuda_assert(cuCtxPopCurrent(NULL));
+}
+
+/* split kernel */
+
+class CUDASplitKernelFunction : public SplitKernelFunction{
+	CUDADevice* device;
+	CUfunction func;
+public:
+	CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {}
+
+	/* enqueue the kernel, returns false if there is an error */
+	bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/)
+	{
+		return enqueue(dim, NULL);
+	}
+
+	/* enqueue the kernel, returns false if there is an error */
+	bool enqueue(const KernelDimensions &dim, void *args[])
+	{
+		if(device->have_error())
+			return false;
+
+		CUDAContextScope scope(device);
+
+		/* we ignore dim.local_size for now, as this is faster */
+		int threads_per_block;
+		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
+
+		int xblocks = (dim.global_size[0]*dim.global_size[1] + threads_per_block - 1)/threads_per_block;
+
+		cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
+
+		cuda_assert(cuLaunchKernel(func,
+		                           xblocks, 1, 1, /* blocks */
+		                           threads_per_block, 1, 1, /* threads */
+		                           0, 0, args, 0));
+
+		return !device->have_error();
+	}
+};
+
+CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
+{
+}
+
+uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
+{
+	CUDAContextScope scope(device);
+
+	device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
+	size_buffer.alloc(1);
+	size_buffer.zero_to_device();
+
+	uint threads = num_threads;
+	CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
+
+	struct args_t {
+		uint* num_threads;
+		CUdeviceptr* size;
+	};
+
+	args_t args = {
+		&threads,
+		&d_size
+	};
+
+	CUfunction state_buffer_size;
+	cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
+
+	cuda_assert(cuLaunchKernel(state_buffer_size,
+	                           1, 1, 1,
+	                           1, 1, 1,
+	                           0, 0, (void**)&args, 0));
+
+	size_buffer.copy_from_device(0, 1, 1);
+	size_t size = size_buffer[0];
+	size_buffer.free();
+
+	return size;
+}
+
+bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
+                                    RenderTile& rtile,
+                                    int num_global_elements,
+                                    device_memory& /*kernel_globals*/,
+                                    device_memory& /*kernel_data*/,
+                                    device_memory& split_data,
+                                    device_memory& ray_state,
+                                    device_memory& queue_index,
+                                    device_memory& use_queues_flag,
+                                    device_memory& work_pool_wgs)
+{
+	CUDAContextScope scope(device);
+
+	CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
+	CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
+	CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
+	CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
+	CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
+
+	CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
+
+	int end_sample = rtile.start_sample + rtile.num_samples;
+	int queue_size = dim.global_size[0] * dim.global_size[1];
+
+	struct args_t {
+		CUdeviceptr* split_data_buffer;
+		int* num_elements;
+		CUdeviceptr* ray_state;
+		int* start_sample;
+		int* end_sample;
+		int* sx;
+		int* sy;
+		int* sw;
+		int* sh;
+		int* offset;
+		int* stride;
+		CUdeviceptr* queue_index;
+		int* queuesize;
+		CUdeviceptr* use_queues_flag;
+		CUdeviceptr* work_pool_wgs;
+		int* num_samples;
+		CUdeviceptr* buffer;
+	};
+
+	args_t args = {
+		&d_split_data,
+		&num_global_elements,
+		&d_ray_state,
+		&rtile.start_sample,
+		&end_sample,
+		&rtile.x,
+		&rtile.y,
+		&rtile.w,
+		&rtile.h,
+		&rtile.offset,
+		&rtile.stride,
+		&d_queue_index,
+		&queue_size,
+		&d_use_queues_flag,
+		&d_work_pool_wgs,
+		&rtile.num_samples,
+		&d_buffer
+	};
+
+	CUfunction data_init;
+	cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
+	if(device->have_error()) {
+		return false;
+	}
+
+	CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args);
+
+	return !device->have_error();
+}
+
+SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(const string& kernel_name,
+                                                                const DeviceRequestedFeatures&)
+{
+	CUDAContextScope scope(device);
+	CUfunction func;
+
+	cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
+	if(device->have_error()) {
+		device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
+		return NULL;
+	}
+
+	return new CUDASplitKernelFunction(device, func);
+}
+
+int2 CUDASplitKernel::split_kernel_local_size()
+{
+	return make_int2(32, 1);
+}
+
+int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
+{
+	CUDAContextScope scope(device);
+	size_t free;
+	size_t total;
+
+	cuda_assert(cuMemGetInfo(&free, &total));
+
+	VLOG(1) << "Maximum device allocation size: "
+	        << string_human_readable_number(free) << " bytes. ("
+	        << string_human_readable_size(free) << ").";
+
+	size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
+	size_t side = round_down((int)sqrt(num_elements), 32);
+	int2 global_size = make_int2(side, round_down(num_elements / side, 16));
+	VLOG(1) << "Global size: " << global_size << ".";
+	return global_size;
+}
+
 bool device_cuda_init(void)
 {
 #ifdef WITH_CUDA_DYNLOAD
@@ -1341,7 +2394,7 @@ bool device_cuda_init(void)
 		return result;
 
 	initialized = true;
-	int cuew_result = cuewInit();
+	int cuew_result = cuewInit(CUEW_INIT_CUDA);
 	if(cuew_result == CUEW_SUCCESS) {
 		VLOG(1) << "CUEW initialization succeeded";
 		if(CUDADevice::have_precompiled_kernels()) {
@@ -1377,18 +2430,34 @@ Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background)
 	return new CUDADevice(info, stats, background);
 }
 
-void device_cuda_info(vector<DeviceInfo>& devices)
+static CUresult device_cuda_safe_init()
 {
-	CUresult result;
-	int count = 0;
+#ifdef _WIN32
+	__try {
+		return cuInit(0);
+	}
+	__except(EXCEPTION_EXECUTE_HANDLER) {
+		/* Ignore crashes inside the CUDA driver and hope we can
+		 * survive even with corrupted CUDA installs. */
+		fprintf(stderr, "Cycles CUDA: driver crashed, continuing without CUDA.\n");
+	}
+
+	return CUDA_ERROR_NO_DEVICE;
+#else
+	return cuInit(0);
+#endif
+}
 
-	result = cuInit(0);
+void device_cuda_info(vector<DeviceInfo>& devices)
+{
+	CUresult result = device_cuda_safe_init();
 	if(result != CUDA_SUCCESS) {
 		if(result != CUDA_ERROR_NO_DEVICE)
 			fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result));
 		return;
 	}
 
+	int count = 0;
 	result = cuDeviceGetCount(&count);
 	if(result != CUDA_SUCCESS) {
 		fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result));
@@ -1399,14 +2468,18 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 
 	for(int num = 0; num < count; num++) {
 		char name[256];
-		int attr;
 
-		if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS)
+		result = cuDeviceGetName(name, 256, num);
+		if(result != CUDA_SUCCESS) {
+			fprintf(stderr, "CUDA cuDeviceGetName: %s\n", cuewErrorString(result));
 			continue;
+		}
 
 		int major;
 		cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, num);
-		if(major < 2) {
+		if(major < 3) {
+			VLOG(1) << "Ignoring device \"" << name
+			        << "\", this graphics card is no longer supported.";
 			continue;
 		}
 
@@ -1416,9 +2489,10 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 		info.description = string(name);
 		info.num = num;
 
-		info.advanced_shading = (major >= 2);
-		info.has_bindless_textures = (major >= 3);
-		info.pack_images = false;
+		info.advanced_shading = (major >= 3);
+		info.has_half_images = (major >= 3);
+		info.has_volume_decoupled = false;
+		info.bvh_layout_mask = BVH_LAYOUT_BVH2;
 
 		int pci_location[3] = {0, 0, 0};
 		cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
@@ -1430,14 +2504,23 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 		                        (unsigned int)pci_location[1],
 		                        (unsigned int)pci_location[2]);
 
-		/* if device has a kernel timeout, assume it is used for display */
-		if(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS && attr == 1) {
+		/* If device has a kernel timeout and no compute preemption, we assume
+		 * it is connected to a display and will freeze the display while doing
+		 * computations. */
+		int timeout_attr = 0, preempt_attr = 0;
+		cuDeviceGetAttribute(&timeout_attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num);
+		cuDeviceGetAttribute(&preempt_attr, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, num);
+
+		if(timeout_attr && !preempt_attr) {
+			VLOG(1) << "Device is recognized as display.";
 			info.description += " (Display)";
 			info.display_device = true;
 			display_devices.push_back(info);
 		}
-		else
+		else {
 			devices.push_back(info);
+		}
+		VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\".";
 	}
 
 	if(!display_devices.empty())
@@ -1446,7 +2529,7 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 
 string device_cuda_capabilities(void)
 {
-	CUresult result = cuInit(0);
+	CUresult result = device_cuda_safe_init();
 	if(result != CUDA_SUCCESS) {
 		if(result != CUDA_ERROR_NO_DEVICE) {
 			return string("Error initializing CUDA: ") + cuewErrorString(result);
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
new file mode 100644
index 00000000000..644cf6cd10e
--- /dev/null
+++ b/intern/cycles/device/device_denoising.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_denoising.h"
+
+#include "kernel/filter/filter_defines.h"
+
+CCL_NAMESPACE_BEGIN
+
+DenoisingTask::DenoisingTask(Device *device)
+: tiles_mem(device, "denoising tiles_mem", MEM_READ_WRITE),
+  storage(device),
+  buffer(device),
+  device(device)
+{
+}
+
+DenoisingTask::~DenoisingTask()
+{
+	storage.XtWX.free();
+	storage.XtWY.free();
+	storage.transform.free();
+	storage.rank.free();
+	storage.temporary_1.free();
+	storage.temporary_2.free();
+	storage.temporary_color.free();
+	buffer.mem.free();
+	tiles_mem.free();
+}
+
+void DenoisingTask::init_from_devicetask(const DeviceTask &task)
+{
+	radius = task.denoising_radius;
+	nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising_strength));
+	if(task.denoising_relative_pca) {
+		pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising_feature_strength));
+	}
+	else {
+		pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength));
+	}
+
+	render_buffer.pass_stride = task.pass_stride;
+	render_buffer.denoising_data_offset  = task.pass_denoising_data;
+	render_buffer.denoising_clean_offset = task.pass_denoising_clean;
+
+	/* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */
+	rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w);
+	rect = rect_expand(rect, radius);
+	rect = rect_clip(rect, make_int4(tiles->x[0], tiles->y[0], tiles->x[3], tiles->y[3]));
+}
+
+void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles)
+{
+	tiles = (TilesInfo*) tiles_mem.alloc(sizeof(TilesInfo)/sizeof(int));
+
+	device_ptr buffers[9];
+	for(int i = 0; i < 9; i++) {
+		buffers[i] = rtiles[i].buffer;
+		tiles->offsets[i] = rtiles[i].offset;
+		tiles->strides[i] = rtiles[i].stride;
+	}
+	tiles->x[0] = rtiles[3].x;
+	tiles->x[1] = rtiles[4].x;
+	tiles->x[2] = rtiles[5].x;
+	tiles->x[3] = rtiles[5].x + rtiles[5].w;
+	tiles->y[0] = rtiles[1].y;
+	tiles->y[1] = rtiles[4].y;
+	tiles->y[2] = rtiles[7].y;
+	tiles->y[3] = rtiles[7].y + rtiles[7].h;
+
+	render_buffer.offset = rtiles[4].offset;
+	render_buffer.stride = rtiles[4].stride;
+	render_buffer.ptr    = rtiles[4].buffer;
+
+	functions.set_tiles(buffers);
+}
+
+bool DenoisingTask::run_denoising()
+{
+	/* Allocate denoising buffer. */
+	buffer.passes = 14;
+	buffer.width = rect.z - rect.x;
+	buffer.stride = align_up(buffer.width, 4);
+	buffer.h = rect.w - rect.y;
+	buffer.pass_stride = align_up(buffer.stride * buffer.h, divide_up(device->mem_sub_ptr_alignment(), sizeof(float)));
+	buffer.mem.alloc_to_device(buffer.pass_stride * buffer.passes, false);
+
+	device_ptr null_ptr = (device_ptr) 0;
+
+	/* Prefilter shadow feature. */
+	{
+		device_sub_ptr unfiltered_a   (buffer.mem, 0,                    buffer.pass_stride);
+		device_sub_ptr unfiltered_b   (buffer.mem, 1*buffer.pass_stride, buffer.pass_stride);
+		device_sub_ptr sample_var     (buffer.mem, 2*buffer.pass_stride, buffer.pass_stride);
+		device_sub_ptr sample_var_var (buffer.mem, 3*buffer.pass_stride, buffer.pass_stride);
+		device_sub_ptr buffer_var     (buffer.mem, 5*buffer.pass_stride, buffer.pass_stride);
+		device_sub_ptr filtered_var   (buffer.mem, 6*buffer.pass_stride, buffer.pass_stride);
+		device_sub_ptr nlm_temporary_1(buffer.mem, 7*buffer.pass_stride, buffer.pass_stride);
+		device_sub_ptr nlm_temporary_2(buffer.mem, 8*buffer.pass_stride, buffer.pass_stride);
+		device_sub_ptr nlm_temporary_3(buffer.mem, 9*buffer.pass_stride, buffer.pass_stride);
+
+		nlm_state.temporary_1_ptr = *nlm_temporary_1;
+		nlm_state.temporary_2_ptr = *nlm_temporary_2;
+		nlm_state.temporary_3_ptr = *nlm_temporary_3;
+
+		/* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
+		functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
+
+		/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
+		nlm_state.set_parameters(6, 3, 4.0f, 1.0f);
+		functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
+
+		/* Reuse memory, the previous data isn't needed anymore. */
+		device_ptr filtered_a = *buffer_var,
+		           filtered_b = *sample_var;
+		/* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
+		nlm_state.set_parameters(5, 3, 1.0f, 0.25f);
+		functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
+		functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
+
+		device_ptr residual_var = *sample_var_var;
+		/* Estimate the residual variance between the two filtered halves. */
+		functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect);
+
+		device_ptr final_a = *unfiltered_a,
+		           final_b = *unfiltered_b;
+		/* Use the residual variance for a second filter pass. */
+		nlm_state.set_parameters(4, 2, 1.0f, 0.5f);
+		functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
+		functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
+
+		/* Combine the two double-filtered halves to a final shadow feature. */
+		device_sub_ptr shadow_pass(buffer.mem, 4*buffer.pass_stride, buffer.pass_stride);
+		functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect);
+	}
+
+	/* Prefilter general features. */
+	{
+		device_sub_ptr unfiltered     (buffer.mem,  8*buffer.pass_stride, buffer.pass_stride);
+		device_sub_ptr variance       (buffer.mem,  9*buffer.pass_stride, buffer.pass_stride);
+		device_sub_ptr nlm_temporary_1(buffer.mem, 10*buffer.pass_stride, buffer.pass_stride);
+		device_sub_ptr nlm_temporary_2(buffer.mem, 11*buffer.pass_stride, buffer.pass_stride);
+		device_sub_ptr nlm_temporary_3(buffer.mem, 12*buffer.pass_stride, buffer.pass_stride);
+
+		nlm_state.temporary_1_ptr = *nlm_temporary_1;
+		nlm_state.temporary_2_ptr = *nlm_temporary_2;
+		nlm_state.temporary_3_ptr = *nlm_temporary_3;
+
+		int mean_from[]     = { 0, 1, 2, 12, 6,  7, 8 };
+		int variance_from[] = { 3, 4, 5, 13, 9, 10, 11};
+		int pass_to[]       = { 1, 2, 3, 0,  5,  6,  7};
+		for(int pass = 0; pass < 7; pass++) {
+			device_sub_ptr feature_pass(buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride);
+			/* Get the unfiltered pass and its variance from the RenderBuffers. */
+			functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance);
+			/* Smooth the pass and store the result in the denoising buffers. */
+			nlm_state.set_parameters(2, 2, 1.0f, 0.25f);
+			functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
+		}
+	}
+
+	/* Copy color passes. */
+	{
+		int mean_from[]     = {20, 21, 22};
+		int variance_from[] = {23, 24, 25};
+		int mean_to[]       = { 8,  9, 10};
+		int variance_to[]   = {11, 12, 13};
+		int num_color_passes = 3;
+
+		storage.temporary_color.alloc_to_device(3*buffer.pass_stride, false);
+
+		for(int pass = 0; pass < num_color_passes; pass++) {
+			device_sub_ptr color_pass(storage.temporary_color, pass*buffer.pass_stride, buffer.pass_stride);
+			device_sub_ptr color_var_pass(buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride);
+			functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass);
+		}
+
+		{
+			device_sub_ptr depth_pass    (buffer.mem,                                 0,   buffer.pass_stride);
+			device_sub_ptr color_var_pass(buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride);
+			device_sub_ptr output_pass   (buffer.mem,     mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride);
+			functions.detect_outliers(storage.temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass);
+		}
+	}
+
+	storage.w = filter_area.z;
+	storage.h = filter_area.w;
+	storage.transform.alloc_to_device(storage.w*storage.h*TRANSFORM_SIZE, false);
+	storage.rank.alloc_to_device(storage.w*storage.h, false);
+
+	functions.construct_transform();
+
+	device_only_memory<float> temporary_1(device, "Denoising NLM temporary 1");
+	device_only_memory<float> temporary_2(device, "Denoising NLM temporary 2");
+	temporary_1.alloc_to_device(buffer.pass_stride, false);
+	temporary_2.alloc_to_device(buffer.pass_stride, false);
+	reconstruction_state.temporary_1_ptr = temporary_1.device_pointer;
+	reconstruction_state.temporary_2_ptr = temporary_2.device_pointer;
+
+	storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false);
+	storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false);
+
+	reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
+	int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x;
+	reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset,
+	                                               render_buffer.stride,
+	                                               render_buffer.pass_stride,
+	                                               render_buffer.denoising_clean_offset);
+	reconstruction_state.source_w = rect.z-rect.x;
+	reconstruction_state.source_h = rect.w-rect.y;
+
+	{
+		device_sub_ptr color_ptr    (buffer.mem,  8*buffer.pass_stride, 3*buffer.pass_stride);
+		device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride);
+		functions.reconstruct(*color_ptr, *color_var_ptr, render_buffer.ptr);
+	}
+
+	return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
new file mode 100644
index 00000000000..77a82d0ad04
--- /dev/null
+++ b/intern/cycles/device/device_denoising.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __DEVICE_DENOISING_H__
+#define __DEVICE_DENOISING_H__
+
+#include "device/device.h"
+
+#include "render/buffers.h"
+
+#include "kernel/filter/filter_defines.h"
+
+CCL_NAMESPACE_BEGIN
+
+class DenoisingTask {
+public:
+	/* Parameters of the denoising algorithm. */
+	int radius;
+	float nlm_k_2;
+	float pca_threshold;
+
+	/* Pointer and parameters of the RenderBuffers. */
+	struct RenderBuffers {
+		int denoising_data_offset;
+		int denoising_clean_offset;
+		int pass_stride;
+		int offset;
+		int stride;
+		device_ptr ptr;
+		int samples;
+	} render_buffer;
+
+	TilesInfo *tiles;
+	device_vector<int> tiles_mem;
+	void tiles_from_rendertiles(RenderTile *rtiles);
+
+	int4 rect;
+	int4 filter_area;
+
+	struct DeviceFunctions {
+		function<bool(device_ptr image_ptr,    /* Contains the values that are smoothed. */
+		              device_ptr guide_ptr,    /* Contains the values that are used to calculate weights. */
+		              device_ptr variance_ptr, /* Contains the variance of the guide image. */
+		              device_ptr out_ptr       /* The filtered output is written into this image. */
+		              )> non_local_means;
+		function<bool(device_ptr color_ptr,
+		              device_ptr color_variance_ptr,
+		              device_ptr output_ptr
+		              )> reconstruct;
+		function<bool()> construct_transform;
+
+		function<bool(device_ptr a_ptr,
+		              device_ptr b_ptr,
+		              device_ptr mean_ptr,
+		              device_ptr variance_ptr,
+		              int r,
+		              int4 rect
+		              )> combine_halves;
+		function<bool(device_ptr a_ptr,
+		              device_ptr b_ptr,
+		              device_ptr sample_variance_ptr,
+		              device_ptr sv_variance_ptr,
+		              device_ptr buffer_variance_ptr
+		              )> divide_shadow;
+		function<bool(int mean_offset,
+		              int variance_offset,
+		              device_ptr mean_ptr,
+		              device_ptr variance_ptr
+		              )> get_feature;
+		function<bool(device_ptr image_ptr,
+		              device_ptr variance_ptr,
+		              device_ptr depth_ptr,
+		              device_ptr output_ptr
+		              )> detect_outliers;
+		function<bool(device_ptr*)> set_tiles;
+	} functions;
+
+	/* Stores state of the current Reconstruction operation,
+	 * which is accessed by the device in order to perform the operation. */
+	struct ReconstructionState {
+		device_ptr temporary_1_ptr; /* These two images are used as temporary storage. */
+		device_ptr temporary_2_ptr;
+
+		int4 filter_window;
+		int4 buffer_params;
+
+		int source_w;
+		int source_h;
+	} reconstruction_state;
+
+	/* Stores state of the current NLM operation,
+	 * which is accessed by the device in order to perform the operation. */
+	struct NLMState {
+		device_ptr temporary_1_ptr; /* These three images are used as temporary storage. */
+		device_ptr temporary_2_ptr;
+		device_ptr temporary_3_ptr;
+
+		int r;      /* Search radius of the filter. */
+		int f;      /* Patch size of the filter. */
+		float a;    /* Variance compensation factor in the MSE estimation. */
+		float k_2;  /* Squared value of the k parameter of the filter. */
+
+		void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_; k_2 = k_2_; }
+	} nlm_state;
+
+	struct Storage {
+		device_only_memory<float>  transform;
+		device_only_memory<int>    rank;
+		device_only_memory<float>  XtWX;
+		device_only_memory<float3> XtWY;
+		device_only_memory<float>  temporary_1;
+		device_only_memory<float>  temporary_2;
+		device_only_memory<float>  temporary_color;
+		int w;
+		int h;
+
+		Storage(Device *device)
+		: transform(device, "denoising transform"),
+		  rank(device, "denoising rank"),
+		  XtWX(device, "denoising XtWX"),
+		  XtWY(device, "denoising XtWY"),
+		  temporary_1(device, "denoising NLM temporary 1"),
+		  temporary_2(device, "denoising NLM temporary 2"),
+		  temporary_color(device, "denoising temporary color")
+		{}
+	} storage;
+
+	DenoisingTask(Device *device);
+	~DenoisingTask();
+
+	void init_from_devicetask(const DeviceTask &task);
+
+	bool run_denoising();
+
+	struct DenoiseBuffers {
+		int pass_stride;
+		int passes;
+		int stride;
+		int h;
+		int width;
+		device_only_memory<float> mem;
+
+		DenoiseBuffers(Device *device)
+		: mem(device, "denoising pixel buffer")
+	    {}
+	} buffer;
+
+protected:
+	Device *device;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __DEVICE_DENOISING_H__ */
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
new file mode 100644
index 00000000000..c6248fcf88b
--- /dev/null
+++ b/intern/cycles/device/device_memory.cpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device.h"
+#include "device/device_memory.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Device Memory */
+
+device_memory::device_memory(Device *device, const char *name, MemoryType type)
+: data_type(device_type_traits<uchar>::data_type),
+  data_elements(device_type_traits<uchar>::num_elements),
+  data_size(0),
+  device_size(0),
+  data_width(0),
+  data_height(0),
+  data_depth(0),
+  type(type),
+  name(name),
+  interpolation(INTERPOLATION_NONE),
+  extension(EXTENSION_REPEAT),
+  device(device),
+  device_pointer(0),
+  host_pointer(0),
+  shared_pointer(0)
+{
+}
+
+device_memory::~device_memory()
+{
+}
+
+void *device_memory::host_alloc(size_t size)
+{
+	if(!size) {
+		return 0;
+	}
+
+	void *ptr = util_aligned_malloc(size, MIN_ALIGNMENT_CPU_DATA_TYPES);
+
+	if(ptr) {
+		util_guarded_mem_alloc(size);
+	}
+	else {
+		throw std::bad_alloc();
+	}
+
+	return ptr;
+}
+
+void device_memory::host_free()
+{
+	if(host_pointer) {
+		util_guarded_mem_free(memory_size());
+		util_aligned_free((void*)host_pointer);
+		host_pointer = 0;
+	}
+}
+
+void device_memory::device_alloc()
+{
+	assert(!device_pointer && type != MEM_TEXTURE);
+	device->mem_alloc(*this);
+}
+
+void device_memory::device_free()
+{
+	if(device_pointer) {
+		device->mem_free(*this);
+	}
+}
+
+void device_memory::device_copy_to()
+{
+	if(host_pointer) {
+		device->mem_copy_to(*this);
+	}
+}
+
+void device_memory::device_copy_from(int y, int w, int h, int elem)
+{
+	assert(type != MEM_TEXTURE && type != MEM_READ_ONLY);
+	device->mem_copy_from(*this, y, w, h, elem);
+}
+
+void device_memory::device_zero()
+{
+	if(data_size) {
+		device->mem_zero(*this);
+	}
+}
+
+/* Device Sub Ptr */
+
+device_sub_ptr::device_sub_ptr(device_memory& mem, int offset, int size)
+: device(mem.device)
+{
+	ptr = device->mem_alloc_sub_ptr(mem, offset, size);
+}
+
+device_sub_ptr::~device_sub_ptr()
+{
+	device->mem_free_sub_ptr(ptr);
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 5b5b4dc6802..d8fe41e78bb 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -19,46 +19,47 @@
 
 /* Device Memory
  *
- * This file defines data types that can be used in device memory arrays, and
- * a device_vector<T> type to store such arrays.
- *
- * device_vector<T> contains an STL vector, metadata about the data type,
- * dimensions, elements, and a device pointer. For the CPU device this is just
- * a pointer to the STL vector data, as no copying needs to take place. For
- * other devices this is a pointer to device memory, where we will copy memory
- * to and from. */
+ * Data types for allocating, copying and freeing device memory. */
 
-#include "util_debug.h"
-#include "util_half.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_half.h"
+#include "util/util_texture.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
+class Device;
+
 enum MemoryType {
 	MEM_READ_ONLY,
-	MEM_WRITE_ONLY,
-	MEM_READ_WRITE
+	MEM_READ_WRITE,
+	MEM_DEVICE_ONLY,
+	MEM_TEXTURE,
+	MEM_PIXELS
 };
 
 /* Supported Data Types */
 
 enum DataType {
+	TYPE_UNKNOWN,
 	TYPE_UCHAR,
 	TYPE_UINT,
 	TYPE_INT,
 	TYPE_FLOAT,
-	TYPE_HALF
+	TYPE_HALF,
+	TYPE_UINT64,
 };
 
 static inline size_t datatype_size(DataType datatype) 
 {
 	switch(datatype) {
+		case TYPE_UNKNOWN: return 1;
 		case TYPE_UCHAR: return sizeof(uchar);
 		case TYPE_FLOAT: return sizeof(float);
 		case TYPE_UINT: return sizeof(uint);
 		case TYPE_INT: return sizeof(int);
 		case TYPE_HALF: return sizeof(half);
+		case TYPE_UINT64: return sizeof(uint64_t);
 		default: return 0;
 	}
 }
@@ -66,8 +67,8 @@ static inline size_t datatype_size(DataType datatype)
 /* Traits for data types */
 
 template<typename T> struct device_type_traits {
-	static const DataType data_type = TYPE_UCHAR;
-	static const int num_elements = 0;
+	static const DataType data_type = TYPE_UNKNOWN;
+	static const int num_elements = sizeof(T);
 };
 
 template<> struct device_type_traits<uchar> {
@@ -142,7 +143,7 @@ template<> struct device_type_traits<float2> {
 
 template<> struct device_type_traits<float3> {
 	static const DataType data_type = TYPE_FLOAT;
-	static const int num_elements = 3;
+	static const int num_elements = 4;
 };
 
 template<> struct device_type_traits<float4> {
@@ -160,126 +161,318 @@ template<> struct device_type_traits<half4> {
 	static const int num_elements = 4;
 };
 
-/* Device Memory */
+template<> struct device_type_traits<uint64_t> {
+	static const DataType data_type = TYPE_UINT64;
+	static const int num_elements = 1;
+};
+
+/* Device Memory
+ *
+ * Base class for all device memory. This should not be allocated directly,
+ * instead the appropriate subclass can be used. */
 
 class device_memory
 {
 public:
 	size_t memory_size() { return data_size*data_elements*datatype_size(data_type); }
+	size_t memory_elements_size(int elements) {
+		return elements*data_elements*datatype_size(data_type);
+	}
 
-	/* data information */
+	/* Data information. */
 	DataType data_type;
 	int data_elements;
-	device_ptr data_pointer;
 	size_t data_size;
 	size_t device_size;
 	size_t data_width;
 	size_t data_height;
 	size_t data_depth;
+	MemoryType type;
+	const char *name;
+	InterpolationType interpolation;
+	ExtensionType extension;
 
-	/* device pointer */
+	/* Pointers. */
+	Device *device;
 	device_ptr device_pointer;
+	void *host_pointer;
+	void *shared_pointer;
+
+	virtual ~device_memory();
 
 protected:
-	device_memory() {}
-	virtual ~device_memory() { assert(!device_pointer); }
+	friend class CUDADevice;
 
-	/* no copying */
+	/* Only create through subclasses. */
+	device_memory(Device *device, const char *name, MemoryType type);
+
+	/* No copying allowed. */
 	device_memory(const device_memory&);
 	device_memory& operator = (const device_memory&);
+
+	/* Host allocation on the device. All host_pointer memory should be
+	 * allocated with these functions, for devices that support using
+	 * the same pointer for host and device. */
+	void *host_alloc(size_t size);
+	void host_free();
+
+	/* Device memory allocation and copying. */
+	void device_alloc();
+	void device_free();
+	void device_copy_to();
+	void device_copy_from(int y, int w, int h, int elem);
+	void device_zero();
+};
+
+/* Device Only Memory
+ *
+ * Working memory only needed by the device, with no corresponding allocation
+ * on the host. Only used internally in the device implementations. */
+
+template<typename T>
+class device_only_memory : public device_memory
+{
+public:
+	device_only_memory(Device *device, const char *name)
+	: device_memory(device, name, MEM_DEVICE_ONLY)
+	{
+		data_type = device_type_traits<T>::data_type;
+		data_elements = max(device_type_traits<T>::num_elements, 1);
+	}
+
+	virtual ~device_only_memory()
+	{
+		free();
+	}
+
+	void alloc_to_device(size_t num, bool shrink_to_fit = true)
+	{
+		size_t new_size = num;
+		bool reallocate;
+
+		if(shrink_to_fit) {
+			reallocate = (data_size != new_size);
+		}
+		else {
+			reallocate = (data_size < new_size);
+		}
+
+		if(reallocate) {
+			device_free();
+			data_size = new_size;
+			device_alloc();
+		}
+	}
+
+	void free()
+	{
+		device_free();
+		data_size = 0;
+	}
+
+	void zero_to_device()
+	{
+		device_zero();
+	}
 };
 
-/* Device Vector */
+/* Device Vector
+ *
+ * Data vector to exchange data between host and device. Memory will be
+ * allocated on the host first with alloc() and resize(), and then filled
+ * in and copied to the device with copy_to_device(). Or alternatively
+ * allocated and set to zero on the device with zero_to_device().
+ *
+ * When using memory type MEM_TEXTURE, a pointer to this memory will be
+ * automatically attached to kernel globals, using the provided name
+ * matching an entry in kernel_textures.h. */
 
 template<typename T> class device_vector : public device_memory
 {
 public:
-	device_vector()
+	device_vector(Device *device, const char *name, MemoryType type)
+	: device_memory(device, name, type)
 	{
 		data_type = device_type_traits<T>::data_type;
 		data_elements = device_type_traits<T>::num_elements;
-		data_pointer = 0;
-		data_size = 0;
-		device_size = 0;
-		data_width = 0;
-		data_height = 0;
-		data_depth = 0;
 
 		assert(data_elements > 0);
-
-		device_pointer = 0;
 	}
 
-	virtual ~device_vector() {}
+	virtual ~device_vector()
+	{
+		free();
+	}
 
-	/* vector functions */
-	T *resize(size_t width, size_t height = 0, size_t depth = 0)
+	/* Host memory allocation. */
+	T *alloc(size_t width, size_t height = 0, size_t depth = 0)
 	{
-		data_size = width * ((height == 0)? 1: height) * ((depth == 0)? 1: depth);
-		if(data.resize(data_size) == NULL) {
-			clear();
-			return NULL;
+		size_t new_size = size(width, height, depth);
+
+		if(new_size != data_size) {
+			device_free();
+			host_free();
+			host_pointer = host_alloc(sizeof(T)*new_size);
+			assert(device_pointer == 0);
 		}
+
+		data_size = new_size;
 		data_width = width;
 		data_height = height;
 		data_depth = depth;
-		if(data_size == 0) {
-			data_pointer = 0;
-			return NULL;
-		}
-		data_pointer = (device_ptr)&data[0];
-		return &data[0];
-	}
 
-	T *copy(T *ptr, size_t width, size_t height = 0, size_t depth = 0)
-	{
-		T *mem = resize(width, height, depth);
-		if(mem != NULL) {
-			memcpy(mem, ptr, memory_size());
-		}
-		return mem;
+		return data();
 	}
 
-	void copy_at(T *ptr, size_t offset, size_t size)
+	/* Host memory resize. Only use this if the original data needs to be
+	 * preserved, it is faster to call alloc() if it can be discarded. */
+	T *resize(size_t width, size_t height = 0, size_t depth = 0)
 	{
-		if(size > 0) {
-			size_t mem_size = size*data_elements*datatype_size(data_type);
-			memcpy(&data[0] + offset, ptr, mem_size);
+		size_t new_size = size(width, height, depth);
+
+		if(new_size != data_size) {
+			void *new_ptr = host_alloc(sizeof(T)*new_size);
+
+			if(new_size && data_size) {
+				size_t min_size = ((new_size < data_size)? new_size: data_size);
+				memcpy((T*)new_ptr, (T*)host_pointer, sizeof(T)*min_size);
+			}
+
+			device_free();
+			host_free();
+			host_pointer = new_ptr;
+			assert(device_pointer == 0);
 		}
-	}
 
-	void reference(T *ptr, size_t width, size_t height = 0, size_t depth = 0)
-	{
-		data.clear();
-		data_size = width * ((height == 0)? 1: height) * ((depth == 0)? 1: depth);
-		data_pointer = (device_ptr)ptr;
+		data_size = new_size;
 		data_width = width;
 		data_height = height;
 		data_depth = depth;
+
+		return data();
 	}
 
-	void clear()
+	/* Take over data from an existing array. */
+	void steal_data(array<T>& from)
 	{
-		data.clear();
-		data_pointer = 0;
+		device_free();
+		host_free();
+
+		data_size = from.size();
 		data_width = 0;
 		data_height = 0;
 		data_depth = 0;
+		host_pointer = from.steal_pointer();
+		assert(device_pointer == 0);
+	}
+
+	/* Free device and host memory. */
+	void free()
+	{
+		device_free();
+		host_free();
+
 		data_size = 0;
+		data_width = 0;
+		data_height = 0;
+		data_depth = 0;
+		host_pointer = 0;
+		assert(device_pointer == 0);
 	}
 
 	size_t size()
 	{
-		return data.size();
+		return data_size;
+	}
+
+	T* data()
+	{
+		return (T*)host_pointer;
+	}
+
+	T& operator[](size_t i)
+	{
+		assert(i < data_size);
+		return data()[i];
+	}
+
+	void copy_to_device()
+	{
+		device_copy_to();
+	}
+
+	void copy_from_device(int y, int w, int h)
+	{
+		device_copy_from(y, w, h, sizeof(T));
+	}
+
+	void zero_to_device()
+	{
+		device_zero();
 	}
 
-	T* get_data()
+protected:
+	size_t size(size_t width, size_t height, size_t depth)
 	{
-		return &data[0];
+		return width * ((height == 0)? 1: height) * ((depth == 0)? 1: depth);
 	}
+};
+
+/* Pixel Memory
+ *
+ * Device memory to efficiently draw as pixels to the screen in interactive
+ * rendering. Only copying pixels from the device is supported, not copying to. */
+
+template<typename T> class device_pixels : public device_vector<T>
+{
+public:
+	device_pixels(Device *device, const char *name)
+	: device_vector<T>(device, name, MEM_PIXELS)
+	{
+	}
+
+	void alloc_to_device(size_t width, size_t height, size_t depth = 0)
+	{
+		device_vector<T>::alloc(width, height, depth);
+
+		if(!device_memory::device_pointer) {
+			device_memory::device_alloc();
+		}
+	}
+
+	T *copy_from_device(int y, int w, int h)
+	{
+		device_memory::device_copy_from(y, w, h, sizeof(T));
+		return device_vector<T>::data();
+	}
+};
+
+/* Device Sub Memory
+ *
+ * Pointer into existing memory. It is not allocated separately, but created
+ * from an already allocated base memory. It is freed automatically when it
+ * goes out of scope, which should happen before base memory is freed.
+ *
+ * Note: some devices require offset and size of the sub_ptr to be properly
+ * aligned to device->mem_address_alignment(). */
+
+class device_sub_ptr
+{
+public:
+	device_sub_ptr(device_memory& mem, int offset, int size);
+	~device_sub_ptr();
+
+	device_ptr operator*() const
+	{
+		return ptr;
+	}
+
+protected:
+	/* No copying. */
+	device_sub_ptr& operator = (const device_sub_ptr&);
 
-private:
-	array<T> data;
+	Device *device;
+	device_ptr ptr;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 61d78ee65de..3a4c08b6eb2 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -17,17 +17,17 @@
 #include <stdlib.h>
 #include <sstream>
 
-#include "device.h"
-#include "device_intern.h"
-#include "device_network.h"
+#include "device/device.h"
+#include "device/device_intern.h"
+#include "device/device_network.h"
 
-#include "buffers.h"
+#include "render/buffers.h"
 
-#include "util_foreach.h"
-#include "util_list.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_time.h"
+#include "util/util_foreach.h"
+#include "util/util_list.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -43,16 +43,22 @@ public:
 	};
 
 	list<SubDevice> devices;
-	device_ptr unique_ptr;
+	device_ptr unique_key;
 
 	MultiDevice(DeviceInfo& info, Stats &stats, bool background_)
-	: Device(info, stats, background_), unique_ptr(1)
+	: Device(info, stats, background_), unique_key(1)
 	{
-		Device *device;
-
 		foreach(DeviceInfo& subinfo, info.multi_devices) {
-			device = Device::create(subinfo, sub_stats_, background);
-			devices.push_back(SubDevice(device));
+			Device *device = Device::create(subinfo, sub_stats_, background);
+
+			/* Always add CPU devices at the back since GPU devices can change
+			 * host memory pointers, which CPU uses as device pointer. */
+			if(subinfo.type == DEVICE_CPU) {
+				devices.push_back(SubDevice(device));
+			}
+			else {
+				devices.push_front(SubDevice(device));
+			}
 		}
 
 #ifdef WITH_NETWORK
@@ -63,7 +69,7 @@ public:
 		vector<string> servers = discovery.get_server_list();
 
 		foreach(string& server, servers) {
-			device = device_network_create(info, stats, server.c_str());
+			Device *device = device_network_create(info, stats, server.c_str());
 			if(device)
 				devices.push_back(SubDevice(device));
 		}
@@ -106,71 +112,102 @@ public:
 		return true;
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType type)
+	void mem_alloc(device_memory& mem)
 	{
+		device_ptr key = unique_key++;
+
 		foreach(SubDevice& sub, devices) {
+			mem.device = sub.device;
 			mem.device_pointer = 0;
-			sub.device->mem_alloc(mem, type);
-			sub.ptr_map[unique_ptr] = mem.device_pointer;
+			mem.device_size = 0;
+
+			sub.device->mem_alloc(mem);
+			sub.ptr_map[key] = mem.device_pointer;
 		}
 
-		mem.device_pointer = unique_ptr++;
+		mem.device = this;
+		mem.device_pointer = key;
 		stats.mem_alloc(mem.device_size);
 	}
 
 	void mem_copy_to(device_memory& mem)
 	{
-		device_ptr tmp = mem.device_pointer;
+		device_ptr existing_key = mem.device_pointer;
+		device_ptr key = (existing_key)? existing_key: unique_key++;
+		size_t existing_size = mem.device_size;
 
 		foreach(SubDevice& sub, devices) {
-			mem.device_pointer = sub.ptr_map[tmp];
+			mem.device = sub.device;
+			mem.device_pointer = (existing_key)? sub.ptr_map[existing_key]: 0;
+			mem.device_size = existing_size;
+
 			sub.device->mem_copy_to(mem);
+			sub.ptr_map[key] = mem.device_pointer;
 		}
 
-		mem.device_pointer = tmp;
+		mem.device = this;
+		mem.device_pointer = key;
+		stats.mem_alloc(mem.device_size - existing_size);
 	}
 
 	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
 	{
-		device_ptr tmp = mem.device_pointer;
+		device_ptr key = mem.device_pointer;
 		int i = 0, sub_h = h/devices.size();
 
 		foreach(SubDevice& sub, devices) {
 			int sy = y + i*sub_h;
 			int sh = (i == (int)devices.size() - 1)? h - sub_h*i: sub_h;
 
-			mem.device_pointer = sub.ptr_map[tmp];
+			mem.device = sub.device;
+			mem.device_pointer = sub.ptr_map[key];
+
 			sub.device->mem_copy_from(mem, sy, w, sh, elem);
 			i++;
 		}
 
-		mem.device_pointer = tmp;
+		mem.device = this;
+		mem.device_pointer = key;
 	}
 
 	void mem_zero(device_memory& mem)
 	{
-		device_ptr tmp = mem.device_pointer;
+		device_ptr existing_key = mem.device_pointer;
+		device_ptr key = (existing_key)? existing_key: unique_key++;
+		size_t existing_size = mem.device_size;
 
 		foreach(SubDevice& sub, devices) {
-			mem.device_pointer = sub.ptr_map[tmp];
+			mem.device = sub.device;
+			mem.device_pointer = (existing_key)? sub.ptr_map[existing_key]: 0;
+			mem.device_size = existing_size;
+
 			sub.device->mem_zero(mem);
+			sub.ptr_map[key] = mem.device_pointer;
 		}
 
-		mem.device_pointer = tmp;
+		mem.device = this;
+		mem.device_pointer = key;
+		stats.mem_alloc(mem.device_size - existing_size);
 	}
 
 	void mem_free(device_memory& mem)
 	{
-		device_ptr tmp = mem.device_pointer;
-		stats.mem_free(mem.device_size);
+		device_ptr key = mem.device_pointer;
+		size_t existing_size = mem.device_size;
 
 		foreach(SubDevice& sub, devices) {
-			mem.device_pointer = sub.ptr_map[tmp];
+			mem.device = sub.device;
+			mem.device_pointer = sub.ptr_map[key];
+			mem.device_size = existing_size;
+
 			sub.device->mem_free(mem);
-			sub.ptr_map.erase(sub.ptr_map.find(tmp));
+			sub.ptr_map.erase(sub.ptr_map.find(key));
 		}
 
+		mem.device = this;
 		mem.device_pointer = 0;
+		mem.device_size = 0;
+		stats.mem_free(existing_size);
 	}
 
 	void const_copy_to(const char *name, void *host, size_t size)
@@ -179,85 +216,10 @@ public:
 			sub.device->const_copy_to(name, host, size);
 	}
 
-	void tex_alloc(const char *name,
-	               device_memory& mem,
-	               InterpolationType
-	               interpolation,
-	               ExtensionType extension)
-	{
-		VLOG(1) << "Texture allocate: " << name << ", "
-		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
-		        << string_human_readable_size(mem.memory_size()) << ")";
-
-		foreach(SubDevice& sub, devices) {
-			mem.device_pointer = 0;
-			sub.device->tex_alloc(name, mem, interpolation, extension);
-			sub.ptr_map[unique_ptr] = mem.device_pointer;
-		}
-
-		mem.device_pointer = unique_ptr++;
-		stats.mem_alloc(mem.device_size);
-	}
-
-	void tex_free(device_memory& mem)
-	{
-		device_ptr tmp = mem.device_pointer;
-		stats.mem_free(mem.device_size);
-
-		foreach(SubDevice& sub, devices) {
-			mem.device_pointer = sub.ptr_map[tmp];
-			sub.device->tex_free(mem);
-			sub.ptr_map.erase(sub.ptr_map.find(tmp));
-		}
-
-		mem.device_pointer = 0;
-	}
-
-	void pixels_alloc(device_memory& mem)
-	{
-		foreach(SubDevice& sub, devices) {
-			mem.device_pointer = 0;
-			sub.device->pixels_alloc(mem);
-			sub.ptr_map[unique_ptr] = mem.device_pointer;
-		}
-
-		mem.device_pointer = unique_ptr++;
-	}
-
-	void pixels_free(device_memory& mem)
-	{
-		device_ptr tmp = mem.device_pointer;
-
-		foreach(SubDevice& sub, devices) {
-			mem.device_pointer = sub.ptr_map[tmp];
-			sub.device->pixels_free(mem);
-			sub.ptr_map.erase(sub.ptr_map.find(tmp));
-		}
-
-		mem.device_pointer = 0;
-	}
-
-	void pixels_copy_from(device_memory& mem, int y, int w, int h)
-	{
-		device_ptr tmp = mem.device_pointer;
-		int i = 0, sub_h = h/devices.size();
-
-		foreach(SubDevice& sub, devices) {
-			int sy = y + i*sub_h;
-			int sh = (i == (int)devices.size() - 1)? h - sub_h*i: sub_h;
-
-			mem.device_pointer = sub.ptr_map[tmp];
-			sub.device->pixels_copy_from(mem, sy, w, sh);
-			i++;
-		}
-
-		mem.device_pointer = tmp;
-	}
-
 	void draw_pixels(device_memory& rgba, int y, int w, int h, int dx, int dy, int width, int height, bool transparent,
 		const DeviceDrawParams &draw_params)
 	{
-		device_ptr tmp = rgba.device_pointer;
+		device_ptr key = rgba.device_pointer;
 		int i = 0, sub_h = h/devices.size();
 		int sub_height = height/devices.size();
 
@@ -268,12 +230,12 @@ public:
 			int sdy = dy + i*sub_height;
 			/* adjust math for w/width */
 
-			rgba.device_pointer = sub.ptr_map[tmp];
+			rgba.device_pointer = sub.ptr_map[key];
 			sub.device->draw_pixels(rgba, sy, w, sh, dx, sdy, width, sheight, transparent, draw_params);
 			i++;
 		}
 
-		rgba.device_pointer = tmp;
+		rgba.device_pointer = key;
 	}
 
 	void map_tile(Device *sub_device, RenderTile& tile)
@@ -281,7 +243,6 @@ public:
 		foreach(SubDevice& sub, devices) {
 			if(sub.device == sub_device) {
 				if(tile.buffer) tile.buffer = sub.ptr_map[tile.buffer];
-				if(tile.rng_state) tile.rng_state = sub.ptr_map[tile.rng_state];
 			}
 		}
 	}
@@ -299,6 +260,81 @@ public:
 		return -1;
 	}
 
+	void map_neighbor_tiles(Device *sub_device, RenderTile *tiles)
+	{
+		for(int i = 0; i < 9; i++) {
+			if(!tiles[i].buffers) {
+				continue;
+			}
+
+			/* If the tile was rendered on another device, copy its memory to
+			 * the current device now, for the duration of the denoising task.
+			 * Note that this temporarily modifies the RenderBuffers and calls
+			 * the device, so this function is not thread safe. */
+			device_vector<float> &mem = tiles[i].buffers->buffer;
+			if(mem.device != sub_device) {
+				/* Only copy from device to host once. This is faster, but
+				 * also required for the case where a CPU thread is denoising
+				 * a tile rendered on the GPU. In that case we have to avoid
+				 * overwriting the buffer being denoised by the CPU thread. */
+				if(!tiles[i].buffers->map_neighbor_copied) {
+					tiles[i].buffers->map_neighbor_copied = true;
+					mem.copy_from_device(0, mem.data_size, 1);
+				}
+
+				Device *original_device = mem.device;
+				device_ptr original_ptr = mem.device_pointer;
+				size_t original_size = mem.device_size;
+
+				mem.device = sub_device;
+				mem.device_pointer = 0;
+				mem.device_size = 0;
+
+				mem.copy_to_device();
+				tiles[i].buffer = mem.device_pointer;
+
+				mem.device = original_device;
+				mem.device_pointer = original_ptr;
+				mem.device_size = original_size;
+			}
+		}
+	}
+
+	void unmap_neighbor_tiles(Device * sub_device, RenderTile * tiles)
+	{
+		for(int i = 0; i < 9; i++) {
+			if(!tiles[i].buffers) {
+				continue;
+			}
+
+			device_vector<float> &mem = tiles[i].buffers->buffer;
+			if(mem.device != sub_device) {
+				Device *original_device = mem.device;
+				device_ptr original_ptr = mem.device_pointer;
+				size_t original_size = mem.device_size;
+
+				mem.device = sub_device;
+				mem.device_pointer = tiles[i].buffer;
+
+				/* Copy denoised tile to the host. */
+				if(i == 4) {
+					mem.copy_from_device(0, mem.data_size, 1);
+				}
+
+				sub_device->mem_free(mem);
+
+				mem.device = original_device;
+				mem.device_pointer = original_ptr;
+				mem.device_size = original_size;
+
+				/* Copy denoised tile to the original device. */
+				if(i == 4) {
+					mem.copy_to_device();
+				}
+			}
+		}
+	}
+
 	int get_split_task_count(DeviceTask& task)
 	{
 		int total_tasks = 0;
@@ -330,7 +366,6 @@ public:
 				if(task.rgba_half) subtask.rgba_half = sub.ptr_map[task.rgba_half];
 				if(task.shader_input) subtask.shader_input = sub.ptr_map[task.shader_input];
 				if(task.shader_output) subtask.shader_output = sub.ptr_map[task.shader_output];
-				if(task.shader_output_luma) subtask.shader_output_luma = sub.ptr_map[task.shader_output_luma];
 
 				sub.device->task_add(subtask);
 			}
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index 53eef6cf199..5ad4405366e 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "device_intern.h"
-#include "device_network.h"
+#include "device/device.h"
+#include "device/device_intern.h"
+#include "device/device_network.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
 
 #if defined(WITH_NETWORK)
 
@@ -87,16 +87,20 @@ public:
 		snd.write();
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType type)
+	void mem_alloc(device_memory& mem)
 	{
+		if(mem.name) {
+			VLOG(1) << "Buffer allocate: " << mem.name << ", "
+				    << string_human_readable_number(mem.memory_size()) << " bytes. ("
+				    << string_human_readable_size(mem.memory_size()) << ")";
+		}
+
 		thread_scoped_lock lock(rpc_lock);
 
 		mem.device_pointer = ++mem_counter;
 
 		RPCSend snd(socket, &error_func, "mem_alloc");
-
 		snd.add(mem);
-		snd.add(type);
 		snd.write();
 	}
 
@@ -108,7 +112,7 @@ public:
 
 		snd.add(mem);
 		snd.write();
-		snd.write_buffer((void*)mem.data_pointer, mem.memory_size());
+		snd.write_buffer(mem.host_pointer, mem.memory_size());
 	}
 
 	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
@@ -127,7 +131,7 @@ public:
 		snd.write();
 
 		RPCReceive rcv(socket, &error_func);
-		rcv.read_buffer((void*)mem.data_pointer, data_size);
+		rcv.read_buffer(mem.host_pointer, data_size);
 	}
 
 	void mem_zero(device_memory& mem)
@@ -168,45 +172,6 @@ public:
 		snd.write_buffer(host, size);
 	}
 
-	void tex_alloc(const char *name,
-	               device_memory& mem,
-	               InterpolationType interpolation,
-	               ExtensionType extension)
-	{
-		VLOG(1) << "Texture allocate: " << name << ", "
-		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
-		        << string_human_readable_size(mem.memory_size()) << ")";
-
-		thread_scoped_lock lock(rpc_lock);
-
-		mem.device_pointer = ++mem_counter;
-
-		RPCSend snd(socket, &error_func, "tex_alloc");
-
-		string name_string(name);
-
-		snd.add(name_string);
-		snd.add(mem);
-		snd.add(interpolation);
-		snd.add(extension);
-		snd.write();
-		snd.write_buffer((void*)mem.data_pointer, mem.memory_size());
-	}
-
-	void tex_free(device_memory& mem)
-	{
-		if(mem.device_pointer) {
-			thread_scoped_lock lock(rpc_lock);
-
-			RPCSend snd(socket, &error_func, "tex_free");
-
-			snd.add(mem);
-			snd.write();
-
-			mem.device_pointer = 0;
-		}
-	}
-
 	bool load_kernels(const DeviceRequestedFeatures& requested_features)
 	{
 		if(error_func.have_error())
@@ -315,7 +280,7 @@ public:
 		snd.write();
 	}
 
-	int get_split_task_count(DeviceTask& task)
+	int get_split_task_count(DeviceTask&)
 	{
 		return 1;
 	}
@@ -337,8 +302,12 @@ void device_network_info(vector<DeviceInfo>& devices)
 	info.description = "Network Device";
 	info.id = "NETWORK";
 	info.num = 0;
-	info.advanced_shading = true; /* todo: get this info from device */
-	info.pack_images = false;
+
+	/* todo: get this info from device */
+	info.advanced_shading = true;
+	info.has_volume_decoupled = false;
+	info.bvh_layout_mask = BVH_LAYOUT_BVH2;
+	info.has_osl = false;
 
 	devices.push_back(info);
 }
@@ -460,61 +429,64 @@ protected:
 	void process(RPCReceive& rcv, thread_scoped_lock &lock)
 	{
 		if(rcv.name == "mem_alloc") {
-			MemoryType type;
-			network_device_memory mem;
-			device_ptr client_pointer;
-
-			rcv.read(mem);
-			rcv.read(type);
-
+			string name;
+			network_device_memory mem(device);
+			rcv.read(mem, name);
 			lock.unlock();
 
-			client_pointer = mem.device_pointer;
-
-			/* create a memory buffer for the device buffer */
+			/* Allocate host side data buffer. */
 			size_t data_size = mem.memory_size();
-			DataVector &data_v = data_vector_insert(client_pointer, data_size);
+			device_ptr client_pointer = mem.device_pointer;
 
-			if(data_size)
-				mem.data_pointer = (device_ptr)&(data_v[0]);
-			else
-				mem.data_pointer = 0;
+			DataVector &data_v = data_vector_insert(client_pointer, data_size);
+			mem.host_pointer = (data_size)? (void*)&(data_v[0]): 0;
 
-			/* perform the allocation on the actual device */
-			device->mem_alloc(mem, type);
+			/* Perform the allocation on the actual device. */
+			device->mem_alloc(mem);
 
-			/* store a mapping to/from client_pointer and real device pointer */
+			/* Store a mapping to/from client_pointer and real device pointer. */
 			pointer_mapping_insert(client_pointer, mem.device_pointer);
 		}
 		else if(rcv.name == "mem_copy_to") {
-			network_device_memory mem;
-
-			rcv.read(mem);
+			string name;
+			network_device_memory mem(device);
+			rcv.read(mem, name);
 			lock.unlock();
 
-			device_ptr client_pointer = mem.device_pointer;
-
-			DataVector &data_v = data_vector_find(client_pointer);
-
 			size_t data_size = mem.memory_size();
+			device_ptr client_pointer = mem.device_pointer;
 
-			/* get pointer to memory buffer	for device buffer */
-			mem.data_pointer = (device_ptr)&data_v[0];
+			if(client_pointer) {
+				/* Lookup existing host side data buffer. */
+				DataVector &data_v = data_vector_find(client_pointer);
+				mem.host_pointer = (void*)&data_v[0];
 
-			/* copy data from network into memory buffer */
-			rcv.read_buffer((uint8_t*)mem.data_pointer, data_size);
+				/* Translate the client pointer to a real device pointer. */
+				mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
+			}
+			else {
+				/* Allocate host side data buffer. */
+				DataVector &data_v = data_vector_insert(client_pointer, data_size);
+				mem.host_pointer = (data_size)? (void*)&(data_v[0]): 0;
+			}
 
-			/* translate the client pointer to a real device pointer */
-			mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
+			/* Copy data from network into memory buffer. */
+			rcv.read_buffer((uint8_t*)mem.host_pointer, data_size);
 
-			/* copy the data from the memory buffer to the device buffer */
+			/* Copy the data from the memory buffer to the device buffer. */
 			device->mem_copy_to(mem);
+
+			if(!client_pointer) {
+				/* Store a mapping to/from client_pointer and real device pointer. */
+				pointer_mapping_insert(client_pointer, mem.device_pointer);
+			}
 		}
 		else if(rcv.name == "mem_copy_from") {
-			network_device_memory mem;
+			string name;
+			network_device_memory mem(device);
 			int y, w, h, elem;
 
-			rcv.read(mem);
+			rcv.read(mem, name);
 			rcv.read(y);
 			rcv.read(w);
 			rcv.read(h);
@@ -525,7 +497,7 @@ protected:
 
 			DataVector &data_v = data_vector_find(client_pointer);
 
-			mem.data_pointer = (device_ptr)&(data_v[0]);
+			mem.host_pointer = (void*)&(data_v[0]);
 
 			device->mem_copy_from(mem, y, w, h, elem);
 
@@ -533,32 +505,48 @@ protected:
 
 			RPCSend snd(socket, &error_func, "mem_copy_from");
 			snd.write();
-			snd.write_buffer((uint8_t*)mem.data_pointer, data_size);
+			snd.write_buffer((uint8_t*)mem.host_pointer, data_size);
 			lock.unlock();
 		}
 		else if(rcv.name == "mem_zero") {
-			network_device_memory mem;
-			
-			rcv.read(mem);
+			string name;
+			network_device_memory mem(device);
+			rcv.read(mem, name);
 			lock.unlock();
 
+			size_t data_size = mem.memory_size();
 			device_ptr client_pointer = mem.device_pointer;
-			mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
 
-			DataVector &data_v = data_vector_find(client_pointer);
+			if(client_pointer) {
+				/* Lookup existing host side data buffer. */
+				DataVector &data_v = data_vector_find(client_pointer);
+				mem.host_pointer = (void*)&data_v[0];
 
-			mem.data_pointer = (device_ptr)&(data_v[0]);
+				/* Translate the client pointer to a real device pointer. */
+				mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
+			}
+			else {
+				/* Allocate host side data buffer; stays null for a zero-sized request. */
+				DataVector &data_v = data_vector_insert(client_pointer, data_size);
+				mem.host_pointer = (data_size)? (void*)&(data_v[0]): 0;
+			}
 
+			/* Zero memory. */
 			device->mem_zero(mem);
+
+			if(!client_pointer) {
+				/* Store a mapping to/from client_pointer and real device pointer. */
+				pointer_mapping_insert(client_pointer, mem.device_pointer);
+			}
 		}
 		else if(rcv.name == "mem_free") {
-			network_device_memory mem;
-			device_ptr client_pointer;
+			string name;
+			network_device_memory mem(device);
 
-			rcv.read(mem);
+			rcv.read(mem, name);
 			lock.unlock();
 
-			client_pointer = mem.device_pointer;
+			device_ptr client_pointer = mem.device_pointer;
 
 			mem.device_pointer = device_ptr_from_client_pointer_erase(client_pointer);
 
@@ -577,49 +565,6 @@ protected:
 
 			device->const_copy_to(name_string.c_str(), &host_vector[0], size);
 		}
-		else if(rcv.name == "tex_alloc") {
-			network_device_memory mem;
-			string name;
-			InterpolationType interpolation;
-			ExtensionType extension_type;
-			device_ptr client_pointer;
-
-			rcv.read(name);
-			rcv.read(mem);
-			rcv.read(interpolation);
-			rcv.read(extension_type);
-			lock.unlock();
-
-			client_pointer = mem.device_pointer;
-
-			size_t data_size = mem.memory_size();
-
-			DataVector &data_v = data_vector_insert(client_pointer, data_size);
-
-			if(data_size)
-				mem.data_pointer = (device_ptr)&(data_v[0]);
-			else
-				mem.data_pointer = 0;
-
-			rcv.read_buffer((uint8_t*)mem.data_pointer, data_size);
-
-			device->tex_alloc(name.c_str(), mem, interpolation, extension_type);
-
-			pointer_mapping_insert(client_pointer, mem.device_pointer);
-		}
-		else if(rcv.name == "tex_free") {
-			network_device_memory mem;
-			device_ptr client_pointer;
-
-			rcv.read(mem);
-			lock.unlock();
-
-			client_pointer = mem.device_pointer;
-
-			mem.device_pointer = device_ptr_from_client_pointer_erase(client_pointer);
-
-			device->tex_free(mem);
-		}
 		else if(rcv.name == "load_kernels") {
 			DeviceRequestedFeatures requested_features;
 			rcv.read(requested_features.experimental);
@@ -655,10 +600,6 @@ protected:
 			if(task.shader_output)
 				task.shader_output = device_ptr_from_client_pointer(task.shader_output);
 
-			if(task.shader_output_luma)
-				task.shader_output_luma = device_ptr_from_client_pointer(task.shader_output_luma);
-
-
 			task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2);
 			task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1);
 			task.update_progress_sample = function_bind(&DeviceServer::task_update_progress_sample, this);
@@ -708,7 +649,7 @@ protected:
 		}
 	}
 
-	bool task_acquire_tile(Device *device, RenderTile& tile)
+	bool task_acquire_tile(Device *, RenderTile& tile)
 	{
 		thread_scoped_lock acquire_lock(acquire_mutex);
 
@@ -732,7 +673,6 @@ protected:
 					tile = entry.tile;
 
 					if(tile.buffer) tile.buffer = ptr_map[tile.buffer];
-					if(tile.rng_state) tile.rng_state = ptr_map[tile.rng_state];
 
 					result = true;
 					break;
@@ -764,7 +704,6 @@ protected:
 		thread_scoped_lock acquire_lock(acquire_mutex);
 
 		if(tile.buffer) tile.buffer = ptr_imap[tile.buffer];
-		if(tile.rng_state) tile.rng_state = ptr_imap[tile.rng_state];
 
 		{
 			thread_scoped_lock lock(rpc_lock);
diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h
index d28cfe3121f..b734ba2bda9 100644
--- a/intern/cycles/device/device_network.h
+++ b/intern/cycles/device/device_network.h
@@ -33,12 +33,13 @@
 #include <sstream>
 #include <deque>
 
-#include "buffers.h"
+#include "render/buffers.h"
 
-#include "util_foreach.h"
-#include "util_list.h"
-#include "util_map.h"
-#include "util_string.h"
+#include "util/util_foreach.h"
+#include "util/util_list.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_string.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -68,8 +69,15 @@ typedef boost::archive::binary_iarchive i_archive;
 class network_device_memory : public device_memory
 {
 public:
-	network_device_memory() {}
-	~network_device_memory() { device_pointer = 0; };
+	network_device_memory(Device *device)
+	: device_memory(device, "", MEM_READ_ONLY) /* placeholder name and type; the receive-side read() replaces both */
+	{
+	}
+
+	~network_device_memory()
+	{
+		device_pointer = 0; /* NOTE(review): presumably hides the pointer from ~device_memory() -- confirm */
+	};
 
 	vector<char> local_data;
 };
@@ -119,6 +127,9 @@ public:
 	{
 		archive & mem.data_type & mem.data_elements & mem.data_size;
 		archive & mem.data_width & mem.data_height & mem.data_depth & mem.device_pointer;
+		archive & mem.type & string(mem.name);
+		archive & mem.interpolation & mem.extension;
+		archive & mem.device_pointer;
 	}
 
 	template<typename T> void add(const T& data)
@@ -132,7 +143,7 @@ public:
 		archive & type & task.x & task.y & task.w & task.h;
 		archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
 		archive & task.offset & task.stride;
-		archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type;
+		archive & task.shader_input & task.shader_output & task.shader_eval_type;
 		archive & task.shader_x & task.shader_w;
 		archive & task.need_finish_queue;
 	}
@@ -142,7 +153,7 @@ public:
 		archive & tile.x & tile.y & tile.w & tile.h;
 		archive & tile.start_sample & tile.num_samples & tile.sample;
 		archive & tile.resolution & tile.offset & tile.stride;
-		archive & tile.buffer & tile.rng_state;
+		archive & tile.buffer;
 	}
 
 	void write()
@@ -258,12 +269,21 @@ public:
 		delete archive_stream;
 	}
 
-	void read(network_device_memory& mem)
+	void read(network_device_memory& mem, string& name)
 	{
 		*archive & mem.data_type & mem.data_elements & mem.data_size;
 		*archive & mem.data_width & mem.data_height & mem.data_depth & mem.device_pointer;
+		*archive & mem.type & name;
+		*archive & mem.interpolation & mem.extension;
+		*archive & mem.device_pointer; /* add() writes device_pointer a second time, so it is read again to stay in sync */
+
+		mem.name = name.c_str(); /* borrows the caller's buffer: name must outlive mem */
+		mem.host_pointer = 0; /* no host buffer yet; callers assign one when they need it */
 
-		mem.data_pointer = 0;
+		/* Can't transfer OpenGL texture over network. */
+		if(mem.type == MEM_PIXELS) {
+			mem.type = MEM_READ_WRITE;
+		}
 	}
 
 	template<typename T> void read(T& data)
@@ -291,7 +311,7 @@ public:
 		*archive & type & task.x & task.y & task.w & task.h;
 		*archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
 		*archive & task.offset & task.stride;
-		*archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type;
+		*archive & task.shader_input & task.shader_output & task.shader_eval_type;
 		*archive & task.shader_x & task.shader_w;
 		*archive & task.need_finish_queue;
 
@@ -303,7 +323,7 @@ public:
 		*archive & tile.x & tile.y & tile.w & tile.h;
 		*archive & tile.start_sample & tile.num_samples & tile.sample;
 		*archive & tile.resolution & tile.offset & tile.stride;
-		*archive & tile.buffer & tile.rng_state;
+		*archive & tile.buffer;
 
 		tile.buffers = NULL;
 	}
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index ba94c592a5f..9d61bbdae5d 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -16,12 +16,13 @@
 
 #ifdef WITH_OPENCL
 
-#include "opencl/opencl.h"
+#include "device/opencl/opencl.h"
 
-#include "device_intern.h"
+#include "device/device_intern.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_set.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -73,13 +74,41 @@ bool device_opencl_init(void)
 	return result;
 }
 
+/* Query the number of OpenCL platforms; on Windows a crash inside the driver is caught and reported as "no platforms". */
+static cl_int device_opencl_get_num_platforms_safe(cl_uint *num_platforms)
+{
+#ifdef _WIN32
+	__try {
+		return clGetPlatformIDs(0, NULL, num_platforms);
+	}
+	__except(EXCEPTION_EXECUTE_HANDLER) {
+		/* Ignore crashes inside the OpenCL driver and hope we can
+		 * survive even with corrupted OpenCL installs. */
+		fprintf(stderr, "Cycles OpenCL: driver crashed, continuing without OpenCL.\n");
+	}
+
+	*num_platforms = 0; /* only reached when the driver call raised an exception */
+	return CL_DEVICE_NOT_FOUND;
+#else
+	return clGetPlatformIDs(0, NULL, num_platforms);
+#endif
+}
+
 void device_opencl_info(vector<DeviceInfo>& devices)
 {
+	cl_uint num_platforms = 0;
+	device_opencl_get_num_platforms_safe(&num_platforms);
+	if(num_platforms == 0) {
+		return;
+	}
+
 	vector<OpenCLPlatformDevice> usable_devices;
 	OpenCLInfo::get_usable_devices(&usable_devices);
 	/* Devices are numbered consecutively across platforms. */
 	int num_devices = 0;
+	set<string> unique_ids;
 	foreach(OpenCLPlatformDevice& platform_device, usable_devices) {
+		/* Compute unique ID for persistent user preferences. */
 		const string& platform_name = platform_device.platform_name;
 		const cl_device_type device_type = platform_device.device_type;
 		const string& device_name = platform_device.device_name;
@@ -87,7 +116,15 @@ void device_opencl_info(vector<DeviceInfo>& devices)
 		if(hardware_id == "") {
 			hardware_id = string_printf("ID_%d", num_devices);
 		}
+		string id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id;
+
+		/* Hardware ID might not be unique, add device number in that case. */
+		if(unique_ids.find(id) != unique_ids.end()) {
+			id += string_printf("_ID_%d", num_devices);
+		}
+		unique_ids.insert(id);
 
+		/* Create DeviceInfo. */
 		DeviceInfo info;
 		info.type = DEVICE_OPENCL;
 		info.description = string_remove_trademark(string(device_name));
@@ -95,10 +132,11 @@ void device_opencl_info(vector<DeviceInfo>& devices)
 		/* We don't know if it's used for display, but assume it is. */
 		info.display_device = true;
 		info.advanced_shading = OpenCLInfo::kernel_use_advanced_shading(platform_name);
-		info.pack_images = true;
 		info.use_split_kernel = OpenCLInfo::kernel_use_split(platform_name,
 		                                                     device_type);
-		info.id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id;
+		info.has_volume_decoupled = false;
+		info.bvh_layout_mask = BVH_LAYOUT_BVH2;
+		info.id = id;
 		devices.push_back(info);
 		num_devices++;
 	}
@@ -114,7 +152,7 @@ string device_opencl_capabilities(void)
 	                         * it could also be nicely reported to the console.
 	                         */
 	cl_uint num_platforms = 0;
-	opencl_assert(clGetPlatformIDs(0, NULL, &num_platforms));
+	opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms));
 	if(num_platforms == 0) {
 		return "No OpenCL platforms found\n";
 	}
@@ -130,10 +168,22 @@ string device_opencl_capabilities(void)
 		opencl_assert(func(id, what, sizeof(data), &data, NULL)); \
 		result += string_printf("%s: %s\n", name, data); \
 	} while(false)
+#define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \
+	do { \
+		char data[1024] = "\0"; \
+		size_t length = 0; \
+		if(func(id, what, sizeof(data), &data, &length) == CL_SUCCESS) { \
+			if(length != 0 && data[0] != '\0') { \
+				result += string_printf("%s: %s\n", name, data); \
+			} \
+		} \
+	} while(false)
 #define APPEND_PLATFORM_STRING_INFO(id, name, what) \
 	APPEND_STRING_INFO(clGetPlatformInfo, id, "\tPlatform " name, what)
 #define APPEND_DEVICE_STRING_INFO(id, name, what) \
 	APPEND_STRING_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what)
+#define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \
+	APPEND_STRING_EXTENSION_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what)
 
 	vector<cl_device_id> device_ids;
 	for(cl_uint platform = 0; platform < num_platforms; ++platform) {
@@ -167,6 +217,7 @@ string device_opencl_capabilities(void)
 			result += string_printf("\t\tDevice: #%u\n", device);
 
 			APPEND_DEVICE_STRING_INFO(device_id, "Name", CL_DEVICE_NAME);
+			APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD);
 			APPEND_DEVICE_STRING_INFO(device_id, "Vendor", CL_DEVICE_VENDOR);
 			APPEND_DEVICE_STRING_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION);
 			APPEND_DEVICE_STRING_INFO(device_id, "Profile", CL_DEVICE_PROFILE);
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
new file mode 100644
index 00000000000..74135a1f3c8
--- /dev/null
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -0,0 +1,326 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_split_kernel.h"
+
+#include "kernel/kernel_types.h"
+#include "kernel/split/kernel_split_data_types.h"
+
+#include "util/util_logging.h"
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+static const double alpha = 0.1; /* alpha for rolling average */
+
+DeviceSplitKernel::DeviceSplitKernel(Device *device)
+: device(device),
+  split_data(device, "split_data"),
+  ray_state(device, "ray_state", MEM_READ_WRITE),
+  queue_index(device, "queue_index"),
+  use_queues_flag(device, "use_queues_flag"),
+  work_pool_wgs(device, "work_pool_wgs")
+{
+	first_tile = true;
+	global_size[0] = global_size[1] = 0; /* path_trace() reads this before its first-tile setup assigns it */
+	avg_time_per_sample = 0.0;
+
+	kernel_path_init = NULL; /* kernels stay NULL until load_kernels() creates them */
+	kernel_scene_intersect = NULL;
+	kernel_lamp_emission = NULL;
+	kernel_do_volume = NULL;
+	kernel_queue_enqueue = NULL;
+	kernel_indirect_background = NULL;
+	kernel_shader_setup = NULL;
+	kernel_shader_sort = NULL;
+	kernel_shader_eval = NULL;
+	kernel_holdout_emission_blurring_pathtermination_ao = NULL;
+	kernel_subsurface_scatter = NULL;
+	kernel_direct_lighting = NULL;
+	kernel_shadow_blocked_ao = NULL;
+	kernel_shadow_blocked_dl = NULL;
+	kernel_enqueue_inactive = NULL;
+	kernel_next_iteration_setup = NULL;
+	kernel_indirect_subsurface = NULL;
+	kernel_buffer_update = NULL;
+}
+
+DeviceSplitKernel::~DeviceSplitKernel()
+{
+	split_data.free(); /* release the state buffers that path_trace() allocates on its first call */
+	ray_state.free();
+	use_queues_flag.free();
+	queue_index.free();
+	work_pool_wgs.free();
+
+	delete kernel_path_init; /* deleting NULL is a no-op, so kernels that were never loaded are fine */
+	delete kernel_scene_intersect;
+	delete kernel_lamp_emission;
+	delete kernel_do_volume;
+	delete kernel_queue_enqueue;
+	delete kernel_indirect_background;
+	delete kernel_shader_setup;
+	delete kernel_shader_sort;
+	delete kernel_shader_eval;
+	delete kernel_holdout_emission_blurring_pathtermination_ao;
+	delete kernel_subsurface_scatter;
+	delete kernel_direct_lighting;
+	delete kernel_shadow_blocked_ao;
+	delete kernel_shadow_blocked_dl;
+	delete kernel_enqueue_inactive;
+	delete kernel_next_iteration_setup;
+	delete kernel_indirect_subsurface;
+	delete kernel_buffer_update;
+}
+
+bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features)
+{
+#define LOAD_KERNEL(name) \
+		kernel_##name = get_split_kernel_function(#name, requested_features); \
+		if(!kernel_##name) { \
+			device->set_error(string("Split kernel error: failed to load kernel_") + #name); \
+			return false; \
+		}
+	/* Every kernel is mandatory: the first one that fails to load sets the device error and aborts. */
+	LOAD_KERNEL(path_init);
+	LOAD_KERNEL(scene_intersect);
+	LOAD_KERNEL(lamp_emission);
+	LOAD_KERNEL(do_volume);
+	LOAD_KERNEL(queue_enqueue);
+	LOAD_KERNEL(indirect_background);
+	LOAD_KERNEL(shader_setup);
+	LOAD_KERNEL(shader_sort);
+	LOAD_KERNEL(shader_eval);
+	LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
+	LOAD_KERNEL(subsurface_scatter);
+	LOAD_KERNEL(direct_lighting);
+	LOAD_KERNEL(shadow_blocked_ao);
+	LOAD_KERNEL(shadow_blocked_dl);
+	LOAD_KERNEL(enqueue_inactive);
+	LOAD_KERNEL(next_iteration_setup);
+	LOAD_KERNEL(indirect_subsurface);
+	LOAD_KERNEL(buffer_update);
+	/* NOTE(review): a second call overwrites the pointers without deleting the old kernels -- confirm it runs once. */
+#undef LOAD_KERNEL
+
+	return true;
+}
+
+size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size)
+{
+	/* Per-element state size, averaged over a batch of 1024 elements. */
+	const uint64_t element_bytes = state_buffer_size(kg, data, 1024) / 1024;
+	VLOG(1) << "Split state element size: " << string_human_readable_number(element_bytes)
+	        << " bytes. (" << string_human_readable_size(element_bytes) << ").";
+	return max_buffer_size / element_bytes;
+}
+
+/* Render all samples of a tile with the split kernel, in batches whose size
+ * adapts to the measured time per sample. State buffers are allocated on the
+ * first call. Returns false on a device or kernel error; returns true on
+ * success and also when the task is cancelled. */
+bool DeviceSplitKernel::path_trace(DeviceTask *task,
+                                   RenderTile& tile,
+                                   device_memory& kgbuffer,
+                                   device_memory& kernel_data)
+{
+	if(device->have_error()) {
+		return false;
+	}
+
+	/* Get local size */
+	size_t local_size[2];
+	{
+		int2 lsize = split_kernel_local_size();
+		local_size[0] = lsize[0];
+		local_size[1] = lsize[1];
+	}
+
+	/* Number of elements in the global state buffer (global_size is not set up before the first tile). */
+	int num_global_elements = first_tile ? 0 : global_size[0] * global_size[1];
+
+	/* Allocate all required global memory once. */
+	if(first_tile) {
+		first_tile = false;
+
+		/* Set global size */
+		{
+			int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
+
+			/* Round the work size up to a multiple of the local work size. */
+			global_size[0] = round_up(gsize[0], local_size[0]);
+			global_size[1] = round_up(gsize[1], local_size[1]);
+		}
+
+		num_global_elements = global_size[0] * global_size[1];
+		assert(num_global_elements % WORK_POOL_SIZE == 0);
+
+		/* Calculate max groups */
+
+		/* Denotes the maximum work groups possible w.r.t. current requested tile size. */
+		unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU : WORK_POOL_SIZE_GPU;
+		unsigned int max_work_groups = num_global_elements / work_pool_size + 1;
+
+		/* Allocate work_pool_wgs memory. */
+		work_pool_wgs.alloc_to_device(max_work_groups);
+		queue_index.alloc_to_device(NUM_QUEUES);
+		use_queues_flag.alloc_to_device(1);
+		split_data.alloc_to_device(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
+		ray_state.alloc(num_global_elements);
+	}
+
+#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \
+		if(device->have_error()) { \
+			return false; \
+		} \
+		if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \
+			return false; \
+		}
+
+	tile.sample = tile.start_sample;
+
+	/* for exponential increase between tile updates */
+	int time_multiplier = 1;
+
+	while(tile.sample < tile.start_sample + tile.num_samples) {
+		/* to keep track of how long it takes to run a number of samples */
+		double start_time = time_dt();
+
+		/* initial guess to start rolling average */
+		const int initial_num_samples = 1;
+		/* approx number of samples per second */
+		int samples_per_second = (avg_time_per_sample > 0.0) ?
+		                         int(double(time_multiplier) / avg_time_per_sample) + 1 : initial_num_samples;
+
+		RenderTile subtile = tile;
+		subtile.start_sample = tile.sample;
+		subtile.num_samples = min(samples_per_second, tile.start_sample + tile.num_samples - tile.sample);
+
+		if(device->have_error()) {
+			return false;
+		}
+
+		/* Reset state memory here, as the global size for the data_init
+		 * kernel might not be large enough to do it in the kernel. */
+		work_pool_wgs.zero_to_device();
+		split_data.zero_to_device();
+		ray_state.zero_to_device();
+
+		if(!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
+		                                   subtile,
+		                                   num_global_elements,
+		                                   kgbuffer,
+		                                   kernel_data,
+		                                   split_data,
+		                                   ray_state,
+		                                   queue_index,
+		                                   use_queues_flag,
+		                                   work_pool_wgs))
+		{
+			return false;
+		}
+
+		ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size);
+
+		bool activeRaysAvailable = true;
+		double cancel_time = DBL_MAX;
+
+		while(activeRaysAvailable) {
+			/* Do path-iteration in host [Enqueue Path-iteration kernels]. */
+			for(int PathIter = 0; PathIter < 16; PathIter++) { /* 16 iterations between host-side ray state checks */
+				ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);
+
+				if(task->get_cancel() && cancel_time == DBL_MAX) {
+					/* Wait up to twice as many seconds for current samples to finish,
+					 * to avoid artifacts in render result from ending too soon. */
+					cancel_time = time_dt() + 2.0 * time_multiplier;
+				}
+
+				if(time_dt() > cancel_time) {
+					return true;
+				}
+			}
+
+			/* Decide if we should exit path-iteration in host. */
+			ray_state.copy_from_device(0, num_global_elements, 1);
+
+			activeRaysAvailable = false;
+
+			for(int rayStateIter = 0; rayStateIter < num_global_elements; ++rayStateIter) {
+				if(!IS_STATE(ray_state.data(), rayStateIter, RAY_INACTIVE)) {
+					if(IS_STATE(ray_state.data(), rayStateIter, RAY_INVALID)) {
+						/* Something went wrong, abort to avoid looping endlessly. */
+						device->set_error("Split kernel error: invalid ray state");
+						return false;
+					}
+
+					/* Not all rays are RAY_INACTIVE. */
+					activeRaysAvailable = true;
+					break;
+				}
+			}
+
+			if(time_dt() > cancel_time) {
+				return true;
+			}
+		}
+
+		double time_per_sample = ((time_dt()-start_time) / subtile.num_samples);
+
+		if(avg_time_per_sample == 0.0) {
+			/* start rolling average */
+			avg_time_per_sample = time_per_sample;
+		}
+		else {
+			avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample;
+		}
+
+#undef ENQUEUE_SPLIT_KERNEL
+
+		tile.sample += subtile.num_samples;
+		task->update_progress(&tile, tile.w*tile.h*subtile.num_samples);
+
+		time_multiplier = min(time_multiplier << 1, 10); /* double the time budget per batch, capped at 10 */
+
+		if(task->get_cancel()) {
+			return true;
+		}
+	}
+
+	return true;
+}
+
+CCL_NAMESPACE_END
+
+
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
new file mode 100644
index 00000000000..2ec0261e847
--- /dev/null
+++ b/intern/cycles/device/device_split_kernel.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __DEVICE_SPLIT_KERNEL_H__
+#define __DEVICE_SPLIT_KERNEL_H__
+
+#include "device/device.h"
+#include "render/buffers.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* When allocating global memory in chunks, we may not be able to
+ * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes, since
+ * some bytes may be needed for aligning chunks of memory.
+ * This is the amount of memory that we dedicate for that purpose.
+ */
+#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
+
+/* Types used for split kernel */
+
+class KernelDimensions {
+public:
+	size_t global_size[2]; /* total work size in each of the two dimensions */
+	size_t local_size[2]; /* local work size in each of the two dimensions */
+
+	KernelDimensions(size_t global_size_[2], size_t local_size_[2])
+	{
+		memcpy(global_size, global_size_, sizeof(global_size)); /* sizeof() of the member array, not of the pointer argument */
+		memcpy(local_size, local_size_, sizeof(local_size));
+	}
+};
+
+class SplitKernelFunction {
+public:
+	virtual ~SplitKernelFunction() {} /* DeviceSplitKernel deletes its kernels through this base pointer */
+
+	/* Enqueue the kernel over the given dimensions; returns false if there is an error. */
+	virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) = 0;
+};
+
+class DeviceSplitKernel {
+private:
+	Device *device;
+	/* One function object per split kernel stage; created by load_kernels() and deleted in the destructor. */
+	SplitKernelFunction *kernel_path_init;
+	SplitKernelFunction *kernel_scene_intersect;
+	SplitKernelFunction *kernel_lamp_emission;
+	SplitKernelFunction *kernel_do_volume;
+	SplitKernelFunction *kernel_queue_enqueue;
+	SplitKernelFunction *kernel_indirect_background;
+	SplitKernelFunction *kernel_shader_setup;
+	SplitKernelFunction *kernel_shader_sort;
+	SplitKernelFunction *kernel_shader_eval;
+	SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao;
+	SplitKernelFunction *kernel_subsurface_scatter;
+	SplitKernelFunction *kernel_direct_lighting;
+	SplitKernelFunction *kernel_shadow_blocked_ao;
+	SplitKernelFunction *kernel_shadow_blocked_dl;
+	SplitKernelFunction *kernel_enqueue_inactive;
+	SplitKernelFunction *kernel_next_iteration_setup;
+	SplitKernelFunction *kernel_indirect_subsurface;
+	SplitKernelFunction *kernel_buffer_update;
+
+	/* Global memory variables [porting]. This memory is used for
+	 * co-operation between different kernels: data written by one
+	 * kernel will be available to another kernel via this global
+	 * memory.
+	 */
+	device_only_memory<uchar> split_data;
+	device_vector<uchar> ray_state;
+	device_only_memory<int> queue_index; /* Array of size num_queues that tracks the size of each queue. */
+
+	/* Flag to make sceneintersect and lampemission kernel use queues. */
+	device_only_memory<char> use_queues_flag;
+
+	/* Rolling average of the time one sample takes; 0.0 until the first measurement. */
+	double avg_time_per_sample;
+
+	/* Work pool with respect to each work group. */
+	device_only_memory<unsigned int> work_pool_wgs;
+
+	/* Set to true in the constructor and cleared by the first path_trace() call, which does the one-time allocations. */
+	bool first_tile;
+
+	/* Global work size, computed on the first path_trace() call and reused for later tiles. */
+	size_t global_size[2];
+
+public:
+	explicit DeviceSplitKernel(Device* device);
+	virtual ~DeviceSplitKernel();
+
+	bool load_kernels(const DeviceRequestedFeatures& requested_features);
+	bool path_trace(DeviceTask *task,
+	                RenderTile& rtile,
+	                device_memory& kgbuffer,
+	                device_memory& kernel_data);
+	/* The pure virtual hooks below are provided by the device-specific subclass. */
+	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0;
+	size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size);
+
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& kernel_data_,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flag,
+	                                            device_memory& work_pool_wgs) = 0;
+
+	virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
+	                                                       const DeviceRequestedFeatures&) = 0;
+	virtual int2 split_kernel_local_size() = 0;
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __DEVICE_SPLIT_KERNEL_H__ */
+
+
+
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index 48d18035c13..3c7d24fb5b7 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -17,12 +17,12 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "device_task.h"
+#include "device/device_task.h"
 
-#include "buffers.h"
+#include "render/buffers.h"
 
-#include "util_algorithm.h"
-#include "util_time.h"
+#include "util/util_algorithm.h"
+#include "util/util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -31,7 +31,7 @@ CCL_NAMESPACE_BEGIN
 DeviceTask::DeviceTask(Type type_)
 : type(type_), x(0), y(0), w(0), h(0), rgba_byte(0), rgba_half(0), buffer(0),
   sample(0), num_samples(1),
-  shader_input(0), shader_output(0), shader_output_luma(0),
+  shader_input(0), shader_output(0),
   shader_eval_type(0), shader_filter(0), shader_x(0), shader_w(0)
 {
 	last_update_time = time_dt();
@@ -56,7 +56,7 @@ int DeviceTask::get_subtask_count(int num, int max_size)
 	if(type == SHADER) {
 		num = min(shader_w, num);
 	}
-	else if(type == PATH_TRACE) {
+	else if(type == RENDER) {
 	}
 	else {
 		num = min(h, num);
@@ -82,7 +82,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
 			tasks.push_back(task);
 		}
 	}
-	else if(type == PATH_TRACE) {
+	else if(type == RENDER) {
 		for(int i = 0; i < num; i++)
 			tasks.push_back(*this);
 	}
@@ -103,7 +103,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
 
 void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
 {
-	if((type != PATH_TRACE) &&
+	if((type != RENDER) &&
 	   (type != SHADER))
 		return;
 
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 8bd54c3d2b0..b9658eb978f 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -17,11 +17,11 @@
 #ifndef __DEVICE_TASK_H__
 #define __DEVICE_TASK_H__
 
-#include "device_memory.h"
+#include "device/device_memory.h"
 
-#include "util_function.h"
-#include "util_list.h"
-#include "util_task.h"
+#include "util/util_function.h"
+#include "util/util_list.h"
+#include "util/util_task.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -34,7 +34,7 @@ class Tile;
 
 class DeviceTask : public Task {
 public:
-	typedef enum { PATH_TRACE, FILM_CONVERT, SHADER } Type;
+	typedef enum { RENDER, FILM_CONVERT, SHADER } Type;
 	Type type;
 
 	int x, y, w, h;
@@ -46,12 +46,14 @@ public:
 	int offset, stride;
 
 	device_ptr shader_input;
-	device_ptr shader_output, shader_output_luma;
+	device_ptr shader_output;
 	int shader_eval_type;
 	int shader_filter;
 	int shader_x, shader_w;
 
-	explicit DeviceTask(Type type = PATH_TRACE);
+	int passes_size;
+
+	explicit DeviceTask(Type type = RENDER);
 
 	int get_subtask_count(int num, int max_size = 0);
 	void split(list<DeviceTask>& tasks, int num, int max_size = 0);
@@ -63,6 +65,16 @@ public:
 	function<void(RenderTile&)> update_tile_sample;
 	function<void(RenderTile&)> release_tile;
 	function<bool(void)> get_cancel;
+	function<void(RenderTile*, Device*)> map_neighbor_tiles;
+	function<void(RenderTile*, Device*)> unmap_neighbor_tiles;
+
+	int denoising_radius;
+	float denoising_strength;
+	float denoising_feature_strength;
+	bool denoising_relative_pca;
+	int pass_stride;
+	int pass_denoising_data;
+	int pass_denoising_clean;
 
 	bool need_finish_queue;
 	bool integrator_branched;
diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp
new file mode 100644
index 00000000000..75c9de65035
--- /dev/null
+++ b/intern/cycles/device/opencl/memory_manager.cpp
@@ -0,0 +1,257 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPENCL
+
+#include "util/util_foreach.h"
+
+#include "device/opencl/opencl.h"
+#include "device/opencl/memory_manager.h"
+
+CCL_NAMESPACE_BEGIN
+
+void MemoryManager::DeviceBuffer::add_allocation(Allocation& allocation)
+{
+	allocations.push_back(&allocation);  /* Stores the address only; the Allocation object is not copied. */
+}
+
+void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDeviceBase *device)
+{
+	/* Bring this buffer's device memory in line with its current list of allocations. */
+	bool need_realloc = false;
+	/* Calculate total size and remove any freed. */
+	size_t total_size = 0;
+
+	for(int i = allocations.size()-1; i >= 0; i--) {
+		Allocation* allocation = allocations[i];
+
+		/* Remove allocations that have been freed. */
+		if(!allocation->mem || allocation->mem->memory_size() == 0) {
+			allocation->device_buffer = NULL;
+			allocation->size = 0;
+
+			allocations.erase(allocations.begin()+i);
+
+			need_realloc = true;
+
+			continue;
+		}
+
+		/* Get actual size for allocation. */
+		size_t alloc_size = align_up(allocation->mem->memory_size(), 16);
+
+		if(allocation->size != alloc_size) {
+			/* Allocation is either new or resized. */
+			allocation->size = alloc_size;
+			allocation->needs_copy_to_device = true;
+
+			need_realloc = true;
+		}
+
+		total_size += alloc_size;
+	}
+	/* A changed layout needs a fresh buffer: new data is uploaded, the rest is copied over. */
+	if(need_realloc) {
+		cl_ulong max_buffer_size = 0;  /* Defined value even if the query below fails. */
+		opencl_device_assert(device, clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL));
+
+		if(total_size > max_buffer_size) {
+			device->set_error("Scene too complex to fit in available memory.");
+			return;
+		}
+
+		device_only_memory<uchar> *new_buffer =
+			new device_only_memory<uchar>(device, "memory manager buffer");
+
+		new_buffer->alloc_to_device(total_size);
+
+		size_t offset = 0;
+
+		foreach(Allocation* allocation, allocations) {
+			if(allocation->needs_copy_to_device) {
+				/* Copy from host to device. */
+				opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue,
+					CL_MEM_PTR(new_buffer->device_pointer),
+					CL_FALSE,
+					offset,
+					allocation->mem->memory_size(),
+					allocation->mem->host_pointer,
+					0, NULL, NULL
+				));
+
+				allocation->needs_copy_to_device = false;
+			}
+			else {
+				/* Fast copy from memory already on device. */
+				opencl_device_assert(device, clEnqueueCopyBuffer(device->cqCommandQueue,
+					CL_MEM_PTR(buffer->device_pointer),
+					CL_MEM_PTR(new_buffer->device_pointer),
+					allocation->desc.offset,
+					offset,
+					allocation->mem->memory_size(),
+					0, NULL, NULL
+				));
+			}
+
+			allocation->desc.offset = offset;
+			offset += allocation->size;
+		}
+		/* Every remaining allocation now carries an offset into new_buffer; replace the old one. */
+		delete buffer;
+
+		buffer = new_buffer;
+	}
+	else {
+		assert(total_size == buffer->data_size);
+		/* Layout is unchanged: offsets are accumulated the same way as when the buffer was built. */
+		size_t offset = 0;
+
+		foreach(Allocation* allocation, allocations) {
+			if(allocation->needs_copy_to_device) {
+				/* Copy from host to device. */
+				opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue,
+					CL_MEM_PTR(buffer->device_pointer),
+					CL_FALSE,
+					offset,
+					allocation->mem->memory_size(),
+					allocation->mem->host_pointer,
+					0, NULL, NULL
+				));
+
+				allocation->needs_copy_to_device = false;
+			}
+
+			offset += allocation->size;
+		}
+	}
+
+	/* Not really necessary, but seems to improve responsiveness for some reason. */
+	clFinish(device->cqCommandQueue);
+}
+
+void MemoryManager::DeviceBuffer::free(OpenCLDeviceBase *)
+{
+	buffer->free();  /* Delegates to the device_only_memory object; the device argument is unused. */
+}
+
+MemoryManager::DeviceBuffer* MemoryManager::smallest_device_buffer()
+{
+	/* Linear scan by `size`; the strict '<' keeps the lowest-indexed slot on ties. */
+	DeviceBuffer* best = &device_buffers[0];
+	for(int i = 1; i < NUM_DEVICE_BUFFERS; i++) {
+		DeviceBuffer* candidate = &device_buffers[i];
+		if(candidate->size < best->size) {
+			best = candidate;
+		}
+	}
+	return best;
+}
+
+MemoryManager::MemoryManager(OpenCLDeviceBase *device)
+: device(device), need_update(false)
+{
+	/* Create one device_only_memory object per buffer slot. */
+	for(int i = 0; i < NUM_DEVICE_BUFFERS; i++) {
+		device_buffers[i].buffer = new device_only_memory<uchar>(device, "memory manager buffer");
+	}
+}
+
+void MemoryManager::free()
+{
+	for(int i = 0; i < NUM_DEVICE_BUFFERS; i++) {
+		device_buffers[i].free(device);  /* Every slot is freed, whether or not it holds allocations. */
+	}
+}
+
+void MemoryManager::alloc(const char *name, device_memory& mem)
+{
+	Allocation& allocation = allocations[name];
+	/* Register (or re-register) `mem` under `name`. free() subtracts the size but keeps
+	 * device_buffer until the next update, so free() + alloc() must add the size back. */
+	const bool count_size = (allocation.mem == NULL || allocation.device_buffer == NULL);
+	allocation.mem = &mem;
+	allocation.needs_copy_to_device = true;
+
+	if(!allocation.device_buffer) {
+		DeviceBuffer* device_buffer = smallest_device_buffer();
+		allocation.device_buffer = device_buffer;
+		allocation.desc.device_buffer = device_buffer - device_buffers;
+		device_buffer->add_allocation(allocation);
+	}
+	if(count_size) {
+		allocation.device_buffer->size += mem.memory_size();
+	}
+	need_update = true;
+}
+
+bool MemoryManager::free(device_memory& mem)
+{
+	/* Entries are matched by the address of their backing memory, not by name. */
+	for(AllocationsMap::iterator it = allocations.begin(); it != allocations.end(); ++it) {
+		Allocation& entry = it->second;
+		if(entry.mem != &mem) {
+			continue;
+		}
+		/* Only mark the entry as released here; DeviceBuffer::update_device_memory()
+		 * removes entries with a NULL mem on the next update. */
+		entry.device_buffer->size -= mem.memory_size();
+		entry.mem = NULL;
+		entry.needs_copy_to_device = false;
+		need_update = true;
+		return true;
+	}
+	return false;
+}
+
+MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name)
+{
+	/* Offsets are assigned while updating, so flush pending changes before reading. */
+	update_device_memory();
+	/* NOTE: operator[] inserts a default Allocation when `name` is unknown. */
+	return allocations[name].desc;
+}
+
+void MemoryManager::update_device_memory()
+{
+	/* need_update is raised by alloc() and free(); without it there is nothing to do. */
+	if(need_update) {
+		need_update = false;
+
+		/* Let every buffer slot process its own list of allocations. */
+		for(int i = 0; i < NUM_DEVICE_BUFFERS; i++) {
+			device_buffers[i].update_device_memory(device);
+		}
+	}
+}
+
+void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
+{
+	update_device_memory();
+	/* One kernel argument per buffer slot; slots without a device pointer get null_mem. */
+	for(int i = 0; i < NUM_DEVICE_BUFFERS; i++) {
+		if(!device_buffers[i].buffer->device_pointer) {
+			device->kernel_set_args(kernel, (*narg)++, device->null_mem);
+		}
+		else {
+			device->kernel_set_args(kernel, (*narg)++, *device_buffers[i].buffer);
+		}
+	}
+}
+
+CCL_NAMESPACE_END
+
+#endif  /* WITH_OPENCL */
+
diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h
new file mode 100644
index 00000000000..b3d861275f0
--- /dev/null
+++ b/intern/cycles/device/opencl/memory_manager.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device.h"
+
+#include "util/util_map.h"
+#include "util/util_vector.h"
+#include "util/util_string.h"
+
+#include "clew.h"
+
+CCL_NAMESPACE_BEGIN
+
+class OpenCLDeviceBase;
+
+class MemoryManager {  /* Packs named device_memory allocations into a fixed set of device buffers. */
+public:
+	static const int NUM_DEVICE_BUFFERS = 8;  /* Length of the device_buffers array. */
+	/* Where an allocation lives: index into device_buffers plus an offset inside that buffer. */
+	struct BufferDescriptor {
+		uint device_buffer;
+		cl_ulong offset;
+	};
+
+private:
+	struct DeviceBuffer;
+	/* Bookkeeping for one named allocation. */
+	struct Allocation {
+		device_memory *mem;  /* Backing memory; NULL once released through free(). */
+
+		DeviceBuffer *device_buffer;  /* Slot holding this allocation; NULL until one is assigned. */
+		size_t size; /* Size of actual allocation, may be larger than requested. */
+
+		BufferDescriptor desc;
+
+		bool needs_copy_to_device;
+		/* desc() zero-initializes the descriptor so a default-constructed entry never exposes garbage. */
+		Allocation() : mem(NULL), device_buffer(NULL), size(0), desc(), needs_copy_to_device(false)
+		{
+		}
+	};
+	/* One slot: a device-side buffer plus the allocations placed in it. */
+	struct DeviceBuffer {
+		device_only_memory<uchar> *buffer;  /* Owned; deleted in the destructor. */
+		vector<Allocation*> allocations;  /* Non-owning; alloc() passes entries of MemoryManager::allocations. */
+		size_t size; /* Size of all allocations. */
+
+		DeviceBuffer()
+		: buffer(NULL), size(0)
+		{
+		}
+		/* NOTE(review): owns `buffer` but is still copyable; a copy would delete it twice. */
+		~DeviceBuffer()
+		{
+			delete buffer;
+			buffer = NULL;
+		}
+
+		void add_allocation(Allocation& allocation);
+
+		void update_device_memory(OpenCLDeviceBase *device);
+
+		void free(OpenCLDeviceBase *device);
+	};
+
+	OpenCLDeviceBase *device;
+
+	DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS];
+	/* Allocations keyed by name; DeviceBuffer::allocations stores pointers to these values. */
+	typedef unordered_map<string, Allocation> AllocationsMap;
+	AllocationsMap allocations;
+
+	bool need_update;  /* Set by alloc()/free(), cleared by update_device_memory(). */
+
+	DeviceBuffer* smallest_device_buffer();
+
+public:
+	MemoryManager(OpenCLDeviceBase *device);
+
+	void free(); /* Free all memory. */
+
+	void alloc(const char *name, device_memory& mem);
+	bool free(device_memory& mem);  /* Returns false when `mem` is not a known allocation. */
+
+	BufferDescriptor get_descriptor(string name);
+
+	void update_device_memory();
+	void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
+};
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index 4023ba89a10..85ef14ee29a 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -16,39 +16,42 @@
 
 #ifdef WITH_OPENCL
 
-#include "device.h"
+#include "device/device.h"
+#include "device/device_denoising.h"
 
-#include "util_map.h"
-#include "util_param.h"
-#include "util_string.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_string.h"
 
 #include "clew.h"
 
-CCL_NAMESPACE_BEGIN
+#include "device/opencl/memory_manager.h"
 
-#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
+CCL_NAMESPACE_BEGIN
 
-/* Macro declarations used with split kernel */
+/* Disable workarounds, seems to be working fine on latest drivers. */
+#define CYCLES_DISABLE_DRIVER_WORKAROUNDS
 
-/* Macro to enable/disable work-stealing */
-#define __WORK_STEALING__
+/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workarounds for testing */
+#ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS
+/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */
+#  undef clEnqueueNDRangeKernel
+#  define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
+	CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \
+	clFinish(a);
 
-#define SPLIT_KERNEL_LOCAL_SIZE_X 64
-#define SPLIT_KERNEL_LOCAL_SIZE_Y 1
+#  undef clEnqueueWriteBuffer
+#  define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
+	CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \
+	clFinish(a);
 
-/* This value may be tuned according to the scene we are rendering.
- *
- * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected
- * ray-bounces will improve performance.
- */
-#define PATH_ITER_INC_FACTOR 8
+#  undef clEnqueueReadBuffer
+#  define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
+	CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \
+	clFinish(a);
+#endif  /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */
 
-/* When allocate global memory in chunks. We may not be able to
- * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks;
- * Since some bytes may be needed for aligning chunks of memory;
- * This is the amount of memory that we dedicate for that purpose.
- */
-#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
+#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
 
 struct OpenCLPlatformDevice {
 	OpenCLPlatformDevice(cl_platform_id platform_id,
@@ -86,10 +89,65 @@ public:
 	                                   string *error = NULL);
 	static bool device_version_check(cl_device_id device,
 	                                 string *error = NULL);
-	static string get_hardware_id(string platform_name,
+	static string get_hardware_id(const string& platform_name,
 	                              cl_device_id device_id);
 	static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
 	                               bool force_all = false);
+	static bool use_single_program();
+
+	/* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */
+
+	/* Platform information. */
+	static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL);
+	static cl_uint get_num_platforms();
+
+	static bool get_platforms(vector<cl_platform_id> *platform_ids,
+	                          cl_int *error = NULL);
+	static vector<cl_platform_id> get_platforms();
+
+	static bool get_platform_name(cl_platform_id platform_id,
+	                              string *platform_name);
+	static string get_platform_name(cl_platform_id platform_id);
+
+	static bool get_num_platform_devices(cl_platform_id platform_id,
+	                                     cl_device_type device_type,
+	                                     cl_uint *num_devices,
+	                                     cl_int *error = NULL);
+	static cl_uint get_num_platform_devices(cl_platform_id platform_id,
+	                                        cl_device_type device_type);
+
+	static bool get_platform_devices(cl_platform_id platform_id,
+	                                 cl_device_type device_type,
+	                                 vector<cl_device_id> *device_ids,
+	                                 cl_int* error = NULL);
+	static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id,
+	                                                 cl_device_type device_type);
+
+	/* Device information. */
+	static bool get_device_name(cl_device_id device_id,
+	                            string *device_name,
+	                            cl_int* error = NULL);
+
+	static string get_device_name(cl_device_id device_id);
+
+	static bool get_device_type(cl_device_id device_id,
+	                            cl_device_type *device_type,
+	                            cl_int* error = NULL);
+	static cl_device_type get_device_type(cl_device_id device_id);
+
+	static bool get_driver_version(cl_device_id device_id,
+	                               int *major,
+	                               int *minor,
+	                               cl_int* error = NULL);
+
+	static int mem_sub_ptr_alignment(cl_device_id device_id);
+
+	/* Get somewhat more readable device name.
+	 * Main difference is AMD OpenCL here which only gives code name
+	 * for the regular device name. This will give more sane device
+	 * name using some extensions.
+	 */
+	static string get_readable_device_name(cl_device_id device_id);
 };
 
 /* Thread safe cache for contexts and programs.
@@ -168,12 +226,24 @@ public:
 	static string get_kernel_md5();
 };
 
+#define opencl_device_assert(device, stmt) \
+	{ /* Evaluate an OpenCL call once and report any result other than CL_SUCCESS via `device`. */ \
+		cl_int err = stmt; \
+		/* On failure: only the first error is stored on the device, but every error is printed. */ \
+		if(err != CL_SUCCESS) { \
+			string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
+			if((device)->error_message() == "") \
+				(device)->set_error(message); \
+			fprintf(stderr, "%s\n", message.c_str()); \
+		} \
+	} (void)0
+
 #define opencl_assert(stmt) \
 	{ \
 		cl_int err = stmt; \
 		\
 		if(err != CL_SUCCESS) { \
-			string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
+			string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
 			if(error_msg == "") \
 				error_msg = message; \
 			fprintf(stderr, "%s\n", message.c_str()); \
@@ -194,17 +264,17 @@ public:
 	public:
 		OpenCLProgram() : loaded(false), device(NULL) {}
 		OpenCLProgram(OpenCLDeviceBase *device,
-		              string program_name,
-		              string kernel_name,
-		              string kernel_build_options,
+		              const string& program_name,
+		              const string& kernel_name,
+		              const string& kernel_build_options,
 		              bool use_stdout = true);
 		~OpenCLProgram();
 
 		void add_kernel(ustring name);
 		void load();
 
-		bool is_loaded()    { return loaded; }
-		string get_log()    { return log; }
+		bool is_loaded() const { return loaded; }
+		const string& get_log() const { return log; }
 		void report_error();
 
 		cl_kernel operator()();
@@ -218,8 +288,8 @@ public:
 		bool load_binary(const string& clbin, const string *debug_src = NULL);
 		bool save_binary(const string& clbin);
 
-		void add_log(string msg, bool is_debug);
-		void add_error(string msg);
+		void add_log(const string& msg, bool is_debug);
+		void add_error(const string& msg);
 
 		bool loaded;
 		cl_program program;
@@ -237,7 +307,7 @@ public:
 		map<ustring, cl_kernel> kernels;
 	};
 
-	OpenCLProgram base_program;
+	OpenCLProgram base_program, denoising_program;
 
 	typedef map<string, device_vector<uchar>*> ConstMemMap;
 	typedef map<string, device_ptr> MemMap;
@@ -248,6 +318,7 @@ public:
 
 	bool device_initialized;
 	string platform_name;
+	string device_name;
 
 	bool opencl_error(cl_int err);
 	void opencl_error(const string& message);
@@ -266,28 +337,33 @@ public:
 
 	/* Has to be implemented by the real device classes.
 	 * The base device will then load all these programs. */
-	virtual void load_kernels(const DeviceRequestedFeatures& requested_features,
+	virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
 	                          vector<OpenCLProgram*> &programs) = 0;
 
-	void mem_alloc(device_memory& mem, MemoryType type);
+	void mem_alloc(device_memory& mem);
 	void mem_copy_to(device_memory& mem);
 	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
 	void mem_zero(device_memory& mem);
 	void mem_free(device_memory& mem);
+
+	int mem_sub_ptr_alignment();
+
 	void const_copy_to(const char *name, void *host, size_t size);
-	void tex_alloc(const char *name,
-	               device_memory& mem,
-	               InterpolationType /*interpolation*/,
-	               ExtensionType /*extension*/);
+	void tex_alloc(device_memory& mem);
 	void tex_free(device_memory& mem);
 
 	size_t global_size_round_up(int group_size, int global_size);
-	void enqueue_kernel(cl_kernel kernel, size_t w, size_t h);
+	void enqueue_kernel(cl_kernel kernel, size_t w, size_t h,
+	                    bool x_workgroups = false,
+	                    size_t max_workgroup_size = -1);
 	void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
+	void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
 
 	void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
 	void shader(DeviceTask& task);
 
+	void denoise(RenderTile& tile, DenoisingTask& denoising, const DeviceTask& task);
+
 	class OpenCLDeviceTask : public DeviceTask {
 	public:
 		OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
@@ -321,21 +397,91 @@ public:
 
 	virtual void thread_run(DeviceTask * /*task*/) = 0;
 
+	virtual bool is_split_kernel() = 0;
+
 protected:
 	string kernel_build_options(const string *debug_src = NULL);
 
+	void mem_zero_kernel(device_ptr ptr, size_t size);
+
+	bool denoising_non_local_means(device_ptr image_ptr,
+	                               device_ptr guide_ptr,
+	                               device_ptr variance_ptr,
+	                               device_ptr out_ptr,
+	                               DenoisingTask *task);
+	bool denoising_construct_transform(DenoisingTask *task);
+	bool denoising_reconstruct(device_ptr color_ptr,
+	                           device_ptr color_variance_ptr,
+	                           device_ptr output_ptr,
+	                           DenoisingTask *task);
+	bool denoising_combine_halves(device_ptr a_ptr,
+	                              device_ptr b_ptr,
+	                              device_ptr mean_ptr,
+	                              device_ptr variance_ptr,
+	                              int r, int4 rect,
+	                              DenoisingTask *task);
+	bool denoising_divide_shadow(device_ptr a_ptr,
+	                             device_ptr b_ptr,
+	                             device_ptr sample_variance_ptr,
+	                             device_ptr sv_variance_ptr,
+	                             device_ptr buffer_variance_ptr,
+	                             DenoisingTask *task);
+	bool denoising_get_feature(int mean_offset,
+	                           int variance_offset,
+	                           device_ptr mean_ptr,
+	                           device_ptr variance_ptr,
+	                           DenoisingTask *task);
+	bool denoising_detect_outliers(device_ptr image_ptr,
+	                               device_ptr variance_ptr,
+	                               device_ptr depth_ptr,
+	                               device_ptr output_ptr,
+	                               DenoisingTask *task);
+	bool denoising_set_tiles(device_ptr *buffers,
+	                         DenoisingTask *task);
+
+	device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size);
+	void mem_free_sub_ptr(device_ptr ptr);
+
 	class ArgumentWrapper {
 	public:
-		ArgumentWrapper() : size(0), pointer(NULL) {}
-		template <typename T>
+		ArgumentWrapper() : size(0), pointer(NULL)
+		{
+		}
+
+		ArgumentWrapper(device_memory& argument) : size(sizeof(void*)),
+		                                           pointer((void*)(&argument.device_pointer))
+		{
+		}
+
+		template<typename T>
+		ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)),
+		                                              pointer((void*)(&argument.device_pointer))
+		{
+		}
+
+		template<typename T>
+		ArgumentWrapper(device_only_memory<T>& argument) : size(sizeof(void*)),
+		                                                   pointer((void*)(&argument.device_pointer))
+		{
+		}
+		template<typename T>
 		ArgumentWrapper(T& argument) : size(sizeof(argument)),
-		                               pointer(&argument) { }
+		                               pointer(&argument)
+		{
+		}
+
 		ArgumentWrapper(int argument) : size(sizeof(int)),
 		                                int_value(argument),
-		                                pointer(&int_value) { }
+		                                pointer(&int_value)
+		{
+		}
+
 		ArgumentWrapper(float argument) : size(sizeof(float)),
 		                                  float_value(argument),
-		                                  pointer(&float_value) { }
+		                                  pointer(&float_value)
+		{
+		}
+
 		size_t size;
 		int int_value;
 		float float_value;
@@ -398,6 +544,21 @@ protected:
 
 	virtual string build_options_for_base_program(
 	        const DeviceRequestedFeatures& /*requested_features*/);
+
+private:
+	MemoryManager memory_manager;
+	friend class MemoryManager;
+
+	static_assert_align(TextureInfo, 16);
+	device_vector<TextureInfo> texture_info;
+
+	typedef map<string, device_memory*> TexturesMap;
+	TexturesMap textures;
+
+	bool textures_need_update;
+
+protected:
+	void flush_texture_buffers();
 };
 
 Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background);
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index a2b900312e7..bfa2702ad62 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -16,18 +16,29 @@
 
 #ifdef WITH_OPENCL
 
-#include "opencl.h"
+#include "device/opencl/opencl.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_md5.h"
-#include "util_path.h"
-#include "util_time.h"
+#include "util/util_algorithm.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
+struct texture_slot_t {
+	/* Plain pair of a texture name and an integer slot. */
+	string name;
+	int slot;
+
+	texture_slot_t(const string& name, int slot)
+	: name(name), slot(slot) {}
+};
+
 bool OpenCLDeviceBase::opencl_error(cl_int err)
 {
 	if(err != CL_SUCCESS) {
@@ -62,7 +73,9 @@ void OpenCLDeviceBase::opencl_assert_err(cl_int err, const char* where)
 }
 
 OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
-: Device(info, stats, background_)
+: Device(info, stats, background_),
+  memory_manager(this),
+  texture_info(this, "__texture_info", MEM_TEXTURE)
 {
 	cpPlatform = NULL;
 	cdDevice = NULL;
@@ -70,6 +83,7 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
 	cqCommandQueue = NULL;
 	null_mem = 0;
 	device_initialized = false;
+	textures_need_update = true;
 
 	vector<OpenCLPlatformDevice> usable_devices;
 	OpenCLInfo::get_usable_devices(&usable_devices);
@@ -82,9 +96,10 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
 	cpPlatform = platform_device.platform_id;
 	cdDevice = platform_device.device_id;
 	platform_name = platform_device.platform_name;
+	device_name = platform_device.device_name;
 	VLOG(2) << "Creating new Cycles device for OpenCL platform "
 	        << platform_name << ", device "
-	        << platform_device.device_name << ".";
+	        << device_name << ".";
 
 	{
 		/* try to use cached context */
@@ -113,12 +128,20 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
 	}
 
 	cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
-	if(opencl_error(ciErr))
+	if(opencl_error(ciErr)) {
+		opencl_error("OpenCL: Error creating command queue");
 		return;
+	}
 
 	null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
-	if(opencl_error(ciErr))
+	if(opencl_error(ciErr)) {
+		opencl_error("OpenCL: Error creating memory buffer for NULL");
 		return;
+	}
+
+	/* Allocate this right away so that texture_info is placed at offset 0 in the device memory buffers */
+	texture_info.resize(1);
+	memory_manager.alloc("texture_info", texture_info);
 
 	fprintf(stderr, "Device init success\n");
 	device_initialized = true;
@@ -128,12 +151,13 @@ OpenCLDeviceBase::~OpenCLDeviceBase()
 {
 	task_pool.stop();
 
+	memory_manager.free();
+
 	if(null_mem)
 		clReleaseMemObject(CL_MEM_PTR(null_mem));
 
 	ConstMemMap::iterator mt;
 	for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
-		mem_free(*(mt->second));
 		delete mt->second;
 	}
 
@@ -147,10 +171,8 @@ OpenCLDeviceBase::~OpenCLDeviceBase()
 void CL_CALLBACK OpenCLDeviceBase::context_notify_callback(const char *err_info,
 	const void * /*private_info*/, size_t /*cb*/, void *user_data)
 {
-	char name[256];
-	clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
-
-	fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info);
+	string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);  /* user_data is interpreted as a cl_device_id. */
+	fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
 }
 
 bool OpenCLDeviceBase::opencl_version_check()
@@ -191,6 +213,8 @@ string OpenCLDeviceBase::device_md5_hash(string kernel_custom_build_options)
 
 bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_features)
 {
+	VLOG(2) << "Loading kernels for platform " << platform_name
+	        << ", device " << device_name << ".";
 	/* Verify if device was initialized. */
 	if(!device_initialized) {
 		fprintf(stderr, "OpenCL: failed to initialize device.\n");
@@ -204,13 +228,33 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
 	base_program = OpenCLProgram(this, "base", "kernel.cl", build_options_for_base_program(requested_features));
 	base_program.add_kernel(ustring("convert_to_byte"));
 	base_program.add_kernel(ustring("convert_to_half_float"));
-	base_program.add_kernel(ustring("shader"));
+	base_program.add_kernel(ustring("displace"));
+	base_program.add_kernel(ustring("background"));
 	base_program.add_kernel(ustring("bake"));
+	base_program.add_kernel(ustring("zero_buffer"));
+
+	denoising_program = OpenCLProgram(this, "denoising", "filter.cl", "");
+	denoising_program.add_kernel(ustring("filter_divide_shadow"));
+	denoising_program.add_kernel(ustring("filter_get_feature"));
+	denoising_program.add_kernel(ustring("filter_detect_outliers"));
+	denoising_program.add_kernel(ustring("filter_combine_halves"));
+	denoising_program.add_kernel(ustring("filter_construct_transform"));
+	denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
+	denoising_program.add_kernel(ustring("filter_nlm_blur"));
+	denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
+	denoising_program.add_kernel(ustring("filter_nlm_update_output"));
+	denoising_program.add_kernel(ustring("filter_nlm_normalize"));
+	denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
+	denoising_program.add_kernel(ustring("filter_finalize"));
+	denoising_program.add_kernel(ustring("filter_set_tiles"));
 
 	vector<OpenCLProgram*> programs;
 	programs.push_back(&base_program);
+	programs.push_back(&denoising_program);
 	/* Call actual class to fill the vector with its programs. */
-	load_kernels(requested_features, programs);
+	if(!load_kernels(requested_features, programs)) {
+		return false;
+	}
 
 	/* Parallel compilation is supported by Cycles, but currently all OpenCL frameworks
 	 * serialize the calls internally, so it's not much use right now.
@@ -242,17 +286,40 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
 	return true;
 }
 
-void OpenCLDeviceBase::mem_alloc(device_memory& mem, MemoryType type)
+void OpenCLDeviceBase::mem_alloc(device_memory& mem)
 {
+	if(mem.name) {
+		VLOG(1) << "Buffer allocate: " << mem.name << ", "
+			    << string_human_readable_number(mem.memory_size()) << " bytes. ("
+			    << string_human_readable_size(mem.memory_size()) << ")";
+	}
+
 	size_t size = mem.memory_size();
 
+	/* check there is enough memory available for the allocation */
+	cl_ulong max_alloc_size = 0;
+	clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
+
+	if(DebugFlags().opencl.mem_limit) {
+		max_alloc_size = min(max_alloc_size,
+		                     cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used));
+	}
+
+	if(size > max_alloc_size) {
+		string error = "Scene too complex to fit in available memory.";
+		if(mem.name != NULL) {
+			error += string_printf(" (allocating buffer %s failed.)", mem.name);
+		}
+		set_error(error);
+
+		return;
+	}
+
 	cl_mem_flags mem_flag;
 	void *mem_ptr = NULL;
 
-	if(type == MEM_READ_ONLY)
+	if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
 		mem_flag = CL_MEM_READ_ONLY;
-	else if(type == MEM_WRITE_ONLY)
-		mem_flag = CL_MEM_WRITE_ONLY;
 	else
 		mem_flag = CL_MEM_READ_WRITE;
 
@@ -279,17 +346,27 @@ void OpenCLDeviceBase::mem_alloc(device_memory& mem, MemoryType type)
 
 void OpenCLDeviceBase::mem_copy_to(device_memory& mem)
 {
-	/* this is blocking */
-	size_t size = mem.memory_size();
-	if(size != 0) {
-		opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
-		                                   CL_MEM_PTR(mem.device_pointer),
-		                                   CL_TRUE,
-		                                   0,
-		                                   size,
-		                                   (void*)mem.data_pointer,
-		                                   0,
-		                                   NULL, NULL));
+	if(mem.type == MEM_TEXTURE) {  /* Textures are freed and allocated again instead of being written in place. */
+		tex_free(mem);
+		tex_alloc(mem);
+	}
+	else {
+		if(!mem.device_pointer) {  /* Allocate on first upload when no device pointer exists yet. */
+			mem_alloc(mem);
+		}
+
+		/* this is blocking */
+		size_t size = mem.memory_size();
+		if(size != 0) {
+			opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
+			                                   CL_MEM_PTR(mem.device_pointer),
+			                                   CL_TRUE,
+			                                   0,
+			                                   size,
+			                                   mem.host_pointer,
+			                                   0,
+			                                   NULL, NULL));
+		}
 	}
 }
 
@@ -303,76 +380,178 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in
 	                                  CL_TRUE,
 	                                  offset,
 	                                  size,
-	                                  (uchar*)mem.data_pointer + offset,
+	                                  (uchar*)mem.host_pointer + offset,
 	                                  0,
 	                                  NULL, NULL));
 }
 
+void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size)
+{
+	cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
+	/* Enqueues the "zero_buffer" kernel over the first `size` bytes of `mem`, one chunk per pass. */
+	size_t global_size[] = {1024, 1024};
+	size_t num_threads = global_size[0] * global_size[1];
+
+	cl_mem d_buffer = CL_MEM_PTR(mem);
+	cl_ulong d_offset = 0;
+	cl_ulong d_size = 0;
+	/* A pass covers at most num_threads*sizeof(float4) bytes; loop until `size` is reached. */
+	while(d_offset < size) {
+		d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset);
+		/* Arguments from index 0: buffer, chunk size in bytes, chunk start offset in bytes. */
+		kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
+		/* 2D range with no global offset; the local size is left to the OpenCL runtime (NULL). */
+		ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
+		                               ckZeroBuffer,
+		                               2,
+		                               NULL,
+		                               global_size,
+		                               NULL,
+		                               0,
+		                               NULL,
+		                               NULL);
+		opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
+
+		d_offset += d_size;
+	}
+}
+
 void OpenCLDeviceBase::mem_zero(device_memory& mem)
 {
+	if(!mem.device_pointer) {
+		mem_alloc(mem);
+	}
+	/* mem_alloc() can return without allocating (it reports an error instead), so re-check. */
 	if(mem.device_pointer) {
-		memset((void*)mem.data_pointer, 0, mem.memory_size());
-		mem_copy_to(mem);
+		if(base_program.is_loaded()) {
+			mem_zero_kernel(mem.device_pointer, mem.memory_size());
+		}
+		/* Keep the host copy, when there is one, zeroed as well. */
+		if(mem.host_pointer) {
+			memset(mem.host_pointer, 0, mem.memory_size());
+		}
+		/* Without the base program the zero kernel is unavailable: upload zeros from the host. */
+		if(!base_program.is_loaded()) {
+			void* zero = mem.host_pointer;
+
+			if(!mem.host_pointer) {
+				zero = util_aligned_malloc(mem.memory_size(), 16);
+				memset(zero, 0, mem.memory_size());
+			}
+			/* Blocking write (CL_TRUE), so a temporary zero buffer can be freed right after. */
+			opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
+			                                   CL_MEM_PTR(mem.device_pointer),
+			                                   CL_TRUE,
+			                                   0,
+			                                   mem.memory_size(),
+			                                   zero,
+			                                   0,
+			                                   NULL, NULL));
+
+			if(!mem.host_pointer) {
+				util_aligned_free(zero);
+			}
+		}
 	}
 }
 
 void OpenCLDeviceBase::mem_free(device_memory& mem)
 {
-	if(mem.device_pointer) {
-		if(mem.device_pointer != null_mem) {
-			opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
+	if(mem.type == MEM_TEXTURE) {
+		tex_free(mem);
+	}
+	else {
+		if(mem.device_pointer) {
+			if(mem.device_pointer != null_mem) {
+				opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
+			}
+			mem.device_pointer = 0;
+
+			stats.mem_free(mem.device_size);
+			mem.device_size = 0;
 		}
-		mem.device_pointer = 0;
+	}
+}
+
+int OpenCLDeviceBase::mem_sub_ptr_alignment()
+{
+	return OpenCLInfo::mem_sub_ptr_alignment(cdDevice);  /* Forwarded to the OpenCLInfo query for this device. */
+}
+
+device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size)
+{
+	cl_mem_flags mem_flag;  /* Same read-only / read-write selection that mem_alloc() uses. */
+	if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
+		mem_flag = CL_MEM_READ_ONLY;
+	else
+		mem_flag = CL_MEM_READ_WRITE;
+	/* offset/size go through memory_elements_size() (presumably element counts -> bytes; confirm). */
+	cl_buffer_region info;
+	info.origin = mem.memory_elements_size(offset);
+	info.size = mem.memory_elements_size(size);
+
+	device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer),
+	                                                    mem_flag,
+	                                                    CL_BUFFER_CREATE_TYPE_REGION,
+	                                                    &info,
+	                                                    &ciErr);
+	opencl_assert_err(ciErr, "clCreateSubBuffer");
+	return sub_buf;
+}
 
-		stats.mem_free(mem.device_size);
-		mem.device_size = 0;
+void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer)
+{
+	if(device_pointer && device_pointer != null_mem) {
+		opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
 	}
 }
 
 void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
 {
 	ConstMemMap::iterator i = const_mem_map.find(name);
+	device_vector<uchar> *data;  /* Entry for `name`: created below on first use, reused afterwards. */
 
 	if(i == const_mem_map.end()) {
-		device_vector<uchar> *data = new device_vector<uchar>();
-		data->copy((uchar*)host, size);
-
-		mem_alloc(*data, MEM_READ_ONLY);
-		i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first;
+		data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
+		data->alloc(size);
+		const_mem_map.insert(ConstMemMap::value_type(name, data));
 	}
 	else {
-		device_vector<uchar> *data = i->second;
-		data->copy((uchar*)host, size);
+		data = i->second;  /* NOTE(review): not resized here; assumes `size` matches the first call - confirm. */
 	}
 
-	mem_copy_to(*i->second);
+	memcpy(data->data(), host, size);
+	data->copy_to_device();
 }
 
-void OpenCLDeviceBase::tex_alloc(const char *name,
-               device_memory& mem,
-               InterpolationType /*interpolation*/,
-               ExtensionType /*extension*/)
+void OpenCLDeviceBase::tex_alloc(device_memory& mem)
 {
-	VLOG(1) << "Texture allocate: " << name << ", "
+	VLOG(1) << "Texture allocate: " << mem.name << ", "
 	        << string_human_readable_number(mem.memory_size()) << " bytes. ("
 	        << string_human_readable_size(mem.memory_size()) << ")";
-	mem_alloc(mem, MEM_READ_ONLY);
-	mem_copy_to(mem);
-	assert(mem_map.find(name) == mem_map.end());
-	mem_map.insert(MemMap::value_type(name, mem.device_pointer));
+
+	memory_manager.alloc(mem.name, mem);
+	/* Set the pointer to non-null so that code inspecting its value does not think it is unallocated. */
+	mem.device_pointer = 1;
+	textures[mem.name] = &mem;
+	textures_need_update = true;
 }
 
 void OpenCLDeviceBase::tex_free(device_memory& mem)
 {
 	if(mem.device_pointer) {
-		foreach(const MemMap::value_type& value, mem_map) {
-			if(value.second == mem.device_pointer) {
-				mem_map.erase(value.first);
+		mem.device_pointer = 0;
+		/* The pointer was only the placeholder set by tex_alloc(); storage is released via memory_manager. */
+		if(memory_manager.free(mem)) {
+			textures_need_update = true;
+		}
+		/* Drop the name -> memory entry registered by tex_alloc(); stop at the first match. */
+		foreach(TexturesMap::value_type& value, textures) {
+			if(value.second == &mem) {
+				textures.erase(value.first);
 				break;
 			}
 		}
-
-		mem_free(mem);
 	}
 }
 
@@ -382,7 +561,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
 	return global_size + ((r == 0)? 0: group_size - r);
 }
 
-void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
+void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
 {
 	size_t workgroup_size, max_work_items[3];
 
@@ -391,9 +570,20 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
 	clGetDeviceInfo(cdDevice,
 		CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
 
+	if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
+		workgroup_size = max_workgroup_size;
+	}
+
 	/* Try to divide evenly over 2 dimensions. */
-	size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
-	size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
+	size_t local_size[2];
+	if(x_workgroups) {
+		local_size[0] = workgroup_size;
+		local_size[1] = 1;
+	}
+	else {
+		size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
+		local_size[0] = local_size[1] = sqrt_workgroup_size;
+	}
 
 	/* Some implementations have max size 1 on 2nd dimension. */
 	if(local_size[1] > max_work_items[1]) {
@@ -434,6 +624,76 @@ void OpenCLDeviceBase::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const
 	opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
 }
 
+void OpenCLDeviceBase::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
+{
+	flush_texture_buffers();
+	/* Texture descriptors are refreshed first; the memory manager then sets its arguments from *narg. */
+	memory_manager.set_kernel_arg_buffers(kernel, narg);
+}
+
+void OpenCLDeviceBase::flush_texture_buffers()
+{
+	if(!textures_need_update) {
+		return;
+	}
+	textures_need_update = false;
+
+	/* Setup slots for textures. */
+	int num_slots = 0;
+
+	vector<texture_slot_t> texture_slots;
+	/* Every KERNEL_TEX entry consumes one slot; only names present in `textures` get a descriptor. */
+#define KERNEL_TEX(type, name) \
+	if(textures.find(#name) != textures.end()) { \
+		texture_slots.push_back(texture_slot_t(#name, num_slots)); \
+	} \
+	num_slots++;
+#include "kernel/kernel_textures.h"
+	/* "__tex_image" textures go after the data slots, at the index given by the number after the last '_'. */
+	int num_data_slots = num_slots;
+
+	foreach(TexturesMap::value_type& tex, textures) {
+		string name = tex.first;
+
+		if(string_startswith(name, "__tex_image")) {
+			int pos = name.rfind("_");
+			int id = atoi(name.data() + pos + 1);
+			texture_slots.push_back(texture_slot_t(name,
+				                                   num_data_slots + id));
+			num_slots = max(num_slots, num_data_slots + id + 1);
+		}
+	}
+
+	/* Realloc texture descriptors buffer. */
+	memory_manager.free(texture_info);
+	texture_info.resize(num_slots);
+	memory_manager.alloc("texture_info", texture_info);
+
+	/* Fill in descriptors */
+	foreach(texture_slot_t& slot, texture_slots) {
+		TextureInfo& info = texture_info[slot.slot];
+
+		MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
+		info.data = desc.offset;
+		info.cl_buffer = desc.device_buffer;
+		/* Only image textures carry dimensions and interpolation/extension settings. */
+		if(string_startswith(slot.name, "__tex_image")) {
+			device_memory *mem = textures[slot.name];
+
+			info.width = mem->data_width;
+			info.height = mem->data_height;
+			info.depth = mem->data_depth;
+
+			info.interpolation = mem->interpolation;
+			info.extension = mem->extension;
+		}
+	}
+
+	/* Force write of descriptors. */
+	memory_manager.free(texture_info);
+	memory_manager.alloc("texture_info", texture_info);
+}
+
 void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
 {
 	/* cast arguments to cl types */
@@ -458,10 +718,7 @@ void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_
 		                d_rgba,
 		                d_buffer);
 
-#define KERNEL_TEX(type, ttype, name) \
-set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
-#include "kernel_textures.h"
-#undef KERNEL_TEX
+	set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
 
 	start_arg_index += kernel_set_args(ckFilmConvertKernel,
 	                                   start_arg_index,
@@ -476,13 +733,381 @@ set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
 	enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
 }
 
+bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
+                                                 device_ptr guide_ptr,
+                                                 device_ptr variance_ptr,
+                                                 device_ptr out_ptr,
+                                                 DenoisingTask *task)
+{
+	/* Queues the filter_nlm_* kernels over all (2r+1)^2 shifts, then filter_nlm_normalize. Always returns true. */
+	int stride = task->buffer.stride;
+	int w = task->buffer.width;
+	int h = task->buffer.h;
+	int r = task->nlm_state.r;
+	int f = task->nlm_state.f;
+	float a = task->nlm_state.a;
+	float k_2 = task->nlm_state.k_2;
+	/* The scratch byte count is kept in size_t: large tiles/radii do not fit in an int. */
+	int shift_stride = stride*h;
+	int num_shifts = (2*r+1)*(2*r+1);
+	size_t mem_size = sizeof(float)*shift_stride*num_shifts;
+
+	cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
+	/* Two scratch buffers of mem_size bytes each; both are released after the shift kernels are queued. */
+	cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+	opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means");
+	cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+	opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means");
+
+	cl_mem image_mem = CL_MEM_PTR(image_ptr);
+	cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
+	cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+	cl_mem out_mem = CL_MEM_PTR(out_ptr);
+	/* The weight accumulator and the output are passed through mem_zero_kernel() before use. */
+	mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h);
+	mem_zero_kernel(out_ptr, sizeof(float)*w*h);
+
+	cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
+	cl_kernel ckNLMBlur           = denoising_program(ustring("filter_nlm_blur"));
+	cl_kernel ckNLMCalcWeight     = denoising_program(ustring("filter_nlm_calc_weight"));
+	cl_kernel ckNLMUpdateOutput   = denoising_program(ustring("filter_nlm_update_output"));
+	cl_kernel ckNLMNormalize      = denoising_program(ustring("filter_nlm_normalize"));
+
+	kernel_set_args(ckNLMCalcDifference, 0,
+	                guide_mem,
+	                variance_mem,
+	                difference,
+	                w, h, stride,
+	                shift_stride,
+	                r, 0, a, k_2);
+	kernel_set_args(ckNLMBlur, 0,
+	                difference,
+	                blurDifference,
+	                w, h, stride,
+	                shift_stride,
+	                r, f);
+	kernel_set_args(ckNLMCalcWeight, 0,
+	                blurDifference,
+	                difference,
+	                w, h, stride,
+	                shift_stride,
+	                r, f);
+	kernel_set_args(ckNLMUpdateOutput, 0,
+	                blurDifference,
+	                image_mem,
+	                out_mem,
+	                weightAccum,
+	                w, h, stride,
+	                shift_stride,
+	                r, f);
+	/* Shift kernels use a (w*h) x num_shifts range with x-only workgroups; Blur is queued twice. */
+	enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
+	enqueue_kernel(ckNLMBlur,           w*h, num_shifts, true);
+	enqueue_kernel(ckNLMCalcWeight,     w*h, num_shifts, true);
+	enqueue_kernel(ckNLMBlur,           w*h, num_shifts, true);
+	enqueue_kernel(ckNLMUpdateOutput,   w*h, num_shifts, true);
+
+	opencl_assert(clReleaseMemObject(difference));
+	opencl_assert(clReleaseMemObject(blurDifference));
+
+	kernel_set_args(ckNLMNormalize, 0,
+	                out_mem, weightAccum, w, h, stride);
+	enqueue_kernel(ckNLMNormalize, w, h);
+
+	return true;
+}
+
+bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
+{
+	cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
+	cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
+	cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+	/* Queues "filter_construct_transform" over a storage.w x storage.h range. Always returns true. */
+	cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
+
+	kernel_set_args(ckFilterConstructTransform, 0,
+	                buffer_mem,
+	                transform_mem,
+	                rank_mem,
+	                task->filter_area,
+	                task->rect,
+	                task->buffer.pass_stride,
+	                task->radius,
+	                task->pca_threshold);
+	/* 256 is the workgroup-size cap (5th argument); the 4th is the bool x_workgroups, so pass it explicitly. */
+	enqueue_kernel(ckFilterConstructTransform,
+	               task->storage.w,
+	               task->storage.h,
+	               false, 256);
+
+	return true;
+}
+
+bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
+                                             device_ptr color_variance_ptr,
+                                             device_ptr output_ptr,
+                                             DenoisingTask *task)
+{
+	mem_zero(task->storage.XtWX);
+	mem_zero(task->storage.XtWY);
+	/* Queues the NLM weight kernels and filter_nlm_construct_gramian per shift, then filter_finalize. Always returns true. */
+	cl_mem color_mem = CL_MEM_PTR(color_ptr);
+	cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
+	cl_mem output_mem = CL_MEM_PTR(output_ptr);
+
+	cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
+	cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
+	cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+	cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
+	cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
+
+	cl_kernel ckNLMCalcDifference   = denoising_program(ustring("filter_nlm_calc_difference"));
+	cl_kernel ckNLMBlur             = denoising_program(ustring("filter_nlm_blur"));
+	cl_kernel ckNLMCalcWeight       = denoising_program(ustring("filter_nlm_calc_weight"));
+	cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
+	cl_kernel ckFinalize            = denoising_program(ustring("filter_finalize"));
+
+	int w = task->reconstruction_state.source_w;
+	int h = task->reconstruction_state.source_h;
+	int stride = task->buffer.stride;
+	/* The scratch byte count is kept in size_t: large tiles/radii do not fit in an int. */
+	int shift_stride = stride*h;
+	int num_shifts = (2*task->radius + 1)*(2*task->radius + 1);
+	size_t mem_size = sizeof(float)*shift_stride*num_shifts;
+	/* Two scratch buffers of mem_size bytes each; both are released after the shift kernels are queued. */
+	cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+	opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
+	cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+	opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
+
+	kernel_set_args(ckNLMCalcDifference, 0,
+	                color_mem,
+	                color_variance_mem,
+	                difference,
+	                w, h, stride,
+	                shift_stride,
+	                task->radius,
+	                task->buffer.pass_stride,
+	                1.0f, task->nlm_k_2);
+	kernel_set_args(ckNLMBlur, 0,
+	                difference,
+	                blurDifference,
+	                w, h, stride,
+	                shift_stride,
+	                task->radius, 4);
+	kernel_set_args(ckNLMCalcWeight, 0,
+	                blurDifference,
+	                difference,
+	                w, h, stride,
+	                shift_stride,
+	                task->radius, 4);
+	kernel_set_args(ckNLMConstructGramian, 0,
+	                blurDifference,
+	                buffer_mem,
+	                transform_mem,
+	                rank_mem,
+	                XtWX_mem,
+	                XtWY_mem,
+	                task->reconstruction_state.filter_window,
+	                w, h, stride,
+	                shift_stride,
+	                task->radius, 4,
+	                task->buffer.pass_stride);
+	/* (w*h) x num_shifts range with x-only workgroups; only the Gramian kernel is capped at 256 work-items. */
+	enqueue_kernel(ckNLMCalcDifference,   w*h, num_shifts, true);
+	enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
+	enqueue_kernel(ckNLMCalcWeight,       w*h, num_shifts, true);
+	enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
+	enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256);
+
+	opencl_assert(clReleaseMemObject(difference));
+	opencl_assert(clReleaseMemObject(blurDifference));
+
+	kernel_set_args(ckFinalize, 0,
+	                output_mem,
+	                rank_mem,
+	                XtWX_mem,
+	                XtWY_mem,
+	                task->filter_area,
+	                task->reconstruction_state.buffer_params,
+	                task->render_buffer.samples);
+	enqueue_kernel(ckFinalize, w, h);
+
+	return true;
+}
+
+bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr,
+                                                device_ptr b_ptr,
+                                                device_ptr mean_ptr,
+                                                device_ptr variance_ptr,
+                                                int r, int4 rect,
+                                                DenoisingTask *task)
+{
+	cl_mem a_mem = CL_MEM_PTR(a_ptr);
+	cl_mem b_mem = CL_MEM_PTR(b_ptr);
+	cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
+	cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+	/* NOTE(review): the kernel receives `rect`, but the launch size below uses task->rect - confirm they always match. */
+	cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
+	/* Kernel argument order is mean, variance, a, b - not the order of this function's parameters. */
+	kernel_set_args(ckFilterCombineHalves, 0,
+	                mean_mem,
+	                variance_mem,
+	                a_mem,
+	                b_mem,
+	                rect,
+	                r);
+	enqueue_kernel(ckFilterCombineHalves,
+	               task->rect.z-task->rect.x,
+	               task->rect.w-task->rect.y);
+
+	return true;
+}
+
+bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
+                                               device_ptr b_ptr,
+                                               device_ptr sample_variance_ptr,
+                                               device_ptr sv_variance_ptr,
+                                               device_ptr buffer_variance_ptr,
+                                               DenoisingTask *task)
+{
+	cl_mem a_mem = CL_MEM_PTR(a_ptr);
+	cl_mem b_mem = CL_MEM_PTR(b_ptr);
+	cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
+	cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
+	cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
+	/* Tile buffer of the task; denoising_set_tiles() copies it to the device. */
+	cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
+
+	cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
+	/* Launch range is task->rect: width z-x, height w-y. Always returns true. */
+	kernel_set_args(ckFilterDivideShadow, 0,
+	                task->render_buffer.samples,
+	                tiles_mem,
+	                a_mem,
+	                b_mem,
+	                sample_variance_mem,
+	                sv_variance_mem,
+	                buffer_variance_mem,
+	                task->rect,
+	                task->render_buffer.pass_stride,
+	                task->render_buffer.denoising_data_offset);
+	enqueue_kernel(ckFilterDivideShadow,
+	               task->rect.z-task->rect.x,
+	               task->rect.w-task->rect.y);
+
+	return true;
+}
+
+bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
+                                             int variance_offset,
+                                             device_ptr mean_ptr,
+                                             device_ptr variance_ptr,
+                                             DenoisingTask *task)
+{
+	cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
+	cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+	/* Tile buffer of the task; denoising_set_tiles() copies it to the device. */
+	cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
+
+	cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
+	/* Launch range is task->rect: width z-x, height w-y. Always returns true. */
+	kernel_set_args(ckFilterGetFeature, 0,
+	                task->render_buffer.samples,
+	                tiles_mem,
+	                mean_offset,
+	                variance_offset,
+	                mean_mem,
+	                variance_mem,
+	                task->rect,
+	                task->render_buffer.pass_stride,
+	                task->render_buffer.denoising_data_offset);
+	enqueue_kernel(ckFilterGetFeature,
+	               task->rect.z-task->rect.x,
+	               task->rect.w-task->rect.y);
+
+	return true;
+}
+
+bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
+                                                 device_ptr variance_ptr,
+                                                 device_ptr depth_ptr,
+                                                 device_ptr output_ptr,
+                                                 DenoisingTask *task)
+{
+	cl_mem image_mem = CL_MEM_PTR(image_ptr);
+	cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+	cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
+	cl_mem output_mem = CL_MEM_PTR(output_ptr);
+	/* Queues "filter_detect_outliers" over task->rect (width z-x, height w-y). Always returns true. */
+	cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
+
+	kernel_set_args(ckFilterDetectOutliers, 0,
+	                image_mem,
+	                variance_mem,
+	                depth_mem,
+	                output_mem,
+	                task->rect,
+	                task->buffer.pass_stride);
+	enqueue_kernel(ckFilterDetectOutliers,
+	               task->rect.z-task->rect.x,
+	               task->rect.w-task->rect.y);
+
+	return true;
+}
+
+bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers,
+                                           DenoisingTask *task)
+{
+	task->tiles_mem.copy_to_device();
+	/* The host-side tile data is copied to the device above, before the kernel is queued. */
+	cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
+
+	cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles"));
+	/* Argument 0 is the tile buffer; arguments 1..9 are the nine entries of `buffers`. */
+	kernel_set_args(ckFilterSetTiles, 0, tiles_mem);
+	for(int i = 0; i < 9; i++) {
+		cl_mem buffer_mem = CL_MEM_PTR(buffers[i]);
+		kernel_set_args(ckFilterSetTiles, i+1, buffer_mem);
+	}
+	/* Requested launch range is 1x1. Always returns true. */
+	enqueue_kernel(ckFilterSetTiles, 1, 1);
+
+	return true;
+}
+
+void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising, const DeviceTask &task)
+{
+	denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising);
+	denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
+	denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising);
+	denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+	denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+	denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+	denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+	denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
+	/* Each callback above is bound to this device with `denoising` as the trailing task argument. */
+	denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+	denoising.render_buffer.samples = rtile.sample;
+	/* The tile being denoised goes in the middle entry (4 of 9); presumably map_neighbor_tiles() fills the rest - confirm. */
+	RenderTile rtiles[9];
+	rtiles[4] = rtile;
+	task.map_neighbor_tiles(rtiles, this);
+	denoising.tiles_from_rendertiles(rtiles);
+
+	denoising.init_from_devicetask(task);
+
+	denoising.run_denoising();
+	/* The neighbor tiles are unmapped only after run_denoising() has returned. */
+	task.unmap_neighbor_tiles(rtiles, this);
+}
+
 void OpenCLDeviceBase::shader(DeviceTask& task)
 {
 	/* cast arguments to cl types */
 	cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
 	cl_mem d_input = CL_MEM_PTR(task.shader_input);
 	cl_mem d_output = CL_MEM_PTR(task.shader_output);
-	cl_mem d_output_luma = CL_MEM_PTR(task.shader_output_luma);
 	cl_int d_shader_eval_type = task.shader_eval_type;
 	cl_int d_shader_filter = task.shader_filter;
 	cl_int d_shader_x = task.shader_x;
@@ -491,10 +1116,15 @@ void OpenCLDeviceBase::shader(DeviceTask& task)
 
 	cl_kernel kernel;
 
-	if(task.shader_eval_type >= SHADER_EVAL_BAKE)
+	if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
 		kernel = base_program(ustring("bake"));
-	else
-		kernel = base_program(ustring("shader"));
+	}
+	else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) {
+		kernel = base_program(ustring("displace"));
+	}
+	else {
+		kernel = base_program(ustring("background"));
+	}
 
 	cl_uint start_arg_index =
 		kernel_set_args(kernel,
@@ -503,16 +1133,7 @@ void OpenCLDeviceBase::shader(DeviceTask& task)
 		                d_input,
 		                d_output);
 
-	if(task.shader_eval_type < SHADER_EVAL_BAKE) {
-		start_arg_index += kernel_set_args(kernel,
-		                                   start_arg_index,
-		                                   d_output_luma);
-	}
-
-#define KERNEL_TEX(type, ttype, name) \
-	set_kernel_arg_mem(kernel, &start_arg_index, #name);
-#include "kernel_textures.h"
-#undef KERNEL_TEX
+	set_kernel_arg_buffers(kernel, &start_arg_index);
 
 	start_arg_index += kernel_set_args(kernel,
 	                                   start_arg_index,
@@ -545,7 +1166,7 @@ void OpenCLDeviceBase::shader(DeviceTask& task)
 
 string OpenCLDeviceBase::kernel_build_options(const string *debug_src)
 {
-	string build_options = "-cl-fast-relaxed-math ";
+	string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
 
 	if(platform_name == "NVIDIA CUDA") {
 		build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
@@ -725,7 +1346,7 @@ void OpenCLDeviceBase::store_cached_kernel(
 }
 
 string OpenCLDeviceBase::build_options_for_base_program(
-        const DeviceRequestedFeatures& /*requested_features*/)
+        const DeviceRequestedFeatures& requested_features)
 {
 	/* TODO(sergey): By default we compile all features, meaning
 	 * mega kernel is not getting feature-based optimizations.
@@ -733,6 +1354,14 @@ string OpenCLDeviceBase::build_options_for_base_program(
 	 * Ideally we need always compile kernel with as less features
 	 * enabled as possible to keep performance at it's max.
 	 */
+
+	/* For now disable baking when not in use as this has major
+	 * impact on kernel build times.
+	 */
+	if(!requested_features.use_baking) {
+		return "-D__NO_BAKING__";
+	}
+
 	return "";
 }
 
diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp
index 6ea7619e022..ef39cfb5f7d 100644
--- a/intern/cycles/device/opencl/opencl_mega.cpp
+++ b/intern/cycles/device/opencl/opencl_mega.cpp
@@ -16,15 +16,15 @@
 
 #ifdef WITH_OPENCL
 
-#include "opencl.h"
+#include "device/opencl/opencl.h"
 
-#include "buffers.h"
+#include "render/buffers.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "util_md5.h"
-#include "util_path.h"
-#include "util_time.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -43,11 +43,12 @@ public:
 		return true;
 	}
 
-	virtual void load_kernels(const DeviceRequestedFeatures& /*requested_features*/,
+	virtual bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/,
 	                          vector<OpenCLProgram*> &programs)
 	{
 		path_trace_program.add_kernel(ustring("path_trace"));
 		programs.push_back(&path_trace_program);
+		return true;
 	}
 
 	~OpenCLDeviceMegaKernel()
@@ -58,10 +59,11 @@ public:
 
 	void path_trace(RenderTile& rtile, int sample)
 	{
+		scoped_timer timer(&rtile.buffers->render_time);
+
 		/* Cast arguments to cl types. */
 		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
 		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
-		cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
 		cl_int d_x = rtile.x;
 		cl_int d_y = rtile.y;
 		cl_int d_w = rtile.w;
@@ -78,13 +80,9 @@ public:
 			kernel_set_args(ckPathTraceKernel,
 			                0,
 			                d_data,
-			                d_buffer,
-			                d_rng_state);
+			                d_buffer);
 
-#define KERNEL_TEX(type, ttype, name) \
-		set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
-#include "kernel_textures.h"
-#undef KERNEL_TEX
+		set_kernel_arg_buffers(ckPathTraceKernel, &start_arg_index);
 
 		start_arg_index += kernel_set_args(ckPathTraceKernel,
 		                                   start_arg_index,
@@ -107,41 +105,55 @@ public:
 		else if(task->type == DeviceTask::SHADER) {
 			shader(*task);
 		}
-		else if(task->type == DeviceTask::PATH_TRACE) {
+		else if(task->type == DeviceTask::RENDER) {
 			RenderTile tile;
+			DenoisingTask denoising(this);
+
 			/* Keep rendering tiles until done. */
 			while(task->acquire_tile(this, tile)) {
-				int start_sample = tile.start_sample;
-				int end_sample = tile.start_sample + tile.num_samples;
+				if(tile.task == RenderTile::PATH_TRACE) {
+					int start_sample = tile.start_sample;
+					int end_sample = tile.start_sample + tile.num_samples;
 
-				for(int sample = start_sample; sample < end_sample; sample++) {
-					if(task->get_cancel()) {
-						if(task->need_finish_queue == false)
-							break;
-					}
+					for(int sample = start_sample; sample < end_sample; sample++) {
+						if(task->get_cancel()) {
+							if(task->need_finish_queue == false)
+								break;
+						}
+
+						path_trace(tile, sample);
 
-					path_trace(tile, sample);
+						tile.sample = sample + 1;
 
-					tile.sample = sample + 1;
+						task->update_progress(&tile, tile.w*tile.h);
+					}
 
+					/* Complete kernel execution before release tile */
+					/* This helps in multi-device render;
+					 * The device that reaches the critical-section function
+					 * release_tile waits (stalling other devices from entering
+					 * release_tile) for all kernels to complete. If device1 (a
+					 * slow-render device) reaches release_tile first then it would
+					 * stall device2 (a fast-render device) from proceeding to render
+					 * next tile.
+					 */
+					clFinish(cqCommandQueue);
+				}
+				else if(tile.task == RenderTile::DENOISE) {
+					tile.sample = tile.start_sample + tile.num_samples;
+					denoise(tile, denoising, *task);
 					task->update_progress(&tile, tile.w*tile.h);
 				}
 
-				/* Complete kernel execution before release tile */
-				/* This helps in multi-device render;
-				 * The device that reaches the critical-section function
-				 * release_tile waits (stalling other devices from entering
-				 * release_tile) for all kernels to complete. If device1 (a
-				 * slow-render device) reaches release_tile first then it would
-				 * stall device2 (a fast-render device) from proceeding to render
-				 * next tile.
-				 */
-				clFinish(cqCommandQueue);
-
 				task->release_tile(tile);
 			}
 		}
 	}
+
+	bool is_split_kernel()
+	{
+		return false;
+	}
 };
 
 Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background)
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index 3c3c2150128..51d3c7bb10f 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -16,1290 +16,441 @@
 
 #ifdef WITH_OPENCL
 
-#include "opencl.h"
+#include "device/opencl/opencl.h"
 
-#include "buffers.h"
+#include "render/buffers.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
+#include "kernel/split/kernel_split_data_types.h"
 
-#include "util_md5.h"
-#include "util_path.h"
-#include "util_time.h"
+#include "device/device_split_kernel.h"
+
+#include "util/util_algorithm.h"
+#include "util/util_debug.h"
+#include "util/util_logging.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
-/* TODO(sergey): This is to keep tile split on OpenCL level working
- * for now, since without this view-port render does not work as it
- * should.
- *
- * Ideally it'll be done on the higher level, but we need to get ready
- * for merge rather soon, so let's keep split logic private here in
- * the file.
+class OpenCLSplitKernel;
+
+namespace {
+
+/* Dummy copy of the OpenCL-related KernelGlobals from kernel_globals.h,
+ * used to fetch its size.
  */
-class SplitRenderTile : public RenderTile {
-public:
-	SplitRenderTile()
-		: RenderTile(),
-		  buffer_offset_x(0),
-		  buffer_offset_y(0),
-		  rng_state_offset_x(0),
-		  rng_state_offset_y(0),
-		  buffer_rng_state_stride(0) {}
-
-	explicit SplitRenderTile(RenderTile& tile)
-		: RenderTile(),
-		  buffer_offset_x(0),
-		  buffer_offset_y(0),
-		  rng_state_offset_x(0),
-		  rng_state_offset_y(0),
-		  buffer_rng_state_stride(0)
-	{
-		x = tile.x;
-		y = tile.y;
-		w = tile.w;
-		h = tile.h;
-		start_sample = tile.start_sample;
-		num_samples = tile.num_samples;
-		sample = tile.sample;
-		resolution = tile.resolution;
-		offset = tile.offset;
-		stride = tile.stride;
-		buffer = tile.buffer;
-		rng_state = tile.rng_state;
-		buffers = tile.buffers;
+typedef struct KernelGlobalsDummy {
+	ccl_constant KernelData *data;
+	ccl_global char *buffers[8];
+
+#define KERNEL_TEX(type, name) \
+	TextureInfo name;
+#  include "kernel/kernel_textures.h"
+#undef KERNEL_TEX
+	SplitData split_data;
+	SplitParams split_param_data;
+} KernelGlobalsDummy;
+
+}  // namespace
+
+static string get_build_options(OpenCLDeviceBase *device, const DeviceRequestedFeatures& requested_features)
+{
+	string build_options = "-D__SPLIT_KERNEL__ ";
+	build_options += requested_features.get_build_options();
+
+	/* Set compute device build option. */
+	cl_device_type device_type;
+	OpenCLInfo::get_device_type(device->cdDevice, &device_type, &device->ciErr);
+	assert(device->ciErr == CL_SUCCESS);
+	if(device_type == CL_DEVICE_TYPE_GPU) {
+		build_options += " -D__COMPUTE_DEVICE_GPU__";
 	}
 
-	/* Split kernel is device global memory constrained;
-	 * hence split kernel cant render big tile size's in
-	 * one go. If the user sets a big tile size (big tile size
-	 * is a term relative to the available device global memory),
-	 * we split the tile further and then call path_trace on
-	 * each of those split tiles. The following variables declared,
-	 * assist in achieving that purpose
-	 */
-	int buffer_offset_x;
-	int buffer_offset_y;
-	int rng_state_offset_x;
-	int rng_state_offset_y;
-	int buffer_rng_state_stride;
-};
+	return build_options;
+}
 
 /* OpenCLDeviceSplitKernel's declaration/definition. */
 class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
 {
 public:
-	/* Kernel declaration. */
+	DeviceSplitKernel *split_kernel;
 	OpenCLProgram program_data_init;
-	OpenCLProgram program_scene_intersect;
-	OpenCLProgram program_lamp_emission;
-	OpenCLProgram program_queue_enqueue;
-	OpenCLProgram program_background_buffer_update;
-	OpenCLProgram program_shader_eval;
-	OpenCLProgram program_holdout_emission_blurring_pathtermination_ao;
-	OpenCLProgram program_direct_lighting;
-	OpenCLProgram program_shadow_blocked;
-	OpenCLProgram program_next_iteration_setup;
-	OpenCLProgram program_sum_all_radiance;
-
-	/* Global memory variables [porting]; These memory is used for
-	 * co-operation between different kernels; Data written by one
-	 * kernel will be available to another kernel via this global
-	 * memory.
-	 */
-	cl_mem rng_coop;
-	cl_mem throughput_coop;
-	cl_mem L_transparent_coop;
-	cl_mem PathRadiance_coop;
-	cl_mem Ray_coop;
-	cl_mem PathState_coop;
-	cl_mem Intersection_coop;
-	cl_mem kgbuffer;  /* KernelGlobals buffer. */
-
-	/* Global buffers for ShaderData. */
-	cl_mem sd;             /* ShaderData used in the main path-iteration loop. */
-	cl_mem sd_DL_shadow;   /* ShaderData used in Direct Lighting and
-	                        * shadow_blocked kernel.
-	                        */
-
-	/* Global memory required for shadow blocked and accum_radiance. */
-	cl_mem BSDFEval_coop;
-	cl_mem ISLamp_coop;
-	cl_mem LightRay_coop;
-	cl_mem AOAlpha_coop;
-	cl_mem AOBSDF_coop;
-	cl_mem AOLightRay_coop;
-	cl_mem Intersection_coop_shadow;
-
-#ifdef WITH_CYCLES_DEBUG
-	/* DebugData memory */
-	cl_mem debugdata_coop;
-#endif
-
-	/* Global state array that tracks ray state. */
-	cl_mem ray_state;
-
-	/* Per sample buffers. */
-	cl_mem per_sample_output_buffers;
-
-	/* Denotes which sample each ray is being processed for. */
-	cl_mem work_array;
-
-	/* Queue */
-	cl_mem Queue_data;  /* Array of size queuesize * num_queues * sizeof(int). */
-	cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
-	                     * Tracks the size of each queue.
-	                     */
-
-	/* Flag to make sceneintersect and lampemission kernel use queues. */
-	cl_mem use_queues_flag;
-
-	/* Amount of memory in output buffer associated with one pixel/thread. */
-	size_t per_thread_output_buffer_size;
-
-	/* Total allocatable available device memory. */
-	size_t total_allocatable_memory;
-
-	/* host version of ray_state; Used in checking host path-iteration
-	 * termination.
-	 */
-	char *hostRayStateArray;
-
-	/* Number of path-iterations to be done in one shot. */
-	unsigned int PathIteration_times;
-
-#ifdef __WORK_STEALING__
-	/* Work pool with respect to each work group. */
-	cl_mem work_pool_wgs;
-
-	/* Denotes the maximum work groups possible w.r.t. current tile size. */
-	unsigned int max_work_groups;
-#endif
-
-	/* clos_max value for which the kernels have been loaded currently. */
-	int current_max_closure;
-
-	/* Marked True in constructor and marked false at the end of path_trace(). */
-	bool first_tile;
-
-	OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
-	: OpenCLDeviceBase(info, stats, background_)
+	OpenCLProgram program_state_buffer_size;
+
+	OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_);
+
+	~OpenCLDeviceSplitKernel()
 	{
-		background = background_;
-
-		/* Initialize cl_mem variables. */
-		kgbuffer = NULL;
-		sd = NULL;
-		sd_DL_shadow = NULL;
-
-		rng_coop = NULL;
-		throughput_coop = NULL;
-		L_transparent_coop = NULL;
-		PathRadiance_coop = NULL;
-		Ray_coop = NULL;
-		PathState_coop = NULL;
-		Intersection_coop = NULL;
-		ray_state = NULL;
-
-		AOAlpha_coop = NULL;
-		AOBSDF_coop = NULL;
-		AOLightRay_coop = NULL;
-		BSDFEval_coop = NULL;
-		ISLamp_coop = NULL;
-		LightRay_coop = NULL;
-		Intersection_coop_shadow = NULL;
-
-#ifdef WITH_CYCLES_DEBUG
-		debugdata_coop = NULL;
-#endif
-
-		work_array = NULL;
-
-		/* Queue. */
-		Queue_data = NULL;
-		Queue_index = NULL;
-		use_queues_flag = NULL;
-
-		per_sample_output_buffers = NULL;
-
-		per_thread_output_buffer_size = 0;
-		hostRayStateArray = NULL;
-		PathIteration_times = PATH_ITER_INC_FACTOR;
-#ifdef __WORK_STEALING__
-		work_pool_wgs = NULL;
-		max_work_groups = 0;
-#endif
-		current_max_closure = -1;
-		first_tile = true;
-
-		/* Get device's maximum memory that can be allocated. */
-		ciErr = clGetDeviceInfo(cdDevice,
-		                        CL_DEVICE_MAX_MEM_ALLOC_SIZE,
-		                        sizeof(size_t),
-		                        &total_allocatable_memory,
-		                        NULL);
-		assert(ciErr == CL_SUCCESS);
-		if(platform_name == "AMD Accelerated Parallel Processing") {
-			/* This value is tweak-able; AMD platform does not seem to
-			 * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE
-			 * is considered for further computation.
-			 */
-			total_allocatable_memory /= 2;
-		}
+		task_pool.stop();
+
+		/* Release kernels */
+		program_data_init.release();
+
+		delete split_kernel;
 	}
 
 	virtual bool show_samples() const {
-		return false;
+		return true;
 	}
 
-	/* Split kernel utility functions. */
-	size_t get_tex_size(const char *tex_name)
+	virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
+	                          vector<OpenCLDeviceBase::OpenCLProgram*> &programs)
 	{
-		cl_mem ptr;
-		size_t ret_size = 0;
-		MemMap::iterator i = mem_map.find(tex_name);
-		if(i != mem_map.end()) {
-			ptr = CL_MEM_PTR(i->second);
-			ciErr = clGetMemObjectInfo(ptr,
-			                           CL_MEM_SIZE,
-			                           sizeof(ret_size),
-			                           &ret_size,
-			                           NULL);
-			assert(ciErr == CL_SUCCESS);
+		bool single_program = OpenCLInfo::use_single_program();
+		program_data_init = OpenCLDeviceBase::OpenCLProgram(this,
+		                                  single_program ? "split" : "split_data_init",
+		                                  single_program ? "kernel_split.cl" : "kernel_data_init.cl",
+		                                  get_build_options(this, requested_features));
+
+		program_data_init.add_kernel(ustring("path_trace_data_init"));
+		programs.push_back(&program_data_init);
+
+		program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this,
+		                                  single_program ? "split" : "split_state_buffer_size",
+		                                  single_program ? "kernel_split.cl" : "kernel_state_buffer_size.cl",
+		                                  get_build_options(this, requested_features));
+		program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size"));
+		programs.push_back(&program_state_buffer_size);
+
+		return split_kernel->load_kernels(requested_features);
+	}
+
+	void thread_run(DeviceTask *task)
+	{
+		flush_texture_buffers();
+
+		if(task->type == DeviceTask::FILM_CONVERT) {
+			film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
+		}
+		else if(task->type == DeviceTask::SHADER) {
+			shader(*task);
+		}
+		else if(task->type == DeviceTask::RENDER) {
+			RenderTile tile;
+			DenoisingTask denoising(this);
+
+			/* Allocate buffer for kernel globals */
+			device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals");
+			kgbuffer.alloc_to_device(1);
+
+			/* Keep rendering tiles until done. */
+			while(task->acquire_tile(this, tile)) {
+				if(tile.task == RenderTile::PATH_TRACE) {
+					assert(tile.task == RenderTile::PATH_TRACE);
+					scoped_timer timer(&tile.buffers->render_time);
+
+					split_kernel->path_trace(task,
+					                         tile,
+					                         kgbuffer,
+					                         *const_mem_map["__data"]);
+
+					/* Complete kernel execution before releasing the tile. */
+					/* This helps in multi-device rendering:
+					 * release_tile is a critical section, so a device that enters
+					 * it with kernels still running stalls every other device until
+					 * those kernels complete. If device1 (a slow-render device)
+					 * reached release_tile first, it would stall device2 (a
+					 * fast-render device) from proceeding to render its next tile.
+					 * Finishing the command queue here keeps that wait outside.
+					 */
+					clFinish(cqCommandQueue);
+				}
+				else if(tile.task == RenderTile::DENOISE) {
+					tile.sample = tile.start_sample + tile.num_samples;
+					denoise(tile, denoising, *task);
+					task->update_progress(&tile, tile.w*tile.h);
+				}
+
+				task->release_tile(tile);
+			}
+
+			kgbuffer.free();
 		}
-		return ret_size;
 	}
 
-	size_t get_shader_data_size(size_t max_closure)
+	bool is_split_kernel()
 	{
-		/* ShaderData size with variable size ShaderClosure array */
-		return sizeof(ShaderData) - (sizeof(ShaderClosure) * (MAX_CLOSURE - max_closure));
+		return true;
 	}
 
-	/* Returns size of KernelGlobals structure associated with OpenCL. */
-	size_t get_KernelGlobals_size()
+protected:
+	/* ** These are for working around some compiler-specific bugs ** */
+
+	string build_options_for_base_program(
+	        const DeviceRequestedFeatures& requested_features)
 	{
-		/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
-		 * fetch its size.
-		 */
-		typedef struct KernelGlobals {
-			ccl_constant KernelData *data;
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name;
-#include "kernel_textures.h"
-#undef KERNEL_TEX
-			void *sd_input;
-			void *isect_shadow;
-		} KernelGlobals;
+		return requested_features.get_build_options();
+	}
 
-		return sizeof(KernelGlobals);
+	friend class OpenCLSplitKernel;
+	friend class OpenCLSplitKernelFunction;
+};
+
+struct CachedSplitMemory {  /* Buffers that OpenCLSplitKernelFunction::enqueue() binds as kernel arguments. */
+	int id;  /* Compared with each function's cached_id; arguments are re-set when they differ. */
+	device_memory *split_data;  /* Bound right after the kernel globals and data arguments. */
+	device_memory *ray_state;
+	device_memory *queue_index;  /* These four are bound after set_kernel_arg_buffers(). */
+	device_memory *use_queues_flag;
+	device_memory *work_pools;
+	device_ptr *buffer;
+};
+
+class OpenCLSplitKernelFunction : public SplitKernelFunction {
+public:
+	OpenCLDeviceSplitKernel* device;
+	OpenCLDeviceBase::OpenCLProgram program;
+	CachedSplitMemory& cached_memory;
+	int cached_id;
+
+	OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device, CachedSplitMemory& cached_memory) :
+			device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1)
+	{
 	}
 
-	virtual void load_kernels(const DeviceRequestedFeatures& requested_features,
-	                          vector<OpenCLProgram*> &programs)
+	~OpenCLSplitKernelFunction()
 	{
-		string build_options = "-D__SPLIT_KERNEL__ ";
-#ifdef __WORK_STEALING__
-		build_options += "-D__WORK_STEALING__ ";
-#endif
-		build_options += requested_features.get_build_options();
-
-		/* Set compute device build option. */
-		cl_device_type device_type;
-		ciErr = clGetDeviceInfo(cdDevice,
-		                        CL_DEVICE_TYPE,
-		                        sizeof(cl_device_type),
-		                        &device_type,
-		                        NULL);
-		assert(ciErr == CL_SUCCESS);
-		if(device_type == CL_DEVICE_TYPE_GPU) {
-			build_options += " -D__COMPUTE_DEVICE_GPU__";
+		program.release();
+	}
+
+	virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data)
+	{
+		if(cached_id != cached_memory.id) {
+			cl_uint start_arg_index =
+				device->kernel_set_args(program(),
+					            0,
+					            kg,
+					            data,
+					            *cached_memory.split_data,
+					            *cached_memory.ray_state);
+
+				device->set_kernel_arg_buffers(program(), &start_arg_index);
+
+			start_arg_index +=
+				device->kernel_set_args(program(),
+					            start_arg_index,
+					            *cached_memory.queue_index,
+					            *cached_memory.use_queues_flag,
+					            *cached_memory.work_pools,
+					            *cached_memory.buffer);
+
+			cached_id = cached_memory.id;
+		}
+
+		device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
+		                                       program(),
+		                                       2,
+		                                       NULL,
+		                                       dim.global_size,
+		                                       dim.local_size,
+		                                       0,
+		                                       NULL,
+		                                       NULL);
+
+		device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
+
+		if(device->ciErr != CL_SUCCESS) {
+			string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
+			                               clewErrorString(device->ciErr));
+			device->opencl_error(message);
+			return false;
 		}
 
-#define GLUE(a, b) a ## b
-#define LOAD_KERNEL(name) \
-	do { \
-		GLUE(program_, name) = OpenCLProgram(this, "split_" #name, "kernel_" #name ".cl", build_options); \
-		GLUE(program_, name).add_kernel(ustring("path_trace_" #name)); \
-		programs.push_back(&GLUE(program_, name)); \
-	} while(false)
-
-		LOAD_KERNEL(data_init);
-		LOAD_KERNEL(scene_intersect);
-		LOAD_KERNEL(lamp_emission);
-		LOAD_KERNEL(queue_enqueue);
-		LOAD_KERNEL(background_buffer_update);
-		LOAD_KERNEL(shader_eval);
-		LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
-		LOAD_KERNEL(direct_lighting);
-		LOAD_KERNEL(shadow_blocked);
-		LOAD_KERNEL(next_iteration_setup);
-		LOAD_KERNEL(sum_all_radiance);
-
-#undef FIND_KERNEL
-#undef GLUE
-
-		current_max_closure = requested_features.max_closure;
+		return true;
 	}
+};
 
-	~OpenCLDeviceSplitKernel()
+class OpenCLSplitKernel : public DeviceSplitKernel {
+	OpenCLDeviceSplitKernel *device;
+	CachedSplitMemory cached_memory;
+public:
+	explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) {
+	}
+
+	virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
+	                                                       const DeviceRequestedFeatures& requested_features)
 	{
-		task_pool.stop();
+		OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory);
 
-		/* Release kernels */
-		program_data_init.release();
-		program_scene_intersect.release();
-		program_lamp_emission.release();
-		program_queue_enqueue.release();
-		program_background_buffer_update.release();
-		program_shader_eval.release();
-		program_holdout_emission_blurring_pathtermination_ao.release();
-		program_direct_lighting.release();
-		program_shadow_blocked.release();
-		program_next_iteration_setup.release();
-		program_sum_all_radiance.release();
-
-		/* Release global memory */
-		release_mem_object_safe(rng_coop);
-		release_mem_object_safe(throughput_coop);
-		release_mem_object_safe(L_transparent_coop);
-		release_mem_object_safe(PathRadiance_coop);
-		release_mem_object_safe(Ray_coop);
-		release_mem_object_safe(PathState_coop);
-		release_mem_object_safe(Intersection_coop);
-		release_mem_object_safe(kgbuffer);
-		release_mem_object_safe(sd);
-		release_mem_object_safe(sd_DL_shadow);
-		release_mem_object_safe(ray_state);
-		release_mem_object_safe(AOAlpha_coop);
-		release_mem_object_safe(AOBSDF_coop);
-		release_mem_object_safe(AOLightRay_coop);
-		release_mem_object_safe(BSDFEval_coop);
-		release_mem_object_safe(ISLamp_coop);
-		release_mem_object_safe(LightRay_coop);
-		release_mem_object_safe(Intersection_coop_shadow);
-#ifdef WITH_CYCLES_DEBUG
-		release_mem_object_safe(debugdata_coop);
-#endif
-		release_mem_object_safe(use_queues_flag);
-		release_mem_object_safe(Queue_data);
-		release_mem_object_safe(Queue_index);
-		release_mem_object_safe(work_array);
-#ifdef __WORK_STEALING__
-		release_mem_object_safe(work_pool_wgs);
-#endif
-		release_mem_object_safe(per_sample_output_buffers);
-
-		if(hostRayStateArray != NULL) {
-			free(hostRayStateArray);
+		bool single_program = OpenCLInfo::use_single_program();
+		kernel->program =
+			OpenCLDeviceBase::OpenCLProgram(device,
+			                                single_program ? "split" : "split_" + kernel_name,
+			                                single_program ? "kernel_split.cl" : "kernel_" + kernel_name + ".cl",
+			                                get_build_options(device, requested_features));
+
+		kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
+		kernel->program.load();
+
+		if(!kernel->program.is_loaded()) {
+			delete kernel;
+			return NULL;
 		}
+
+		return kernel;
 	}
 
-	void path_trace(DeviceTask *task,
-	                SplitRenderTile& rtile,
-	                int2 max_render_feasible_tile_size)
+	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
 	{
-		/* cast arguments to cl types */
-		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
-		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
-		cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
-		cl_int d_x = rtile.x;
-		cl_int d_y = rtile.y;
-		cl_int d_w = rtile.w;
-		cl_int d_h = rtile.h;
-		cl_int d_offset = rtile.offset;
-		cl_int d_stride = rtile.stride;
-
-		/* Make sure that set render feasible tile size is a multiple of local
-		 * work size dimensions.
-		 */
-		assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
-		assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);
+		device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
+		size_buffer.alloc(1);
+		size_buffer.zero_to_device();
+		/* Launch program_state_buffer_size and read back the single uint64_t in size_buffer; returns 0 on error. */
+		uint threads = num_threads;
+		device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer);
+
+		size_t global_size = 64;
+		device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
+		                               device->program_state_buffer_size(),
+		                               1,
+		                               NULL,
+		                               &global_size,
+		                               NULL,
+		                               0,
+		                               NULL,
+		                               NULL);
+
+		device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
+
+		size_buffer.copy_from_device(0, 1, 1);
+		uint64_t size = size_buffer[0];  /* Keep all 64 bits; size_t would truncate where it is 32-bit. */
+		size_buffer.free();
+
+		if(device->ciErr != CL_SUCCESS) {
+			string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
+			                               clewErrorString(device->ciErr));
+			device->opencl_error(message);
+			return 0;
+		}
+
+		return size;
+	}
 
-		size_t global_size[2];
-		size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
-		                        SPLIT_KERNEL_LOCAL_SIZE_Y};
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& kernel_data,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flag,
+	                                            device_memory& work_pool_wgs
+	                                            )
+	{
+		cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
 
 		/* Set the range of samples to be processed for every ray in
 		 * path-regeneration logic.
 		 */
 		cl_int start_sample = rtile.start_sample;
 		cl_int end_sample = rtile.start_sample + rtile.num_samples;
-		cl_int num_samples = rtile.num_samples;
-
-#ifdef __WORK_STEALING__
-		global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
-		global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
-		unsigned int num_parallel_samples = 1;
-#else
-		global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
-		unsigned int num_threads = max_render_feasible_tile_size.x *
-		                           max_render_feasible_tile_size.y;
-		unsigned int num_tile_columns_possible = num_threads / global_size[1];
-		/* Estimate number of parallel samples that can be
-		 * processed in parallel.
-		 */
-		unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
-		                                        rtile.num_samples);
-		/* Wavefront size in AMD is 64.
-		 * TODO(sergey): What about other platforms?
-		 */
-		if(num_parallel_samples >= 64) {
-			/* TODO(sergey): Could use generic round-up here. */
-			num_parallel_samples = (num_parallel_samples / 64) * 64;
-		}
-		assert(num_parallel_samples != 0);
-
-		global_size[0] = d_w * num_parallel_samples;
-#endif  /* __WORK_STEALING__ */
-
-		assert(global_size[0] * global_size[1] <=
-		       max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
-
-		/* Allocate all required global memory once. */
-		if(first_tile) {
-			size_t num_global_elements = max_render_feasible_tile_size.x *
-			                             max_render_feasible_tile_size.y;
-			/* TODO(sergey): This will actually over-allocate if
-			 * particular kernel does not support multiclosure.
-			 */
-			size_t shaderdata_size = get_shader_data_size(current_max_closure);
-
-#ifdef __WORK_STEALING__
-			/* Calculate max groups */
-			size_t max_global_size[2];
-			size_t tile_x = max_render_feasible_tile_size.x;
-			size_t tile_y = max_render_feasible_tile_size.y;
-			max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
-			max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
-			max_work_groups = (max_global_size[0] * max_global_size[1]) /
-			                  (local_size[0] * local_size[1]);
-			/* Allocate work_pool_wgs memory. */
-			work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
-#endif  /* __WORK_STEALING__ */
-
-			/* Allocate queue_index memory only once. */
-			Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
-			use_queues_flag = mem_alloc(sizeof(char));
-			kgbuffer = mem_alloc(get_KernelGlobals_size());
-
-			/* Create global buffers for ShaderData. */
-			sd = mem_alloc(num_global_elements * shaderdata_size);
-			sd_DL_shadow = mem_alloc(num_global_elements * 2 * shaderdata_size);
-
-			/* Creation of global memory buffers which are shared among
-			 * the kernels.
-			 */
-			rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
-			throughput_coop = mem_alloc(num_global_elements * sizeof(float3));
-			L_transparent_coop = mem_alloc(num_global_elements * sizeof(float));
-			PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
-			Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
-			PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
-			Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
-			AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3));
-			AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3));
-			AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
-			BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
-			ISLamp_coop = mem_alloc(num_global_elements * sizeof(int));
-			LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
-			Intersection_coop_shadow = mem_alloc(2 * num_global_elements * sizeof(Intersection));
-
-#ifdef WITH_CYCLES_DEBUG
-			debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
-#endif
-
-			ray_state = mem_alloc(num_global_elements * sizeof(char));
-
-			hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
-			assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
-
-			Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int)));
-			work_array = mem_alloc(num_global_elements * sizeof(unsigned int));
-			per_sample_output_buffers = mem_alloc(num_global_elements *
-			                                      per_thread_output_buffer_size);
-		}
-
-		cl_int dQueue_size = global_size[0] * global_size[1];
 
 		cl_uint start_arg_index =
-			kernel_set_args(program_data_init(),
+			device->kernel_set_args(device->program_data_init(),
 			                0,
-			                kgbuffer,
-			                sd_DL_shadow,
-			                d_data,
-			                per_sample_output_buffers,
-			                d_rng_state,
-			                rng_coop,
-			                throughput_coop,
-			                L_transparent_coop,
-			                PathRadiance_coop,
-			                Ray_coop,
-			                PathState_coop,
-			                Intersection_coop_shadow,
+			                kernel_globals,
+			                kernel_data,
+			                split_data,
+			                num_global_elements,
 			                ray_state);
 
-/* TODO(sergey): Avoid map lookup here. */
-#define KERNEL_TEX(type, ttype, name) \
-	set_kernel_arg_mem(program_data_init(), &start_arg_index, #name);
-#include "kernel_textures.h"
-#undef KERNEL_TEX
+			device->set_kernel_arg_buffers(device->program_data_init(), &start_arg_index);
 
 		start_arg_index +=
-			kernel_set_args(program_data_init(),
+			device->kernel_set_args(device->program_data_init(),
 			                start_arg_index,
 			                start_sample,
-			                d_x,
-			                d_y,
-			                d_w,
-			                d_h,
-			                d_offset,
-			                d_stride,
-			                rtile.rng_state_offset_x,
-			                rtile.rng_state_offset_y,
-			                rtile.buffer_rng_state_stride,
-			                Queue_data,
-			                Queue_index,
+			                end_sample,
+			                rtile.x,
+			                rtile.y,
+			                rtile.w,
+			                rtile.h,
+			                rtile.offset,
+			                rtile.stride,
+			                queue_index,
 			                dQueue_size,
 			                use_queues_flag,
-			                work_array,
-#ifdef __WORK_STEALING__
 			                work_pool_wgs,
-			                num_samples,
-#endif
-#ifdef WITH_CYCLES_DEBUG
-			                debugdata_coop,
-#endif
-			                num_parallel_samples);
-
-		kernel_set_args(program_scene_intersect(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                rng_coop,
-		                Ray_coop,
-		                PathState_coop,
-		                Intersection_coop,
-		                ray_state,
-		                d_w,
-		                d_h,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size,
-		                use_queues_flag,
-#ifdef WITH_CYCLES_DEBUG
-		                debugdata_coop,
-#endif
-		                num_parallel_samples);
-
-		kernel_set_args(program_lamp_emission(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                throughput_coop,
-		                PathRadiance_coop,
-		                Ray_coop,
-		                PathState_coop,
-		                Intersection_coop,
-		                ray_state,
-		                d_w,
-		                d_h,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size,
-		                use_queues_flag,
-		                num_parallel_samples);
-
-		kernel_set_args(program_queue_enqueue(),
-		                0,
-		                Queue_data,
-		                Queue_index,
-		                ray_state,
-		                dQueue_size);
-
-		kernel_set_args(program_background_buffer_update(),
-		                 0,
-		                 kgbuffer,
-		                 d_data,
-		                 per_sample_output_buffers,
-		                 d_rng_state,
-		                 rng_coop,
-		                 throughput_coop,
-		                 PathRadiance_coop,
-		                 Ray_coop,
-		                 PathState_coop,
-		                 L_transparent_coop,
-		                 ray_state,
-		                 d_w,
-		                 d_h,
-		                 d_x,
-		                 d_y,
-		                 d_stride,
-		                 rtile.rng_state_offset_x,
-		                 rtile.rng_state_offset_y,
-		                 rtile.buffer_rng_state_stride,
-		                 work_array,
-		                 Queue_data,
-		                 Queue_index,
-		                 dQueue_size,
-		                 end_sample,
-		                 start_sample,
-#ifdef __WORK_STEALING__
-		                 work_pool_wgs,
-		                 num_samples,
-#endif
-#ifdef WITH_CYCLES_DEBUG
-		                 debugdata_coop,
-#endif
-		                 num_parallel_samples);
-
-		kernel_set_args(program_shader_eval(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                sd,
-		                rng_coop,
-		                Ray_coop,
-		                PathState_coop,
-		                Intersection_coop,
-		                ray_state,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size);
-
-		kernel_set_args(program_holdout_emission_blurring_pathtermination_ao(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                sd,
-		                per_sample_output_buffers,
-		                rng_coop,
-		                throughput_coop,
-		                L_transparent_coop,
-		                PathRadiance_coop,
-		                PathState_coop,
-		                Intersection_coop,
-		                AOAlpha_coop,
-		                AOBSDF_coop,
-		                AOLightRay_coop,
-		                d_w,
-		                d_h,
-		                d_x,
-		                d_y,
-		                d_stride,
-		                ray_state,
-		                work_array,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size,
-#ifdef __WORK_STEALING__
-		                start_sample,
-#endif
-		                num_parallel_samples);
-
-		kernel_set_args(program_direct_lighting(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                sd,
-		                rng_coop,
-		                PathState_coop,
-		                ISLamp_coop,
-		                LightRay_coop,
-		                BSDFEval_coop,
-		                ray_state,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size);
-
-		kernel_set_args(program_shadow_blocked(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                PathState_coop,
-		                LightRay_coop,
-		                AOLightRay_coop,
-		                ray_state,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size);
-
-		kernel_set_args(program_next_iteration_setup(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                sd,
-		                rng_coop,
-		                throughput_coop,
-		                PathRadiance_coop,
-		                Ray_coop,
-		                PathState_coop,
-		                LightRay_coop,
-		                ISLamp_coop,
-		                BSDFEval_coop,
-		                AOLightRay_coop,
-		                AOBSDF_coop,
-		                AOAlpha_coop,
-		                ray_state,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size,
-		                use_queues_flag);
-
-		kernel_set_args(program_sum_all_radiance(),
-		                0,
-		                d_data,
-		                d_buffer,
-		                per_sample_output_buffers,
-		                num_parallel_samples,
-		                d_w,
-		                d_h,
-		                d_stride,
-		                rtile.buffer_offset_x,
-		                rtile.buffer_offset_y,
-		                rtile.buffer_rng_state_stride,
-		                start_sample);
-
-		/* Macro for Enqueuing split kernels. */
-#define GLUE(a, b) a ## b
-#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \
-		{ \
-			ciErr = clEnqueueNDRangeKernel(cqCommandQueue, \
-			                               GLUE(program_, \
-			                                    kernelName)(), \
-			                               2, \
-			                               NULL, \
-			                               globalSize, \
-			                               localSize, \
-			                               0, \
-			                               NULL, \
-			                               NULL); \
-			opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); \
-			if(ciErr != CL_SUCCESS) { \
-				string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", \
-				                               clewErrorString(ciErr)); \
-				opencl_error(message); \
-				return; \
-			} \
-		} (void) 0
+			                rtile.num_samples,
+			                rtile.buffer);
 
 		/* Enqueue ckPathTraceKernel_data_init kernel. */
-		ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size);
-		bool activeRaysAvailable = true;
-
-		/* Record number of time host intervention has been made */
-		unsigned int numHostIntervention = 0;
-		unsigned int numNextPathIterTimes = PathIteration_times;
-		bool canceled = false;
-		while(activeRaysAvailable) {
-			/* Twice the global work size of other kernels for
-			 * ckPathTraceKernel_shadow_blocked_direct_lighting. */
-			size_t global_size_shadow_blocked[2];
-			global_size_shadow_blocked[0] = global_size[0] * 2;
-			global_size_shadow_blocked[1] = global_size[1];
-
-			/* Do path-iteration in host [Enqueue Path-iteration kernels. */
-			for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) {
-				ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size);
-				ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
-
-				if(task->get_cancel()) {
-					canceled = true;
-					break;
-				}
-			}
-
-			/* Read ray-state into Host memory to decide if we should exit
-			 * path-iteration in host.
-			 */
-			ciErr = clEnqueueReadBuffer(cqCommandQueue,
-			                            ray_state,
-			                            CL_TRUE,
-			                            0,
-			                            global_size[0] * global_size[1] * sizeof(char),
-			                            hostRayStateArray,
-			                            0,
-			                            NULL,
-			                            NULL);
-			assert(ciErr == CL_SUCCESS);
-
-			activeRaysAvailable = false;
-
-			for(int rayStateIter = 0;
-			    rayStateIter < global_size[0] * global_size[1];
-			    ++rayStateIter)
-			{
-				if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) {
-					/* Not all rays are RAY_INACTIVE. */
-					activeRaysAvailable = true;
-					break;
-				}
-			}
-
-			if(activeRaysAvailable) {
-				numHostIntervention++;
-				PathIteration_times = PATH_ITER_INC_FACTOR;
-				/* Host intervention done before all rays become RAY_INACTIVE;
-				 * Set do more initial iterations for the next tile.
-				 */
-				numNextPathIterTimes += PATH_ITER_INC_FACTOR;
-			}
-
-			if(task->get_cancel()) {
-				canceled = true;
-				break;
-			}
-		}
-
-		/* Execute SumALLRadiance kernel to accumulate radiance calculated in
-		 * per_sample_output_buffers into RenderTile's output buffer.
-		 */
-		if(!canceled) {
-			size_t sum_all_radiance_local_size[2] = {16, 16};
-			size_t sum_all_radiance_global_size[2];
-			sum_all_radiance_global_size[0] =
-				(((d_w - 1) / sum_all_radiance_local_size[0]) + 1) *
-				sum_all_radiance_local_size[0];
-			sum_all_radiance_global_size[1] =
-				(((d_h - 1) / sum_all_radiance_local_size[1]) + 1) *
-				sum_all_radiance_local_size[1];
-			ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
-			                     sum_all_radiance_global_size,
-			                     sum_all_radiance_local_size);
+		device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
+		                               device->program_data_init(),
+		                               2,
+		                               NULL,
+		                               dim.global_size,
+		                               dim.local_size,
+		                               0,
+		                               NULL,
+		                               NULL);
+
+		device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
+
+		if(device->ciErr != CL_SUCCESS) {
+			string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
+			                               clewErrorString(device->ciErr));
+			device->opencl_error(message);
+			return false;
 		}
 
-#undef ENQUEUE_SPLIT_KERNEL
-#undef GLUE
+		cached_memory.split_data = &split_data;
+		cached_memory.ray_state = &ray_state;
+		cached_memory.queue_index = &queue_index;
+		cached_memory.use_queues_flag = &use_queues_flag;
+		cached_memory.work_pools = &work_pool_wgs;
+		cached_memory.buffer = &rtile.buffer;
+		cached_memory.id++;
 
-		if(numHostIntervention == 0) {
-			/* This means that we are executing kernel more than required
-			 * Must avoid this for the next sample/tile.
-			 */
-			PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ?
-			PATH_ITER_INC_FACTOR : numNextPathIterTimes - PATH_ITER_INC_FACTOR;
-		}
-		else {
-			/* Number of path-iterations done for this tile is set as
-			 * Initial path-iteration times for the next tile
-			 */
-			PathIteration_times = numNextPathIterTimes;
-		}
-
-		first_tile = false;
+		return true;
 	}
 
-	/* Calculates the amount of memory that has to be always
-	 * allocated in order for the split kernel to function.
-	 * This memory is tile/scene-property invariant (meaning,
-	 * the value returned by this function does not depend
-	 * on the user set tile size or scene properties.
-	 */
-	size_t get_invariable_mem_allocated()
+	virtual int2 split_kernel_local_size()
 	{
-		size_t total_invariable_mem_allocated = 0;
-		size_t KernelGlobals_size = 0;
-
-		KernelGlobals_size = get_KernelGlobals_size();
-
-		total_invariable_mem_allocated += KernelGlobals_size; /* KernelGlobals size */
-		total_invariable_mem_allocated += NUM_QUEUES * sizeof(unsigned int); /* Queue index size */
-		total_invariable_mem_allocated += sizeof(char); /* use_queues_flag size */
-
-		return total_invariable_mem_allocated;
+		return make_int2(64, 1);
 	}
 
-	/* Calculate the memory that has-to-be/has-been allocated for
-	 * the split kernel to function.
-	 */
-	size_t get_tile_specific_mem_allocated(const int2 tile_size)
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
 	{
-		size_t tile_specific_mem_allocated = 0;
-
-		/* Get required tile info */
-		unsigned int user_set_tile_w = tile_size.x;
-		unsigned int user_set_tile_h = tile_size.y;
-
-#ifdef __WORK_STEALING__
-		/* Calculate memory to be allocated for work_pools in
-		 * case of work_stealing.
-		 */
-		size_t max_global_size[2];
-		size_t max_num_work_pools = 0;
-		max_global_size[0] =
-			(((user_set_tile_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		max_global_size[1] =
-			(((user_set_tile_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		max_num_work_pools =
-			(max_global_size[0] * max_global_size[1]) /
-			(SPLIT_KERNEL_LOCAL_SIZE_X * SPLIT_KERNEL_LOCAL_SIZE_Y);
-		tile_specific_mem_allocated += max_num_work_pools * sizeof(unsigned int);
-#endif
-
-		tile_specific_mem_allocated +=
-			user_set_tile_w * user_set_tile_h * per_thread_output_buffer_size;
-		tile_specific_mem_allocated +=
-			user_set_tile_w * user_set_tile_h * sizeof(RNG);
-
-		return tile_specific_mem_allocated;
-	}
-
-	/* Calculates the texture memories and KernelData (d_data) memory
-	 * that has been allocated.
-	 */
-	size_t get_scene_specific_mem_allocated(cl_mem d_data)
-	{
-		size_t scene_specific_mem_allocated = 0;
-		/* Calculate texture memories. */
-#define KERNEL_TEX(type, ttype, name) \
-	scene_specific_mem_allocated += get_tex_size(#name);
-#include "kernel_textures.h"
-#undef KERNEL_TEX
-		size_t d_data_size;
-		ciErr = clGetMemObjectInfo(d_data,
-		                           CL_MEM_SIZE,
-		                           sizeof(d_data_size),
-		                           &d_data_size,
-		                           NULL);
-		assert(ciErr == CL_SUCCESS && "Can't get d_data mem object info");
-		scene_specific_mem_allocated += d_data_size;
-		return scene_specific_mem_allocated;
-	}
-
-	/* Calculate the memory required for one thread in split kernel. */
-	size_t get_per_thread_memory()
-	{
-		size_t shaderdata_size = 0;
-		/* TODO(sergey): This will actually over-allocate if
-		 * particular kernel does not support multiclosure.
-		 */
-		shaderdata_size = get_shader_data_size(current_max_closure);
-		size_t retval = sizeof(RNG)
-			+ sizeof(float3)          /* Throughput size */
-			+ sizeof(float)           /* L transparent size */
-			+ sizeof(char)            /* Ray state size */
-			+ sizeof(unsigned int)    /* Work element size */
-			+ sizeof(int)             /* ISLamp_size */
-			+ sizeof(PathRadiance) + sizeof(Ray) + sizeof(PathState)
-			+ sizeof(Intersection)    /* Overall isect */
-			+ sizeof(Intersection)    /* Instersection_coop_AO */
-			+ sizeof(Intersection)    /* Intersection coop DL */
-			+ shaderdata_size         /* Overall ShaderData */
-			+ (shaderdata_size * 2)   /* ShaderData : DL and shadow */
-			+ sizeof(Ray) + sizeof(BsdfEval)
-			+ sizeof(float3)          /* AOAlpha size */
-			+ sizeof(float3)          /* AOBSDF size */
-			+ sizeof(Ray)
-			+ (sizeof(int) * NUM_QUEUES)
-			+ per_thread_output_buffer_size;
-		return retval;
-	}
-
-	/* Considers the total memory available in the device and
-	 * and returns the maximum global work size possible.
-	 */
-	size_t get_feasible_global_work_size(int2 tile_size, cl_mem d_data)
-	{
-		/* Calculate invariably allocated memory. */
-		size_t invariable_mem_allocated = get_invariable_mem_allocated();
-		/* Calculate tile specific allocated memory. */
-		size_t tile_specific_mem_allocated =
-			get_tile_specific_mem_allocated(tile_size);
-		/* Calculate scene specific allocated memory. */
-		size_t scene_specific_mem_allocated =
-			get_scene_specific_mem_allocated(d_data);
-		/* Calculate total memory available for the threads in global work size. */
-		size_t available_memory = total_allocatable_memory
-			- invariable_mem_allocated
-			- tile_specific_mem_allocated
-			- scene_specific_mem_allocated
-			- DATA_ALLOCATION_MEM_FACTOR;
-		size_t per_thread_memory_required = get_per_thread_memory();
-		return (available_memory / per_thread_memory_required);
-	}
-
-	/* Checks if the device has enough memory to render the whole tile;
-	 * If not, we should split single tile into multiple tiles of small size
-	 * and process them all.
-	 */
-	bool need_to_split_tile(unsigned int d_w,
-	                        unsigned int d_h,
-	                        int2 max_render_feasible_tile_size)
-	{
-		size_t global_size_estimate[2];
-		/* TODO(sergey): Such round-ups are in quite few places, need to replace
-		 * them with an utility macro.
-		 */
-		global_size_estimate[0] =
-			(((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		global_size_estimate[1] =
-			(((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		if((global_size_estimate[0] * global_size_estimate[1]) >
-		   (max_render_feasible_tile_size.x * max_render_feasible_tile_size.y))
-		{
-			return true;
+		cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
+		/* Use small global size on CPU devices as it seems to be much faster. */
+		if(type == CL_DEVICE_TYPE_CPU) {
+			VLOG(1) << "Global size: (64, 64).";
+			return make_int2(64, 64);
 		}
-		else {
-			return false;
-		}
-	}
 
-	/* Considers the scene properties, global memory available in the device
-	 * and returns a rectanglular tile dimension (approx the maximum)
-	 * that should render on split kernel.
-	 */
-	int2 get_max_render_feasible_tile_size(size_t feasible_global_work_size)
-	{
-		int2 max_render_feasible_tile_size;
-		int square_root_val = (int)sqrt(feasible_global_work_size);
-		max_render_feasible_tile_size.x = square_root_val;
-		max_render_feasible_tile_size.y = square_root_val;
-		/* Ciel round-off max_render_feasible_tile_size. */
-		int2 ceil_render_feasible_tile_size;
-		ceil_render_feasible_tile_size.x =
-			(((max_render_feasible_tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		ceil_render_feasible_tile_size.y =
-			(((max_render_feasible_tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		if(ceil_render_feasible_tile_size.x * ceil_render_feasible_tile_size.y <=
-		   feasible_global_work_size)
-		{
-			return ceil_render_feasible_tile_size;
-		}
-		/* Floor round-off max_render_feasible_tile_size. */
-		int2 floor_render_feasible_tile_size;
-		floor_render_feasible_tile_size.x =
-			(max_render_feasible_tile_size.x / SPLIT_KERNEL_LOCAL_SIZE_X) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		floor_render_feasible_tile_size.y =
-			(max_render_feasible_tile_size.y / SPLIT_KERNEL_LOCAL_SIZE_Y) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		return floor_render_feasible_tile_size;
-	}
+		cl_ulong max_buffer_size;
+		clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
 
-	/* Try splitting the current tile into multiple smaller
-	 * almost-square-tiles.
-	 */
-	int2 get_split_tile_size(RenderTile rtile,
-	                         int2 max_render_feasible_tile_size)
-	{
-		int2 split_tile_size;
-		int num_global_threads = max_render_feasible_tile_size.x *
-		                         max_render_feasible_tile_size.y;
-		int d_w = rtile.w;
-		int d_h = rtile.h;
-		/* Ceil round off d_w and d_h */
-		d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		while(d_w * d_h > num_global_threads) {
-			/* Halve the longer dimension. */
-			if(d_w >= d_h) {
-				d_w = d_w / 2;
-				d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-					SPLIT_KERNEL_LOCAL_SIZE_X;
-			}
-			else {
-				d_h = d_h / 2;
-				d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-					SPLIT_KERNEL_LOCAL_SIZE_Y;
-			}
+		if(DebugFlags().opencl.mem_limit) {
+			max_buffer_size = min(max_buffer_size,
+			                      cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
 		}
-		split_tile_size.x = d_w;
-		split_tile_size.y = d_h;
-		return split_tile_size;
-	}
 
-	/* Splits existing tile into multiple tiles of tile size split_tile_size. */
-	vector<SplitRenderTile> split_tiles(RenderTile rtile, int2 split_tile_size)
-	{
-		vector<SplitRenderTile> to_path_trace_rtile;
-		int d_w = rtile.w;
-		int d_h = rtile.h;
-		int num_tiles_x = (((d_w - 1) / split_tile_size.x) + 1);
-		int num_tiles_y = (((d_h - 1) / split_tile_size.y) + 1);
-		/* Buffer and rng_state offset calc. */
-		size_t offset_index = rtile.offset + (rtile.x + rtile.y * rtile.stride);
-		size_t offset_x = offset_index % rtile.stride;
-		size_t offset_y = offset_index / rtile.stride;
-		/* Resize to_path_trace_rtile. */
-		to_path_trace_rtile.resize(num_tiles_x * num_tiles_y);
-		for(int tile_iter_y = 0; tile_iter_y < num_tiles_y; tile_iter_y++) {
-			for(int tile_iter_x = 0; tile_iter_x < num_tiles_x; tile_iter_x++) {
-				int rtile_index = tile_iter_y * num_tiles_x + tile_iter_x;
-				to_path_trace_rtile[rtile_index].rng_state_offset_x = offset_x + tile_iter_x * split_tile_size.x;
-				to_path_trace_rtile[rtile_index].rng_state_offset_y = offset_y + tile_iter_y * split_tile_size.y;
-				to_path_trace_rtile[rtile_index].buffer_offset_x = offset_x + tile_iter_x * split_tile_size.x;
-				to_path_trace_rtile[rtile_index].buffer_offset_y = offset_y + tile_iter_y * split_tile_size.y;
-				to_path_trace_rtile[rtile_index].start_sample = rtile.start_sample;
-				to_path_trace_rtile[rtile_index].num_samples = rtile.num_samples;
-				to_path_trace_rtile[rtile_index].sample = rtile.sample;
-				to_path_trace_rtile[rtile_index].resolution = rtile.resolution;
-				to_path_trace_rtile[rtile_index].offset = rtile.offset;
-				to_path_trace_rtile[rtile_index].buffers = rtile.buffers;
-				to_path_trace_rtile[rtile_index].buffer = rtile.buffer;
-				to_path_trace_rtile[rtile_index].rng_state = rtile.rng_state;
-				to_path_trace_rtile[rtile_index].x = rtile.x + (tile_iter_x * split_tile_size.x);
-				to_path_trace_rtile[rtile_index].y = rtile.y + (tile_iter_y * split_tile_size.y);
-				to_path_trace_rtile[rtile_index].buffer_rng_state_stride = rtile.stride;
-				/* Fill width and height of the new render tile. */
-				to_path_trace_rtile[rtile_index].w = (tile_iter_x == (num_tiles_x - 1)) ?
-					(d_w - (tile_iter_x * split_tile_size.x)) /* Border tile */
-					: split_tile_size.x;
-				to_path_trace_rtile[rtile_index].h = (tile_iter_y == (num_tiles_y - 1)) ?
-					(d_h - (tile_iter_y * split_tile_size.y)) /* Border tile */
-					: split_tile_size.y;
-				to_path_trace_rtile[rtile_index].stride = to_path_trace_rtile[rtile_index].w;
-			}
-		}
-		return to_path_trace_rtile;
-	}
-
-	void thread_run(DeviceTask *task)
-	{
-		if(task->type == DeviceTask::FILM_CONVERT) {
-			film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
-		}
-		else if(task->type == DeviceTask::SHADER) {
-			shader(*task);
-		}
-		else if(task->type == DeviceTask::PATH_TRACE) {
-			RenderTile tile;
-			bool initialize_data_and_check_render_feasibility = false;
-			bool need_to_split_tiles_further = false;
-			int2 max_render_feasible_tile_size;
-			size_t feasible_global_work_size;
-			const int2 tile_size = task->requested_tile_size;
-			/* Keep rendering tiles until done. */
-			while(task->acquire_tile(this, tile)) {
-				if(!initialize_data_and_check_render_feasibility) {
-					/* Initialize data. */
-					/* Calculate per_thread_output_buffer_size. */
-					size_t output_buffer_size = 0;
-					ciErr = clGetMemObjectInfo((cl_mem)tile.buffer,
-					                           CL_MEM_SIZE,
-					                           sizeof(output_buffer_size),
-					                           &output_buffer_size,
-					                           NULL);
-					assert(ciErr == CL_SUCCESS && "Can't get tile.buffer mem object info");
-					/* This value is different when running on AMD and NV. */
-					if(background) {
-						/* In offline render the number of buffer elements
-						 * associated with tile.buffer is the current tile size.
-						 */
-						per_thread_output_buffer_size =
-							output_buffer_size / (tile.w * tile.h);
-					}
-					else {
-						/* interactive rendering, unlike offline render, the number of buffer elements
-						 * associated with tile.buffer is the entire viewport size.
-						 */
-						per_thread_output_buffer_size =
-							output_buffer_size / (tile.buffers->params.width *
-							                      tile.buffers->params.height);
-					}
-					/* Check render feasibility. */
-					feasible_global_work_size = get_feasible_global_work_size(
-						tile_size,
-						CL_MEM_PTR(const_mem_map["__data"]->device_pointer));
-					max_render_feasible_tile_size =
-						get_max_render_feasible_tile_size(
-							feasible_global_work_size);
-					need_to_split_tiles_further =
-						need_to_split_tile(tile_size.x,
-						                   tile_size.y,
-						                   max_render_feasible_tile_size);
-					initialize_data_and_check_render_feasibility = true;
-				}
-				if(need_to_split_tiles_further) {
-					int2 split_tile_size =
-						get_split_tile_size(tile,
-						                    max_render_feasible_tile_size);
-					vector<SplitRenderTile> to_path_trace_render_tiles =
-						split_tiles(tile, split_tile_size);
-					/* Print message to console */
-					if(background && (to_path_trace_render_tiles.size() > 1)) {
-						fprintf(stderr, "Message : Tiles need to be split "
-						        "further inside path trace (due to insufficient "
-						        "device-global-memory for split kernel to "
-						        "function) \n"
-						        "The current tile of dimensions %dx%d is split "
-						        "into tiles of dimension %dx%d for render \n",
-						        tile.w, tile.h,
-						        split_tile_size.x,
-						        split_tile_size.y);
-					}
-					/* Process all split tiles. */
-					for(int tile_iter = 0;
-					    tile_iter < to_path_trace_render_tiles.size();
-					    ++tile_iter)
-					{
-						path_trace(task,
-						           to_path_trace_render_tiles[tile_iter],
-						           max_render_feasible_tile_size);
-					}
-				}
-				else {
-					/* No splitting required; process the entire tile at once. */
-					/* Render feasible tile size is user-set-tile-size itself. */
-					max_render_feasible_tile_size.x =
-						(((tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-						SPLIT_KERNEL_LOCAL_SIZE_X;
-					max_render_feasible_tile_size.y =
-						(((tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-						SPLIT_KERNEL_LOCAL_SIZE_Y;
-					/* buffer_rng_state_stride is stride itself. */
-					SplitRenderTile split_tile(tile);
-					split_tile.buffer_rng_state_stride = tile.stride;
-					path_trace(task, split_tile, max_render_feasible_tile_size);
-				}
-				tile.sample = tile.start_sample + tile.num_samples;
-
-				/* Complete kernel execution before release tile. */
-				/* This helps in multi-device render;
-				 * The device that reaches the critical-section function
-				 * release_tile waits (stalling other devices from entering
-				 * release_tile) for all kernels to complete. If device1 (a
-				 * slow-render device) reaches release_tile first then it would
-				 * stall device2 (a fast-render device) from proceeding to render
-				 * next tile.
-				 */
-				clFinish(cqCommandQueue);
+		VLOG(1) << "Maximum device allocation size: "
+		        << string_human_readable_number(max_buffer_size) << " bytes. ("
+		        << string_human_readable_size(max_buffer_size) << ").";
 
-				task->release_tile(tile);
-			}
-		}
-	}
+		/* Use at most half of the device's maximum allocation size, capped at 2 GB: we shouldn't need more than that and some devices may support much more. */
+		max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l*1024*1024*1024);
 
-protected:
-	cl_mem mem_alloc(size_t bufsize, cl_mem_flags mem_flag = CL_MEM_READ_WRITE)
-	{
-		cl_mem ptr;
-		assert(bufsize != 0);
-		ptr = clCreateBuffer(cxContext, mem_flag, bufsize, NULL, &ciErr);
-		opencl_assert_err(ciErr, "clCreateBuffer");
-		return ptr;
+		size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
+		int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements));
+		VLOG(1) << "Global size: " << global_size << ".";
+		return global_size;
 	}
+};
 
-	/* ** Those guys are for workign around some compiler-specific bugs ** */
+OpenCLDeviceSplitKernel::OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
+: OpenCLDeviceBase(info, stats, background_)
+{
+	split_kernel = new OpenCLSplitKernel(this);
 
-	string build_options_for_base_program(
-	        const DeviceRequestedFeatures& requested_features)
-	{
-		return requested_features.get_build_options();
-	}
-};
+	background = background_;
+}
 
 Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background)
 {
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index c7760e075cb..78ed401bff5 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -16,11 +16,13 @@
 
 #ifdef WITH_OPENCL
 
-#include "opencl.h"
+#include "device/opencl/opencl.h"
 
-#include "util_logging.h"
-#include "util_path.h"
-#include "util_time.h"
+#include "util/util_debug.h"
+#include "util/util_logging.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_time.h"
 
 using std::cerr;
 using std::endl;
@@ -234,15 +236,15 @@ string OpenCLCache::get_kernel_md5()
 	thread_scoped_lock lock(self.kernel_md5_lock);
 
 	if(self.kernel_md5.empty()) {
-		self.kernel_md5 = path_files_md5_hash(path_get("kernel"));
+		self.kernel_md5 = path_files_md5_hash(path_get("source"));
 	}
 	return self.kernel_md5;
 }
 
 OpenCLDeviceBase::OpenCLProgram::OpenCLProgram(OpenCLDeviceBase *device,
-                                               string program_name,
-                                               string kernel_file,
-                                               string kernel_build_options,
+                                               const string& program_name,
+                                               const string& kernel_file,
+                                               const string& kernel_build_options,
                                                bool use_stdout)
  : device(device),
    program_name(program_name),
@@ -273,20 +275,21 @@ void OpenCLDeviceBase::OpenCLProgram::release()
 	}
 }
 
-void OpenCLDeviceBase::OpenCLProgram::add_log(string msg, bool debug)
+void OpenCLDeviceBase::OpenCLProgram::add_log(const string& msg, bool debug)
 {
 	if(!use_stdout) {
 		log += msg + "\n";
 	}
 	else if(!debug) {
 		printf("%s\n", msg.c_str());
+		fflush(stdout);
 	}
 	else {
 		VLOG(2) << msg;
 	}
 }
 
-void OpenCLDeviceBase::OpenCLProgram::add_error(string msg)
+void OpenCLDeviceBase::OpenCLProgram::add_error(const string& msg)
 {
 	if(use_stdout) {
 		fprintf(stderr, "%s\n", msg.c_str());
@@ -338,12 +341,13 @@ bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src)
 
 bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src)
 {
-	string source = "#include \"kernels/opencl/" + kernel_file + "\" // " + OpenCLCache::get_kernel_md5() + "\n";
+	string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n";
 	/* We compile kernels consisting of many files. unfortunately OpenCL
 	 * kernel caches do not seem to recognize changes in included files.
 	 * so we force recompile on changes by adding the md5 hash of all files.
 	 */
-	source = path_source_replace_includes(source, path_get("kernel"));
+	source = path_source_replace_includes(source, path_get("source"));
+	source += "\n// " + util_md5_string(source) + "\n";
 
 	if(debug_src) {
 		path_write_text(*debug_src, source);
@@ -354,10 +358,10 @@ bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src)
 	cl_int ciErr;
 
 	program = clCreateProgramWithSource(device->cxContext,
-	                                   1,
-	                                   &source_str,
-	                                   &source_len,
-	                                   &ciErr);
+	                                    1,
+	                                    &source_str,
+	                                    &source_len,
+	                                    &ciErr);
 
 	if(ciErr != CL_SUCCESS) {
 		add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr));
@@ -440,7 +444,11 @@ void OpenCLDeviceBase::OpenCLProgram::load()
 	if(!program) {
 		add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
 
-		string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + OpenCLCache::get_kernel_md5();
+		/* The include-expanded source is needed to compute the MD5 used in the cache file name. */
+		string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n";
+		source = path_source_replace_includes(source, path_get("source"));
+
+		string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source);
 		basename = path_cache_get(path_join("kernels", basename));
 		string clbin = basename + ".clbin";
 
@@ -546,6 +554,11 @@ bool OpenCLInfo::use_debug()
 	return DebugFlags().opencl.debug;
 }
 
+bool OpenCLInfo::use_single_program()
+{
+	return DebugFlags().opencl.single_program;
+}
+
 bool OpenCLInfo::kernel_use_advanced_shading(const string& platform)
 {
 	/* keep this in sync with kernel_types.h! */
@@ -589,14 +602,46 @@ bool OpenCLInfo::device_supported(const string& platform_name,
                                   const cl_device_id device_id)
 {
 	cl_device_type device_type;
-	clGetDeviceInfo(device_id,
-	                CL_DEVICE_TYPE,
-	                sizeof(cl_device_type),
-	                &device_type,
-	                NULL);
+	if(!get_device_type(device_id, &device_type)) {
+		return false;
+	}
+	string device_name;
+	if(!get_device_name(device_id, &device_name)) {
+		return false;
+	}
+
+	int driver_major = 0;
+	int driver_minor = 0;
+	if(!get_driver_version(device_id, &driver_major, &driver_minor)) {
+		return false;
+	}
+	VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor;
+
+	/* It is possible to have Iris GPU on AMD/Apple OpenCL framework
+	 * (aka, it will not be on Intel framework). This isn't supported
+	 * and needs an explicit blacklist.
+	 */
+	if(strstr(device_name.c_str(), "Iris")) {
+		return false;
+	}
 	if(platform_name == "AMD Accelerated Parallel Processing" &&
 	   device_type == CL_DEVICE_TYPE_GPU)
 	{
+		if(driver_major < 2236) {
+			VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported.";
+			return false;
+		}
+		const char *blacklist[] = {
+			/* GCN 1 */
+			"Tahiti", "Pitcairn", "Capeverde", "Oland", "Hainan",
+			NULL
+		};
+		for(int i = 0; blacklist[i] != NULL; i++) {
+			if(device_name == blacklist[i]) {
+				VLOG(1) << "AMD device " << device_name << " not supported";
+				return false;
+			}
+		}
 		return true;
 	}
 	if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
@@ -663,7 +708,7 @@ bool OpenCLInfo::device_version_check(cl_device_id device,
 	return true;
 }
 
-string OpenCLInfo::get_hardware_id(string platform_name, cl_device_id device_id)
+string OpenCLInfo::get_hardware_id(const string& platform_name, cl_device_id device_id)
 {
 	if(platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") {
 		/* Use cl_amd_device_topology extension. */
@@ -707,39 +752,30 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
 		return;
 	}
 
+	cl_int error;
 	vector<cl_device_id> device_ids;
-	cl_uint num_devices = 0;
 	vector<cl_platform_id> platform_ids;
-	cl_uint num_platforms = 0;
 
-	/* Get devices. */
-	if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS ||
-	   num_platforms == 0)
-	{
-		FIRST_VLOG(2) << "No OpenCL platforms were found.";
+	/* Get platforms. */
+	if(!get_platforms(&platform_ids, &error)) {
+		FIRST_VLOG(2) << "Error fetching platforms:"
+		              << string(clewErrorString(error));
 		first_time = false;
 		return;
 	}
-	platform_ids.resize(num_platforms);
-	if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS) {
-		FIRST_VLOG(2) << "Failed to fetch platform IDs from the driver..";
+	if(platform_ids.size() == 0) {
+		FIRST_VLOG(2) << "No OpenCL platforms were found.";
 		first_time = false;
 		return;
 	}
 	/* Devices are numbered consecutively across platforms. */
-	for(int platform = 0; platform < num_platforms; platform++) {
+	for(int platform = 0; platform < platform_ids.size(); platform++) {
 		cl_platform_id platform_id = platform_ids[platform];
-		char pname[256];
-		if(clGetPlatformInfo(platform_id,
-		                     CL_PLATFORM_NAME,
-		                     sizeof(pname),
-		                     &pname,
-		                     NULL) != CL_SUCCESS)
-		{
+		string platform_name;
+		if(!get_platform_name(platform_id, &platform_name)) {
 			FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
 			continue;
 		}
-		string platform_name = pname;
 		FIRST_VLOG(2) << "Enumerating devices for platform "
 		              << platform_name << ".";
 		if(!platform_version_check(platform_id)) {
@@ -747,39 +783,28 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
 			              << " due to too old compiler version.";
 			continue;
 		}
-		num_devices = 0;
-		cl_int ciErr;
-		if((ciErr = clGetDeviceIDs(platform_id,
-		                  device_type,
-		                  0,
-		                  NULL,
-		                  &num_devices)) != CL_SUCCESS || num_devices == 0)
+		if(!get_platform_devices(platform_id,
+		                         device_type,
+		                         &device_ids,
+		                         &error))
 		{
 			FIRST_VLOG(2) << "Ignoring platform " << platform_name
-			              << ", failed to fetch number of devices: " << string(clewErrorString(ciErr));
+			              << ", failed to fetch of devices: "
+			              << string(clewErrorString(error));
 			continue;
 		}
-		device_ids.resize(num_devices);
-		if(clGetDeviceIDs(platform_id,
-		                  device_type,
-		                  num_devices,
-		                  &device_ids[0],
-		                  NULL) != CL_SUCCESS)
-		{
+		if(device_ids.size() == 0) {
 			FIRST_VLOG(2) << "Ignoring platform " << platform_name
-			              << ", failed to fetch devices list.";
+			              << ", it has no devices.";
 			continue;
 		}
-		for(int num = 0; num < num_devices; num++) {
-			cl_device_id device_id = device_ids[num];
-			char device_name[1024] = "\0";
-			if(clGetDeviceInfo(device_id,
-			                   CL_DEVICE_NAME,
-			                   sizeof(device_name),
-			                   &device_name,
-			                   NULL) != CL_SUCCESS)
-			{
-				FIRST_VLOG(2) << "Failed to fetch device name, ignoring.";
+		for(int num = 0; num < device_ids.size(); num++) {
+			const cl_device_id device_id = device_ids[num];
+			string device_name;
+			if(!get_device_name(device_id, &device_name, &error)) {
+				FIRST_VLOG(2) << "Failed to fetch device name: "
+				              << string(clewErrorString(error))
+				              << ", ignoring.";
 				continue;
 			}
 			if(!device_version_check(device_id)) {
@@ -791,24 +816,28 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
 			   device_supported(platform_name, device_id))
 			{
 				cl_device_type device_type;
-				if(clGetDeviceInfo(device_id,
-				                   CL_DEVICE_TYPE,
-				                   sizeof(cl_device_type),
-				                   &device_type,
-				                   NULL) != CL_SUCCESS)
-				{
+				if(!get_device_type(device_id, &device_type, &error)) {
 					FIRST_VLOG(2) << "Ignoring device " << device_name
-					              << ", failed to fetch device type.";
+					              << ", failed to fetch device type:"
+					              << string(clewErrorString(error));
 					continue;
 				}
-				FIRST_VLOG(2) << "Adding new device " << device_name << ".";
+				string readable_device_name =
+				        get_readable_device_name(device_id);
+				if(readable_device_name != device_name) {
+					FIRST_VLOG(2) << "Using more readable device name: "
+					              << readable_device_name;
+				}
+				FIRST_VLOG(2) << "Adding new device "
+				              << readable_device_name << ".";
 				string hardware_id = get_hardware_id(platform_name, device_id);
-				usable_devices->push_back(OpenCLPlatformDevice(platform_id,
-				                                               platform_name,
-				                                               device_id,
-				                                               device_type,
-				                                               device_name,
-				                                               hardware_id));
+				usable_devices->push_back(OpenCLPlatformDevice(
+				        platform_id,
+				        platform_name,
+				        device_id,
+				        device_type,
+				        readable_device_name,
+				        hardware_id));
 			}
 			else {
 				FIRST_VLOG(2) << "Ignoring device " << device_name
@@ -819,6 +848,308 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
 	first_time = false;
 }
 
+bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids,
+                               cl_int *error)
+{
+	/* Reset from possible previous state. */
+	platform_ids->resize(0);
+	cl_uint num_platforms;
+	if(!get_num_platforms(&num_platforms, error)) {
+		return false;
+	}
+	/* Get actual platforms; none is valid, and at(0) would throw then. */
+	cl_int err = CL_SUCCESS;
+	platform_ids->resize(num_platforms);
+	if(num_platforms != 0 &&
+	   (err = clGetPlatformIDs(num_platforms, &platform_ids->at(0), NULL)) != CL_SUCCESS)
+	{
+		if(error != NULL) {
+			*error = err;
+		}
+		return false;
+	}
+	if(error != NULL) {
+		*error = CL_SUCCESS;
+	}
+	return true;
+}
+
+vector<cl_platform_id> OpenCLInfo::get_platforms()
+{
+	vector<cl_platform_id> platform_ids;
+	get_platforms(&platform_ids);
+	return platform_ids;
+}
+
+bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error)
+{
+	cl_int err;
+	if((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) {
+		if(error != NULL) {
+			*error = err;
+		}
+		*num_platforms = 0;
+		return false;
+	}
+	if(error != NULL) {
+		*error = CL_SUCCESS;
+	}
+	return true;
+}
+
+cl_uint OpenCLInfo::get_num_platforms()
+{
+	cl_uint num_platforms;
+	if(!get_num_platforms(&num_platforms)) {
+		return 0;
+	}
+	return num_platforms;
+}
+
+bool OpenCLInfo::get_platform_name(cl_platform_id platform_id,
+                                   string *platform_name)
+{
+	char buffer[256];
+	if(clGetPlatformInfo(platform_id,
+	                     CL_PLATFORM_NAME,
+	                     sizeof(buffer),
+	                     &buffer,
+	                     NULL) != CL_SUCCESS)
+	{
+		*platform_name = "";
+		return false;
+	}
+	*platform_name = buffer;
+	return true;
+}
+
+string OpenCLInfo::get_platform_name(cl_platform_id platform_id)
+{
+	string platform_name;
+	if(!get_platform_name(platform_id, &platform_name)) {
+		return "";
+	}
+	return platform_name;
+}
+
+bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
+                                          cl_device_type device_type,
+                                          cl_uint *num_devices,
+                                          cl_int *error)
+{
+	cl_int err;
+	if((err = clGetDeviceIDs(platform_id,
+	                         device_type,
+	                         0,
+	                         NULL,
+	                         num_devices)) != CL_SUCCESS)
+	{
+		if(error != NULL) {
+			*error = err;
+		}
+		*num_devices = 0;
+		return false;
+	}
+	if(error != NULL) {
+		*error = CL_SUCCESS;
+	}
+	return true;
+}
+
+cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
+                                             cl_device_type device_type)
+{
+	cl_uint num_devices;
+	if(!get_num_platform_devices(platform_id,
+	                             device_type,
+	                             &num_devices))
+	{
+		return 0;
+	}
+	return num_devices;
+}
+
+bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
+                                      cl_device_type device_type,
+                                      vector<cl_device_id> *device_ids,
+                                      cl_int* error)
+{
+	/* Reset from possible previous state. */
+	device_ids->resize(0);
+	/* Get number of devices to pre-allocate memory. */
+	cl_uint num_devices;
+	if(!get_num_platform_devices(platform_id,
+	                             device_type,
+	                             &num_devices,
+	                             error))
+	{
+		return false;
+	}
+	/* Get actual device list. A platform without devices of this type is
+	 * a valid, empty result; skip the query then, since at(0) on an empty
+	 * vector would throw. */
+	device_ids->resize(num_devices);
+	cl_int err = CL_SUCCESS;
+	if(num_devices != 0 &&
+	   (err = clGetDeviceIDs(platform_id, device_type, num_devices,
+	                         &device_ids->at(0), NULL)) != CL_SUCCESS)
+	{
+		if(error != NULL) {
+			*error = err;
+		}
+		return false;
+	}
+	if(error != NULL) {
+		*error = CL_SUCCESS;
+	}
+	return true;
+}
+
+vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
+                                                      cl_device_type device_type)
+{
+	vector<cl_device_id> devices;
+	get_platform_devices(platform_id, device_type, &devices);
+	return devices;
+}
+
+bool OpenCLInfo::get_device_name(cl_device_id device_id,
+                                 string *device_name,
+                                 cl_int* error)
+{
+	char buffer[1024];
+	cl_int err;
+	if((err = clGetDeviceInfo(device_id,
+	                          CL_DEVICE_NAME,
+	                          sizeof(buffer),
+	                          &buffer,
+	                          NULL)) != CL_SUCCESS)
+	{
+		if(error != NULL) {
+			*error = err;
+		}
+		*device_name = "";
+		return false;
+	}
+	if(error != NULL) {
+		*error = CL_SUCCESS;
+	}
+	*device_name = buffer;
+	return true;
+}
+
+string OpenCLInfo::get_device_name(cl_device_id device_id)
+{
+	string device_name;
+	if(!get_device_name(device_id, &device_name)) {
+		return "";
+	}
+	return device_name;
+}
+
+bool OpenCLInfo::get_device_type(cl_device_id device_id,
+                                 cl_device_type *device_type,
+                                 cl_int* error)
+{
+	cl_int err;
+	if((err = clGetDeviceInfo(device_id,
+	                          CL_DEVICE_TYPE,
+	                          sizeof(cl_device_type),
+	                          device_type,
+	                          NULL)) != CL_SUCCESS)
+	{
+		if(error != NULL) {
+			*error = err;
+		}
+		*device_type = 0;
+		return false;
+	}
+	if(error != NULL) {
+		*error = CL_SUCCESS;
+	}
+	return true;
+}
+
+cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id)
+{
+	cl_device_type device_type;
+	if(!get_device_type(device_id, &device_type)) {
+		return 0;
+	}
+	return device_type;
+}
+
+string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
+{
+	string name = "";
+	char board_name[1024];
+	size_t length = 0;
+	if(clGetDeviceInfo(device_id,
+	                   CL_DEVICE_BOARD_NAME_AMD,
+	                   sizeof(board_name),
+	                   &board_name,
+	                   &length) == CL_SUCCESS)
+	{
+		if(length != 0 && board_name[0] != '\0') {
+			name = board_name;
+		}
+	}
+
+	/* Fallback to standard device name API. */
+	if(name.empty()) {
+		name = get_device_name(device_id);
+	}
+
+	/* Distinguish from our native CPU device. */
+	if(get_device_type(device_id) & CL_DEVICE_TYPE_CPU) {
+		name += " (OpenCL)";
+	}
+
+	return name;
+}
+
+bool OpenCLInfo::get_driver_version(cl_device_id device_id,
+                                    int *major,
+                                    int *minor,
+                                    cl_int* error)
+{
+	char buffer[1024];
+	cl_int err;
+	if((err = clGetDeviceInfo(device_id,
+	                          CL_DRIVER_VERSION,
+	                          sizeof(buffer),
+	                          &buffer,
+	                          NULL)) != CL_SUCCESS)
+	{
+		if(error != NULL) {
+			*error = err;
+		}
+		return false;
+	}
+	if(error != NULL) {
+		*error = CL_SUCCESS;
+	}
+	if(sscanf(buffer, "%d.%d", major, minor) < 2) {
+		VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer);
+		return false;
+	}
+	return true;
+}
+
+int OpenCLInfo::mem_sub_ptr_alignment(cl_device_id device_id)
+{
+	int base_align_bits;
+	if(clGetDeviceInfo(device_id,
+	                   CL_DEVICE_MEM_BASE_ADDR_ALIGN,
+	                   sizeof(int),
+	                   &base_align_bits,
+	                   NULL) == CL_SUCCESS)
+	{
+		return base_align_bits/8;
+	}
+	return 1;
+}
+
 CCL_NAMESPACE_END
 
 #endif
diff --git a/intern/cycles/graph/CMakeLists.txt b/intern/cycles/graph/CMakeLists.txt
index 4ea18728f1c..168ca0210e7 100644
--- a/intern/cycles/graph/CMakeLists.txt
+++ b/intern/cycles/graph/CMakeLists.txt
@@ -1,7 +1,6 @@
 
 set(INC
-	.
-	../util
+	..
 )
 
 set(SRC
@@ -20,5 +19,5 @@ set(SRC_HEADERS
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
-add_library(cycles_graph ${SRC} ${SRC_HEADERS})
+cycles_add_library(cycles_graph ${SRC} ${SRC_HEADERS})
 
diff --git a/intern/cycles/graph/node.cpp b/intern/cycles/graph/node.cpp
index 3c228a716d5..b16c6af5d0e 100644
--- a/intern/cycles/graph/node.cpp
+++ b/intern/cycles/graph/node.cpp
@@ -14,12 +14,13 @@
  * limitations under the License.
  */
 
-#include "node.h"
-#include "node_type.h"
+#include "graph/node.h"
+#include "graph/node_type.h"
 
-#include "util_foreach.h"
-#include "util_param.h"
-#include "util_transform.h"
+#include "util/util_foreach.h"
+#include "util/util_md5.h"
+#include "util/util_param.h"
+#include "util/util_transform.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -364,29 +365,48 @@ static bool is_array_equal(const Node *node, const Node *other, const SocketType
 	return *a == *b;
 }
 
+template<typename T>
+static bool is_value_equal(const Node *node, const Node *other, const SocketType& socket)
+{
+	const T *a = (const T*)(((char*)node) + socket.struct_offset);
+	const T *b = (const T*)(((char*)other) + socket.struct_offset);
+	return *a == *b;
+}
+
 bool Node::equals_value(const Node& other, const SocketType& socket) const
 {
-	if(socket.is_array()) {
-		switch(socket.type) {
-			case SocketType::BOOLEAN_ARRAY: return is_array_equal<bool>(this, &other, socket);
-			case SocketType::FLOAT_ARRAY: return is_array_equal<float>(this, &other, socket);
-			case SocketType::INT_ARRAY: return is_array_equal<int>(this, &other, socket);
-			case SocketType::COLOR_ARRAY: return is_array_equal<float3>(this, &other, socket);
-			case SocketType::VECTOR_ARRAY: return is_array_equal<float3>(this, &other, socket);
-			case SocketType::POINT_ARRAY: return is_array_equal<float3>(this, &other, socket);
-			case SocketType::NORMAL_ARRAY: return is_array_equal<float3>(this, &other, socket);
-			case SocketType::POINT2_ARRAY: return is_array_equal<float2>(this, &other, socket);
-			case SocketType::STRING_ARRAY: return is_array_equal<ustring>(this, &other, socket);
-			case SocketType::TRANSFORM_ARRAY: return is_array_equal<Transform>(this, &other, socket);
-			case SocketType::NODE_ARRAY: return is_array_equal<void*>(this, &other, socket);
-			default: assert(0); return true;
-		}
-	}
-	else {
-		const void *a = ((char*)this) + socket.struct_offset;
-		const void *b = ((char*)&other) + socket.struct_offset;
-		return (memcmp(a, b, socket.size()) == 0);
+	switch(socket.type) {
+		case SocketType::BOOLEAN: return is_value_equal<bool>(this, &other, socket);
+		case SocketType::FLOAT: return is_value_equal<float>(this, &other, socket);
+		case SocketType::INT: return is_value_equal<int>(this, &other, socket);
+		case SocketType::UINT: return is_value_equal<uint>(this, &other, socket);
+		case SocketType::COLOR: return is_value_equal<float3>(this, &other, socket);
+		case SocketType::VECTOR: return is_value_equal<float3>(this, &other, socket);
+		case SocketType::POINT: return is_value_equal<float3>(this, &other, socket);
+		case SocketType::NORMAL: return is_value_equal<float3>(this, &other, socket);
+		case SocketType::POINT2: return is_value_equal<float2>(this, &other, socket);
+		case SocketType::CLOSURE: return true;
+		case SocketType::STRING: return is_value_equal<ustring>(this, &other, socket);
+		case SocketType::ENUM: return is_value_equal<int>(this, &other, socket);
+		case SocketType::TRANSFORM: return is_value_equal<Transform>(this, &other, socket);
+		case SocketType::NODE: return is_value_equal<void*>(this, &other, socket);
+
+		case SocketType::BOOLEAN_ARRAY: return is_array_equal<bool>(this, &other, socket);
+		case SocketType::FLOAT_ARRAY: return is_array_equal<float>(this, &other, socket);
+		case SocketType::INT_ARRAY: return is_array_equal<int>(this, &other, socket);
+		case SocketType::COLOR_ARRAY: return is_array_equal<float3>(this, &other, socket);
+		case SocketType::VECTOR_ARRAY: return is_array_equal<float3>(this, &other, socket);
+		case SocketType::POINT_ARRAY: return is_array_equal<float3>(this, &other, socket);
+		case SocketType::NORMAL_ARRAY: return is_array_equal<float3>(this, &other, socket);
+		case SocketType::POINT2_ARRAY: return is_array_equal<float2>(this, &other, socket);
+		case SocketType::STRING_ARRAY: return is_array_equal<ustring>(this, &other, socket);
+		case SocketType::TRANSFORM_ARRAY: return is_array_equal<Transform>(this, &other, socket);
+		case SocketType::NODE_ARRAY: return is_array_equal<void*>(this, &other, socket);
+
+		case SocketType::UNDEFINED: return true;
 	}
+
+	return true;
 }
 
 /* equals */
@@ -403,5 +423,77 @@ bool Node::equals(const Node& other) const
 	return true;
 }
 
+/* Hash */
+
+template<typename T>
+static void value_hash(const Node *node, const SocketType& socket, MD5Hash& md5)
+{
+	md5.append(((uint8_t*)node) + socket.struct_offset, socket.size());
+}
+
+static void float3_hash(const Node *node, const SocketType& socket, MD5Hash& md5)
+{
+	/* Don't compare 4th element used for padding. */
+	md5.append(((uint8_t*)node) + socket.struct_offset, sizeof(float) * 3);
+}
+
+template<typename T>
+static void array_hash(const Node *node, const SocketType& socket, MD5Hash& md5)
+{
+	const array<T>& a = *(const array<T>*)(((char*)node) + socket.struct_offset);
+	for (size_t i = 0; i < a.size(); i++) {
+		md5.append((uint8_t*)&a[i], sizeof(T));
+	}
+}
+
+static void float3_array_hash(const Node *node, const SocketType& socket, MD5Hash& md5)
+{
+	/* Don't compare 4th element used for padding. */
+	const array<float3>& a = *(const array<float3>*)(((char*)node) + socket.struct_offset);
+	for (size_t i = 0; i < a.size(); i++) {
+		md5.append((uint8_t*)&a[i], sizeof(float) * 3);
+	}
+}
+
+void Node::hash(MD5Hash& md5)
+{
+	md5.append(type->name.string());
+
+	foreach(const SocketType& socket, type->inputs) {
+		md5.append(socket.name.string());
+
+		switch(socket.type) {
+			case SocketType::BOOLEAN: value_hash<bool>(this, socket, md5); break;
+			case SocketType::FLOAT: value_hash<float>(this, socket, md5); break;
+			case SocketType::INT: value_hash<int>(this, socket, md5); break;
+			case SocketType::UINT: value_hash<uint>(this, socket, md5); break;
+			case SocketType::COLOR: float3_hash(this, socket, md5); break;
+			case SocketType::VECTOR: float3_hash(this, socket, md5); break;
+			case SocketType::POINT: float3_hash(this, socket, md5); break;
+			case SocketType::NORMAL: float3_hash(this, socket, md5); break;
+			case SocketType::POINT2: value_hash<float2>(this, socket, md5); break;
+			case SocketType::CLOSURE: break;
+			case SocketType::STRING: value_hash<ustring>(this, socket, md5); break;
+			case SocketType::ENUM: value_hash<int>(this, socket, md5); break;
+			case SocketType::TRANSFORM: value_hash<Transform>(this, socket, md5); break;
+			case SocketType::NODE: value_hash<void*>(this, socket, md5); break;
+
+			case SocketType::BOOLEAN_ARRAY: array_hash<bool>(this, socket, md5); break;
+			case SocketType::FLOAT_ARRAY: array_hash<float>(this, socket, md5); break;
+			case SocketType::INT_ARRAY: array_hash<int>(this, socket, md5); break;
+			case SocketType::COLOR_ARRAY: float3_array_hash(this, socket, md5); break;
+			case SocketType::VECTOR_ARRAY: float3_array_hash(this, socket, md5); break;
+			case SocketType::POINT_ARRAY: float3_array_hash(this, socket, md5); break;
+			case SocketType::NORMAL_ARRAY: float3_array_hash(this, socket, md5); break;
+			case SocketType::POINT2_ARRAY: array_hash<float2>(this, socket, md5); break;
+			case SocketType::STRING_ARRAY: array_hash<ustring>(this, socket, md5); break;
+			case SocketType::TRANSFORM_ARRAY: array_hash<Transform>(this, socket, md5); break;
+			case SocketType::NODE_ARRAY: array_hash<void*>(this, socket, md5); break;
+
+			case SocketType::UNDEFINED: break;
+		}
+	}
+}
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/graph/node.h b/intern/cycles/graph/node.h
index 64410f4539b..d198c38be32 100644
--- a/intern/cycles/graph/node.h
+++ b/intern/cycles/graph/node.h
@@ -16,14 +16,15 @@
 
 #pragma once
 
-#include "node_type.h"
+#include "graph/node_type.h"
 
-#include "util_map.h"
-#include "util_param.h"
-#include "util_vector.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
+class MD5Hash;
 struct Node;
 struct NodeType;
 struct Transform;
@@ -88,6 +89,9 @@ struct Node
 	/* equals */
 	bool equals(const Node& other) const;
 
+	/* compute hash of node and its socket values */
+	void hash(MD5Hash& md5);
+
 	ustring name;
 	const NodeType *type;
 };
diff --git a/intern/cycles/graph/node_enum.h b/intern/cycles/graph/node_enum.h
index 2bae531c036..4e40c294f4f 100644
--- a/intern/cycles/graph/node_enum.h
+++ b/intern/cycles/graph/node_enum.h
@@ -16,8 +16,8 @@
 
 #pragma once
 
-#include "util_map.h"
-#include "util_param.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/graph/node_type.cpp b/intern/cycles/graph/node_type.cpp
index 5b98de778ad..37aae211e93 100644
--- a/intern/cycles/graph/node_type.cpp
+++ b/intern/cycles/graph/node_type.cpp
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 
-#include "node_type.h"
-#include "util_foreach.h"
-#include "util_transform.h"
+#include "graph/node_type.h"
+#include "util/util_foreach.h"
+#include "util/util_transform.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -77,7 +77,7 @@ size_t SocketType::max_size()
 
 void *SocketType::zero_default_value()
 {
-	static Transform zero_transform = {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}};
+	static Transform zero_transform = {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}};
 	return &zero_transform;
 }
 
diff --git a/intern/cycles/graph/node_type.h b/intern/cycles/graph/node_type.h
index 1fb135f6d22..7d46e31ce24 100644
--- a/intern/cycles/graph/node_type.h
+++ b/intern/cycles/graph/node_type.h
@@ -16,12 +16,12 @@
 
 #pragma once
 
-#include "node_enum.h"
+#include "graph/node_enum.h"
 
-#include "util_map.h"
-#include "util_param.h"
-#include "util_string.h"
-#include "util_vector.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_string.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/graph/node_xml.cpp b/intern/cycles/graph/node_xml.cpp
index 590e09645ed..f4599e22d40 100644
--- a/intern/cycles/graph/node_xml.cpp
+++ b/intern/cycles/graph/node_xml.cpp
@@ -14,11 +14,11 @@
  * limitations under the License.
  */
 
-#include "node_xml.h"
+#include "graph/node_xml.h"
 
-#include "util_foreach.h"
-#include "util_string.h"
-#include "util_transform.h"
+#include "util/util_foreach.h"
+#include "util/util_string.h"
+#include "util/util_transform.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -33,7 +33,7 @@ static const char *xml_write_boolean(bool value)
 }
 
 template<int VECTOR_SIZE, typename T>
-static void xml_read_float_array(T& value, pugi::xml_attribute attr)
+static void xml_read_float_array(T& value, xml_attribute attr)
 {
 	vector<string> tokens;
 	string_split(tokens, attr.value());
@@ -51,9 +51,9 @@ static void xml_read_float_array(T& value, pugi::xml_attribute attr)
 	}
 }
 
-void xml_read_node(XMLReader& reader, Node *node, pugi::xml_node xml_node)
+void xml_read_node(XMLReader& reader, Node *node, xml_node xml_node)
 {
-	pugi::xml_attribute name_attr = xml_node.attribute("name");
+	xml_attribute name_attr = xml_node.attribute("name");
 	if(name_attr) {
 		node->name = ustring(name_attr.value());
 	}
@@ -66,7 +66,7 @@ void xml_read_node(XMLReader& reader, Node *node, pugi::xml_node xml_node)
 			continue;
 		}
 
-		pugi::xml_attribute attr = xml_node.attribute(socket.name.c_str());
+		xml_attribute attr = xml_node.attribute(socket.name.c_str());
 
 		if(!attr) {
 			continue;
@@ -196,7 +196,7 @@ void xml_read_node(XMLReader& reader, Node *node, pugi::xml_node xml_node)
 			case SocketType::TRANSFORM:
 			{
 				array<Transform> value;
-				xml_read_float_array<16>(value, attr);
+				xml_read_float_array<12>(value, attr);
 				if(value.size() == 1) {
 					node->set(socket, value[0]);
 				}
@@ -205,7 +205,7 @@ void xml_read_node(XMLReader& reader, Node *node, pugi::xml_node xml_node)
 			case SocketType::TRANSFORM_ARRAY:
 			{
 				array<Transform> value;
-				xml_read_float_array<16>(value, attr);
+				xml_read_float_array<12>(value, attr);
 				node->set(socket, value);
 				break;
 			}
@@ -254,9 +254,9 @@ void xml_read_node(XMLReader& reader, Node *node, pugi::xml_node xml_node)
 		reader.node_map[node->name] = node;
 }
 
-pugi::xml_node xml_write_node(Node *node, pugi::xml_node xml_root)
+xml_node xml_write_node(Node *node, xml_node xml_root)
 {
-	pugi::xml_node xml_node = xml_root.append_child(node->type->name.c_str());
+	xml_node xml_node = xml_root.append_child(node->type->name.c_str());
 
 	xml_node.append_attribute("name") = node->name.c_str();
 
@@ -271,7 +271,7 @@ pugi::xml_node xml_write_node(Node *node, pugi::xml_node xml_root)
 			continue;
 		}
 
-		pugi::xml_attribute attr = xml_node.append_attribute(socket.name.c_str());
+		xml_attribute attr = xml_node.append_attribute(socket.name.c_str());
 
 		switch(socket.type)
 		{
@@ -400,12 +400,10 @@ pugi::xml_node xml_write_node(Node *node, pugi::xml_node xml_root)
 			{
 				Transform tfm = node->get_transform(socket);
 				std::stringstream ss;
-				for(int i = 0; i < 4; i++) {
-					ss << string_printf("%g %g %g %g", (double)tfm[i][0], (double)tfm[i][1], (double)tfm[i][2], (double)tfm[i][3]);
-					if(i != 3) {
-						ss << " ";
-					}
+				for(int i = 0; i < 3; i++) {
+					ss << string_printf("%g %g %g %g ", (double)tfm[i][0], (double)tfm[i][1], (double)tfm[i][2], (double)tfm[i][3]);
 				}
+				ss << string_printf("%g %g %g %g", 0.0, 0.0, 0.0, 1.0);
 				attr = ss.str().c_str();
 				break;
 			}
@@ -416,11 +414,12 @@ pugi::xml_node xml_write_node(Node *node, pugi::xml_node xml_root)
 				for(size_t j = 0; j < value.size(); j++) {
 					const Transform& tfm = value[j];
 
-					for(int i = 0; i < 4; i++) {
-						ss << string_printf("%g %g %g %g", (double)tfm[i][0], (double)tfm[i][1], (double)tfm[i][2], (double)tfm[i][3]);
-						if(j != value.size() - 1 || i != 3) {
-							ss << " ";
-						}
+					for(int i = 0; i < 3; i++) {
+						ss << string_printf("%g %g %g %g ", (double)tfm[i][0], (double)tfm[i][1], (double)tfm[i][2], (double)tfm[i][3]);
+					}
+					ss << string_printf("%g %g %g %g", 0.0, 0.0, 0.0, 1.0);
+					if(j != value.size() - 1) {
+						ss << " ";
 					}
 				}
 				attr = ss.str().c_str();
diff --git a/intern/cycles/graph/node_xml.h b/intern/cycles/graph/node_xml.h
index 7494c5e6e55..b648c9666c1 100644
--- a/intern/cycles/graph/node_xml.h
+++ b/intern/cycles/graph/node_xml.h
@@ -16,11 +16,11 @@
 
 #pragma once
 
-#include "node.h"
+#include "graph/node.h"
 
-#include "util_map.h"
-#include "util_string.h"
-#include "util_xml.h"
+#include "util/util_map.h"
+#include "util/util_string.h"
+#include "util/util_xml.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -28,8 +28,8 @@ struct XMLReader {
 	map<ustring, Node*> node_map;
 };
 
-void xml_read_node(XMLReader& reader, Node *node, pugi::xml_node xml_node);
-pugi::xml_node xml_write_node(Node *node, pugi::xml_node xml_root);
+void xml_read_node(XMLReader& reader, Node *node, xml_node xml_node);
+xml_node xml_write_node(Node *node, xml_node xml_root);
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 29e0f44841e..9b7f4e00084 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -1,45 +1,78 @@
 remove_extra_strict_flags()
 
 set(INC
-	.
-	../util
-	osl
-	svm
+	..
 )
 
 set(INC_SYS
 
 )
 
-set(SRC
+set(SRC_CPU_KERNELS
 	kernels/cpu/kernel.cpp
+	kernels/cpu/kernel_sse2.cpp
+	kernels/cpu/kernel_sse3.cpp
+	kernels/cpu/kernel_sse41.cpp
+	kernels/cpu/kernel_avx.cpp
+	kernels/cpu/kernel_avx2.cpp
+	kernels/cpu/kernel_split.cpp
+	kernels/cpu/kernel_split_sse2.cpp
+	kernels/cpu/kernel_split_sse3.cpp
+	kernels/cpu/kernel_split_sse41.cpp
+	kernels/cpu/kernel_split_avx.cpp
+	kernels/cpu/kernel_split_avx2.cpp
+	kernels/cpu/filter.cpp
+	kernels/cpu/filter_sse2.cpp
+	kernels/cpu/filter_sse3.cpp
+	kernels/cpu/filter_sse41.cpp
+	kernels/cpu/filter_avx.cpp
+	kernels/cpu/filter_avx2.cpp
+)
+
+set(SRC_CUDA_KERNELS
+	kernels/cuda/kernel.cu
+	kernels/cuda/kernel_split.cu
+	kernels/cuda/filter.cu
+)
+
+set(SRC_OPENCL_KERNELS
 	kernels/opencl/kernel.cl
+	kernels/opencl/kernel_state_buffer_size.cl
+	kernels/opencl/kernel_split.cl
 	kernels/opencl/kernel_data_init.cl
+	kernels/opencl/kernel_path_init.cl
 	kernels/opencl/kernel_queue_enqueue.cl
 	kernels/opencl/kernel_scene_intersect.cl
 	kernels/opencl/kernel_lamp_emission.cl
-	kernels/opencl/kernel_background_buffer_update.cl
+	kernels/opencl/kernel_do_volume.cl
+	kernels/opencl/kernel_indirect_background.cl
+	kernels/opencl/kernel_shader_setup.cl
+	kernels/opencl/kernel_shader_sort.cl
 	kernels/opencl/kernel_shader_eval.cl
 	kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+	kernels/opencl/kernel_subsurface_scatter.cl
 	kernels/opencl/kernel_direct_lighting.cl
-	kernels/opencl/kernel_shadow_blocked.cl
+	kernels/opencl/kernel_shadow_blocked_ao.cl
+	kernels/opencl/kernel_shadow_blocked_dl.cl
+	kernels/opencl/kernel_enqueue_inactive.cl
 	kernels/opencl/kernel_next_iteration_setup.cl
-	kernels/opencl/kernel_sum_all_radiance.cl
-	kernels/cuda/kernel.cu
+	kernels/opencl/kernel_indirect_subsurface.cl
+	kernels/opencl/kernel_buffer_update.cl
+	kernels/opencl/filter.cl
 )
 
 set(SRC_BVH_HEADERS
 	bvh/bvh.h
 	bvh/bvh_nodes.h
 	bvh/bvh_shadow_all.h
-	bvh/bvh_subsurface.h
+	bvh/bvh_local.h
 	bvh/bvh_traversal.h
 	bvh/bvh_types.h
 	bvh/bvh_volume.h
 	bvh/bvh_volume_all.h
 	bvh/qbvh_nodes.h
 	bvh/qbvh_shadow_all.h
-	bvh/qbvh_subsurface.h
+	bvh/qbvh_local.h
 	bvh/qbvh_traversal.h
 	bvh/qbvh_volume.h
 	bvh/qbvh_volume_all.h
@@ -52,12 +85,10 @@ set(SRC_HEADERS
 	kernel_compat_cpu.h
 	kernel_compat_cuda.h
 	kernel_compat_opencl.h
-	kernel_debug.h
 	kernel_differential.h
 	kernel_emission.h
 	kernel_film.h
 	kernel_globals.h
-	kernel_image_opencl.h
 	kernel_jitter.h
 	kernel_light.h
 	kernel_math.h
@@ -68,6 +99,7 @@ set(SRC_HEADERS
 	kernel_path_common.h
 	kernel_path_state.h
 	kernel_path_surface.h
+	kernel_path_subsurface.h
 	kernel_path_volume.h
 	kernel_projection.h
 	kernel_queues.h
@@ -86,6 +118,18 @@ set(SRC_KERNELS_CPU_HEADERS
 	kernels/cpu/kernel_cpu.h
 	kernels/cpu/kernel_cpu_impl.h
 	kernels/cpu/kernel_cpu_image.h
+	kernels/cpu/filter_cpu.h
+	kernels/cpu/filter_cpu_impl.h
+)
+
+set(SRC_KERNELS_CUDA_HEADERS
+	kernels/cuda/kernel_config.h
+	kernels/cuda/kernel_cuda_image.h
+)
+
+set(SRC_KERNELS_OPENCL_HEADERS
+	kernels/opencl/kernel_split_function.h
+	kernels/opencl/kernel_opencl_image.h
 )
 
 set(SRC_CLOSURE_HEADERS
@@ -109,11 +153,14 @@ set(SRC_CLOSURE_HEADERS
 	closure/bssrdf.h
 	closure/emissive.h
 	closure/volume.h
+	closure/bsdf_principled_diffuse.h
+	closure/bsdf_principled_sheen.h
 )
 
 set(SRC_SVM_HEADERS
 	svm/svm.h
 	svm/svm_attribute.h
+	svm/svm_bevel.h
 	svm/svm_blackbody.h
 	svm/svm_bump.h
 	svm/svm_camera.h
@@ -162,6 +209,7 @@ set(SRC_GEOM_HEADERS
 	geom/geom.h
 	geom/geom_attribute.h
 	geom/geom_curve.h
+	geom/geom_curve_intersect.h
 	geom/geom_motion_curve.h
 	geom/geom_motion_triangle.h
 	geom/geom_motion_triangle_intersect.h
@@ -175,42 +223,102 @@ set(SRC_GEOM_HEADERS
 	geom/geom_volume.h
 )
 
+set(SRC_FILTER_HEADERS
+	filter/filter.h
+	filter/filter_defines.h
+	filter/filter_features.h
+	filter/filter_features_sse.h
+	filter/filter_kernel.h
+	filter/filter_nlm_cpu.h
+	filter/filter_nlm_gpu.h
+	filter/filter_prefilter.h
+	filter/filter_reconstruction.h
+	filter/filter_transform.h
+	filter/filter_transform_gpu.h
+	filter/filter_transform_sse.h
+)
+
 set(SRC_UTIL_HEADERS
 	../util/util_atomic.h
 	../util/util_color.h
+	../util/util_defines.h
 	../util/util_half.h
 	../util/util_hash.h
 	../util/util_math.h
 	../util/util_math_fast.h
+	../util/util_math_intersect.h
+	../util/util_math_float2.h
+	../util/util_math_float3.h
+	../util/util_math_float4.h
+	../util/util_math_int2.h
+	../util/util_math_int3.h
+	../util/util_math_int4.h
+	../util/util_math_matrix.h
+	../util/util_projection.h
+	../util/util_rect.h
 	../util/util_static_assert.h
 	../util/util_transform.h
 	../util/util_texture.h
 	../util/util_types.h
+	../util/util_types_float2.h
+	../util/util_types_float2_impl.h
+	../util/util_types_float3.h
+	../util/util_types_float3_impl.h
+	../util/util_types_float4.h
+	../util/util_types_float4_impl.h
+	../util/util_types_int2.h
+	../util/util_types_int2_impl.h
+	../util/util_types_int3.h
+	../util/util_types_int3_impl.h
+	../util/util_types_int4.h
+	../util/util_types_int4_impl.h
+	../util/util_types_uchar2.h
+	../util/util_types_uchar2_impl.h
+	../util/util_types_uchar3.h
+	../util/util_types_uchar3_impl.h
+	../util/util_types_uchar4.h
+	../util/util_types_uchar4_impl.h
+	../util/util_types_uint2.h
+	../util/util_types_uint2_impl.h
+	../util/util_types_uint3.h
+	../util/util_types_uint3_impl.h
+	../util/util_types_uint4.h
+	../util/util_types_uint4_impl.h
+	../util/util_types_vector3.h
+	../util/util_types_vector3_impl.h
 )
 
 set(SRC_SPLIT_HEADERS
-	split/kernel_background_buffer_update.h
+	split/kernel_branched.h
+	split/kernel_buffer_update.h
 	split/kernel_data_init.h
 	split/kernel_direct_lighting.h
+	split/kernel_do_volume.h
+	split/kernel_enqueue_inactive.h
 	split/kernel_holdout_emission_blurring_pathtermination_ao.h
+	split/kernel_indirect_background.h
+	split/kernel_indirect_subsurface.h
 	split/kernel_lamp_emission.h
 	split/kernel_next_iteration_setup.h
+	split/kernel_path_init.h
+	split/kernel_queue_enqueue.h
 	split/kernel_scene_intersect.h
+	split/kernel_shader_setup.h
+	split/kernel_shader_sort.h
 	split/kernel_shader_eval.h
-	split/kernel_shadow_blocked.h
+	split/kernel_shadow_blocked_ao.h
+	split/kernel_shadow_blocked_dl.h
 	split/kernel_split_common.h
-	split/kernel_sum_all_radiance.h
+	split/kernel_split_data.h
+	split/kernel_split_data_types.h
+	split/kernel_subsurface_scatter.h
 )
 
 # CUDA module
 
 if(WITH_CYCLES_CUDA_BINARIES)
-	# 32 bit or 64 bit
-	if(CUDA_64_BIT_DEVICE_CODE)
-		set(CUDA_BITS 64)
-	else()
-		set(CUDA_BITS 32)
-	endif()
+	# 64 bit only
+	set(CUDA_BITS 64)
 
 	# CUDA version
 	execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT)
@@ -219,7 +327,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")
 
 	# warn for other versions
-	if(CUDA_VERSION MATCHES "80")
+	if(CUDA_VERSION MATCHES "80" OR CUDA_VERSION MATCHES "90")
 	else()
 		message(WARNING
 			"CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, "
@@ -227,74 +335,104 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	endif()
 
 	# build for each arch
-	set(cuda_sources kernels/cuda/kernel.cu
+	set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu
 		${SRC_HEADERS}
+		${SRC_KERNELS_CUDA_HEADERS}
 		${SRC_BVH_HEADERS}
 		${SRC_SVM_HEADERS}
 		${SRC_GEOM_HEADERS}
 		${SRC_CLOSURE_HEADERS}
 		${SRC_UTIL_HEADERS}
 	)
+	set(cuda_filter_sources kernels/cuda/filter.cu
+		${SRC_HEADERS}
+		${SRC_KERNELS_CUDA_HEADERS}
+		${SRC_FILTER_HEADERS}
+		${SRC_UTIL_HEADERS}
+	)
 	set(cuda_cubins)
 
-	macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
+	macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental)
+		set(cuda_cubin ${name}_${arch}.cubin)
+		set(cuda_kernel_src "/kernels/cuda/${name}.cu")
+
+		set(cuda_flags
+			-D CCL_NAMESPACE_BEGIN=
+			-D CCL_NAMESPACE_END=
+			-D NVCC
+			-m ${CUDA_BITS}
+			-I ${CMAKE_CURRENT_SOURCE_DIR}/..
+			-I ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda
+			--use_fast_math
+			-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin})
+
 		if(${experimental})
-			set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__")
-			set(cuda_cubin kernel_experimental_${arch}.cubin)
-		else()
-			set(cuda_extra_flags "")
-			set(cuda_cubin kernel_${arch}.cubin)
+			set(cuda_flags ${cuda_flags} -D __KERNEL_EXPERIMENTAL__)
+			set(name ${name}_experimental)
 		endif()
 
 		if(WITH_CYCLES_DEBUG)
-			set(cuda_debug_flags "-D__KERNEL_DEBUG__")
-		else()
-			set(cuda_debug_flags "")
+			set(cuda_flags ${cuda_flags} -D __KERNEL_DEBUG__)
 		endif()
 
-		set(cuda_nvcc_command ${CUDA_NVCC_EXECUTABLE})
-		set(cuda_nvcc_version ${CUDA_VERSION})
-
-		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
-		set(cuda_math_flags "--use_fast_math")
-
-		add_custom_command(
-			OUTPUT ${cuda_cubin}
-			COMMAND ${cuda_nvcc_command}
-					-arch=${arch}
-					${CUDA_NVCC_FLAGS}
-					-m${CUDA_BITS}
-					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu
-					-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
-					--ptxas-options="-v"
-					${cuda_arch_flags}
-					${cuda_version_flags}
-					${cuda_math_flags}
-					${cuda_extra_flags}
-					${cuda_debug_flags}
-					-I${CMAKE_CURRENT_SOURCE_DIR}/../util
-					-I${CMAKE_CURRENT_SOURCE_DIR}/svm
-					-DCCL_NAMESPACE_BEGIN=
-					-DCCL_NAMESPACE_END=
-					-DNVCC
-			DEPENDS ${cuda_sources})
-
+		if(WITH_CYCLES_CUBIN_COMPILER)
+			string(SUBSTRING ${arch} 3 -1 CUDA_ARCH)
+
+			# Needed to find libnvrtc-builtins.so. Can't do it from inside
+			# cycles_cubin_cc since the env variable is read before main()
+			if(APPLE)
+				set(CUBIN_CC_ENV ${CMAKE_COMMAND}
+					-E env DYLD_LIBRARY_PATH="${CUDA_TOOLKIT_ROOT_DIR}/lib")
+			elseif(UNIX)
+				set(CUBIN_CC_ENV ${CMAKE_COMMAND}
+					-E env LD_LIBRARY_PATH="${CUDA_TOOLKIT_ROOT_DIR}/lib64")
+			endif()
+
+			add_custom_command(
+				OUTPUT ${cuda_cubin}
+				COMMAND ${CUBIN_CC_ENV}
+						"$<TARGET_FILE:cycles_cubin_cc>"
+						-target ${CUDA_ARCH}
+						-i ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
+						${cuda_flags}
+						-v
+						-cuda-toolkit-dir "${CUDA_TOOLKIT_ROOT_DIR}"
+				DEPENDS ${sources} cycles_cubin_cc)
+		else()
+			add_custom_command(
+				OUTPUT ${cuda_cubin}
+				COMMAND ${CUDA_NVCC_EXECUTABLE}
+						-arch=${arch}
+						${CUDA_NVCC_FLAGS}
+						--cubin
+						${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
+						--ptxas-options="-v"
+						${cuda_flags}
+				DEPENDS ${sources})
+		endif()
 		delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
 		list(APPEND cuda_cubins ${cuda_cubin})
 
-		unset(cuda_extra_flags)
 		unset(cuda_debug_flags)
-
-		unset(cuda_nvcc_command)
-		unset(cuda_nvcc_version)
 	endmacro()
 
 	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
-		# Compile regular kernel
-		CYCLES_CUDA_KERNEL_ADD(${arch} FALSE)
+		if(${arch} MATCHES "sm_2.")
+			message(STATUS "CUDA binaries for ${arch} are no longer supported, skipped.")
+		else()
+			# Compile regular and filter kernels
+			CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE)
+			CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE)
+
+			if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
+				# Split kernel; the source list is quoted so it binds to one macro argument
+				CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D __SPLIT__" "${cuda_sources}" FALSE)
+			endif()
+		endif()
 	endforeach()
 
 	add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins})
+	cycles_set_solution_folder(cycles_kernel_cuda)
 endif()
 
 # OSL module
@@ -309,38 +447,45 @@ endif()
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
-if(CXX_HAS_SSE)
-	list(APPEND SRC
-		kernels/cpu/kernel_sse2.cpp
-		kernels/cpu/kernel_sse3.cpp
-		kernels/cpu/kernel_sse41.cpp
-	)
+set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
+set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
+set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
 
+if(CXX_HAS_SSE)
 	set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
 	set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
 	set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
 endif()
 
 if(CXX_HAS_AVX)
-	list(APPEND SRC
-		kernels/cpu/kernel_avx.cpp
-	)
 	set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
 endif()
 
 if(CXX_HAS_AVX2)
-	list(APPEND SRC
-		kernels/cpu/kernel_avx2.cpp
-	)
 	set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
 endif()
 
-add_library(cycles_kernel
-	${SRC}
+cycles_add_library(cycles_kernel
+	${SRC_CPU_KERNELS}
+	${SRC_CUDA_KERNELS}
+	${SRC_OPENCL_KERNELS}
 	${SRC_HEADERS}
 	${SRC_KERNELS_CPU_HEADERS}
+	${SRC_KERNELS_CUDA_HEADERS}
+	${SRC_KERNELS_OPENCL_HEADERS}
 	${SRC_BVH_HEADERS}
 	${SRC_CLOSURE_HEADERS}
+	${SRC_FILTER_HEADERS}
 	${SRC_SVM_HEADERS}
 	${SRC_GEOM_HEADERS}
 	${SRC_SPLIT_HEADERS}
@@ -360,24 +505,16 @@ endif()
 #add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED})
 #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
 
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_background_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/split)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPENCL_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CUDA_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPENCL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/split)
 
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 321983c1abc..d3e0b25a200 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -27,76 +27,76 @@
 
 CCL_NAMESPACE_BEGIN
 
-#include "bvh_types.h"
+#include "kernel/bvh/bvh_types.h"
 
 /* Common QBVH functions. */
 #ifdef __QBVH__
-#  include "qbvh_nodes.h"
+#  include "kernel/bvh/qbvh_nodes.h"
 #endif
 
 /* Regular BVH traversal */
 
-#include "bvh_nodes.h"
+#include "kernel/bvh/bvh_nodes.h"
 
 #define BVH_FUNCTION_NAME bvh_intersect
 #define BVH_FUNCTION_FEATURES 0
-#include "bvh_traversal.h"
+#include "kernel/bvh/bvh_traversal.h"
 
 #if defined(__INSTANCING__)
 #  define BVH_FUNCTION_NAME bvh_intersect_instancing
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif
 
 #if defined(__HAIR__)
 #  define BVH_FUNCTION_NAME bvh_intersect_hair
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif
 
 #if defined(__OBJECT_MOTION__)
 #  define BVH_FUNCTION_NAME bvh_intersect_motion
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif
 
 #if defined(__HAIR__) && defined(__OBJECT_MOTION__)
 #  define BVH_FUNCTION_NAME bvh_intersect_hair_motion
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif
 
 /* Subsurface scattering BVH traversal */
 
-#if defined(__SUBSURFACE__)
-#  define BVH_FUNCTION_NAME bvh_intersect_subsurface
+#if defined(__BVH_LOCAL__)
+#  define BVH_FUNCTION_NAME bvh_intersect_local
 #  define BVH_FUNCTION_FEATURES BVH_HAIR
-#  include "bvh_subsurface.h"
+#  include "kernel/bvh/bvh_local.h"
 
 #  if defined(__OBJECT_MOTION__)
-#    define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion
+#    define BVH_FUNCTION_NAME bvh_intersect_local_motion
 #    define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR
-#    include "bvh_subsurface.h"
+#    include "kernel/bvh/bvh_local.h"
 #  endif
-#endif  /* __SUBSURFACE__ */
+#endif  /* __BVH_LOCAL__ */
 
 /* Volume BVH traversal */
 
 #if defined(__VOLUME__)
 #  define BVH_FUNCTION_NAME bvh_intersect_volume
 #  define BVH_FUNCTION_FEATURES BVH_HAIR
-#  include "bvh_volume.h"
+#  include "kernel/bvh/bvh_volume.h"
 
 #  if defined(__INSTANCING__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#    include "bvh_volume.h"
+#    include "kernel/bvh/bvh_volume.h"
 #  endif
 
 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
-#    include "bvh_volume.h"
+#    include "kernel/bvh/bvh_volume.h"
 #  endif
 #endif  /* __VOLUME__ */
 
@@ -105,30 +105,30 @@ CCL_NAMESPACE_BEGIN
 #if defined(__SHADOW_RECORD_ALL__)
 #  define BVH_FUNCTION_NAME bvh_intersect_shadow_all
 #  define BVH_FUNCTION_FEATURES 0
-#  include "bvh_shadow_all.h"
+#  include "kernel/bvh/bvh_shadow_all.h"
 
 #  if defined(__INSTANCING__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif
 
 #  if defined(__HAIR__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif
 
 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif
 
 #  if defined(__HAIR__) && defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif
 #endif  /* __SHADOW_RECORD_ALL__ */
 
@@ -137,18 +137,18 @@ CCL_NAMESPACE_BEGIN
 #if defined(__VOLUME_RECORD_ALL__)
 #  define BVH_FUNCTION_NAME bvh_intersect_volume_all
 #  define BVH_FUNCTION_FEATURES BVH_HAIR
-#  include "bvh_volume_all.h"
+#  include "kernel/bvh/bvh_volume_all.h"
 
 #  if defined(__INSTANCING__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#    include "bvh_volume_all.h"
+#    include "kernel/bvh/bvh_volume_all.h"
 #  endif
 
 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
-#    include "bvh_volume_all.h"
+#    include "kernel/bvh/bvh_volume_all.h"
 #  endif
 #endif  /* __VOLUME_RECORD_ALL__ */
 
@@ -201,58 +201,92 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
 #endif /* __KERNEL_CPU__ */
 }
 
-#ifdef __SUBSURFACE__
-ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
-                                                     const Ray *ray,
-                                                     SubsurfaceIntersection *ss_isect,
-                                                     int subsurface_object,
-                                                     uint *lcg_state,
-                                                     int max_hits)
+#ifdef __BVH_LOCAL__
+/* Note: ray is passed by value to work around a possible CUDA compiler bug. */
+ccl_device_intersect void scene_intersect_local(KernelGlobals *kg,
+                                                const Ray ray,
+                                                LocalIntersection *local_isect,
+                                                int local_object,
+                                                uint *lcg_state,
+                                                int max_hits)
 {
 #ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
-		return bvh_intersect_subsurface_motion(kg,
-		                                       ray,
-		                                       ss_isect,
-		                                       subsurface_object,
-		                                       lcg_state,
-		                                       max_hits);
+		return bvh_intersect_local_motion(kg,
+		                                  &ray,
+		                                  local_isect,
+		                                  local_object,
+		                                  lcg_state,
+		                                  max_hits);
 	}
 #endif /* __OBJECT_MOTION__ */
-	return bvh_intersect_subsurface(kg,
-	                                ray,
-	                                ss_isect,
-	                                subsurface_object,
-	                                lcg_state,
-	                                max_hits);
+	return bvh_intersect_local(kg,
+	                            &ray,
+	                            local_isect,
+	                            local_object,
+	                            lcg_state,
+	                            max_hits);
 }
 #endif
 
 #ifdef __SHADOW_RECORD_ALL__
-ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits)
+ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
+                                                     const Ray *ray,
+                                                     Intersection *isect,
+                                                     uint visibility,
+                                                     uint max_hits,
+                                                     uint *num_hits)
 {
 #  ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
 #    ifdef __HAIR__
-		if(kernel_data.bvh.have_curves)
-			return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, max_hits, num_hits);
+		if(kernel_data.bvh.have_curves) {
+			return bvh_intersect_shadow_all_hair_motion(kg,
+			                                            ray,
+			                                            isect,
+			                                            visibility,
+			                                            max_hits,
+			                                            num_hits);
+		}
 #    endif /* __HAIR__ */
 
-		return bvh_intersect_shadow_all_motion(kg, ray, isect, max_hits, num_hits);
+		return bvh_intersect_shadow_all_motion(kg,
+		                                       ray,
+		                                       isect,
+		                                       visibility,
+		                                       max_hits,
+		                                       num_hits);
 	}
 #  endif /* __OBJECT_MOTION__ */
 
 #  ifdef __HAIR__
-	if(kernel_data.bvh.have_curves)
-		return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits);
+	if(kernel_data.bvh.have_curves) {
+		return bvh_intersect_shadow_all_hair(kg,
+		                                     ray,
+		                                     isect,
+		                                     visibility,
+		                                     max_hits,
+		                                     num_hits);
+	}
 #  endif /* __HAIR__ */
 
 #  ifdef __INSTANCING__
-	if(kernel_data.bvh.have_instancing)
-		return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
+	if(kernel_data.bvh.have_instancing) {
+		return bvh_intersect_shadow_all_instancing(kg,
+		                                           ray,
+		                                           isect,
+		                                           visibility,
+		                                           max_hits,
+		                                           num_hits);
+	}
 #  endif /* __INSTANCING__ */
 
-	return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
+	return bvh_intersect_shadow_all(kg,
+	                                ray,
+	                                isect,
+	                                visibility,
+	                                max_hits,
+	                                num_hits);
 }
 #endif  /* __SHADOW_RECORD_ALL__ */
 
diff --git a/intern/cycles/kernel/bvh/bvh_subsurface.h b/intern/cycles/kernel/bvh/bvh_local.h
index 889bbca21e2..9292cc76a5c 100644
--- a/intern/cycles/kernel/bvh/bvh_subsurface.h
+++ b/intern/cycles/kernel/bvh/bvh_local.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_subsurface.h"
+#  include "kernel/bvh/qbvh_local.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -27,9 +27,10 @@
 #  define NODE_INTERSECT bvh_aligned_node_intersect
 #endif
 
-/* This is a template BVH traversal function for subsurface scattering, where
- * various features can be enabled/disabled. This way we can compile optimized
- * versions for each case without new features slowing things down.
+/* This is a template BVH traversal function for finding local intersections
+ * around the shading point, for subsurface scattering and bevel. We disable
+ * various features for performance, and for instanced objects avoid traversing
+ * other parts of the scene.
  *
  * BVH_MOTION: motion blur rendering
  *
@@ -42,8 +43,8 @@ ccl_device_inline
 #endif
 void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
                                  const Ray *ray,
-                                 SubsurfaceIntersection *ss_isect,
-                                 int subsurface_object,
+                                 LocalIntersection *local_isect,
+                                 int local_object,
                                  uint *lcg_state,
                                  int max_hits)
 {
@@ -60,7 +61,7 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
 	/* traversal variables in registers */
 	int stack_ptr = 0;
-	int node_addr = kernel_tex_fetch(__object_node, subsurface_object);
+	int node_addr = kernel_tex_fetch(__object_node, local_object);
 
 	/* ray parameters in registers */
 	float3 P = ray->P;
@@ -69,24 +70,24 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	int object = OBJECT_NONE;
 	float isect_t = ray->t;
 
-	ss_isect->num_hits = 0;
+	local_isect->num_hits = 0;
 
-	const int object_flag = kernel_tex_fetch(__object_flag, subsurface_object);
+	const int object_flag = kernel_tex_fetch(__object_flag, local_object);
 	if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 #if BVH_FEATURE(BVH_MOTION)
 		Transform ob_itfm;
-		bvh_instance_motion_push(kg,
-		                         subsurface_object,
-		                         ray,
-		                         &P,
-		                         &dir,
-		                         &idir,
-		                         &isect_t,
-		                         &ob_itfm);
+		isect_t = bvh_instance_motion_push(kg,
+		                                   local_object,
+		                                   ray,
+		                                   &P,
+		                                   &dir,
+		                                   &idir,
+		                                   isect_t,
+		                                   &ob_itfm);
 #else
-		bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t);
+		isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t);
 #endif
-		object = subsurface_object;
+		object = local_object;
 	}
 
 #if defined(__KERNEL_SSE2__)
@@ -109,9 +110,6 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -196,15 +194,16 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						/* intersect ray against primitive */
 						for(; prim_addr < prim_addr2; prim_addr++) {
 							kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-							triangle_intersect_subsurface(kg,
-							                              &isect_precalc,
-							                              ss_isect,
-							                              P,
-							                              object,
-							                              prim_addr,
-							                              isect_t,
-							                              lcg_state,
-							                              max_hits);
+							triangle_intersect_local(kg,
+							                         local_isect,
+							                         P,
+							                         dir,
+							                         object,
+							                         local_object,
+							                         prim_addr,
+							                         isect_t,
+							                         lcg_state,
+							                         max_hits);
 						}
 						break;
 					}
@@ -213,16 +212,17 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						/* intersect ray against primitive */
 						for(; prim_addr < prim_addr2; prim_addr++) {
 							kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-							motion_triangle_intersect_subsurface(kg,
-							                                     ss_isect,
-							                                     P,
-							                                     dir,
-							                                     ray->time,
-							                                     object,
-							                                     prim_addr,
-							                                     isect_t,
-							                                     lcg_state,
-							                                     max_hits);
+							motion_triangle_intersect_local(kg,
+							                                local_isect,
+							                                P,
+							                                dir,
+							                                ray->time,
+							                                object,
+							                                local_object,
+							                                prim_addr,
+							                                isect_t,
+							                                lcg_state,
+							                                max_hits);
 						}
 						break;
 					}
@@ -238,31 +238,30 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
 ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          const Ray *ray,
-                                         SubsurfaceIntersection *ss_isect,
-                                         int subsurface_object,
+                                         LocalIntersection *local_isect,
+                                         int local_object,
                                          uint *lcg_state,
                                          int max_hits)
 {
+	switch(kernel_data.bvh.bvh_layout) {
 #ifdef __QBVH__
-	if(kernel_data.bvh.use_qbvh) {
-		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
-		                                    ray,
-		                                    ss_isect,
-		                                    subsurface_object,
-		                                    lcg_state,
-		                                    max_hits);
-	}
-	else
+		case BVH_LAYOUT_BVH4:
+			return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+			                                    ray,
+			                                    local_isect,
+			                                    local_object,
+			                                    lcg_state,
+			                                    max_hits);
 #endif
-	{
-		kernel_assert(kernel_data.bvh.use_qbvh == false);
-		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
-		                                   ray,
-		                                   ss_isect,
-		                                   subsurface_object,
-		                                   lcg_state,
-		                                   max_hits);
+		case BVH_LAYOUT_BVH2:
+			return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+			                                   ray,
+			                                   local_isect,
+			                                   local_object,
+			                                   lcg_state,
+			                                   max_hits);
 	}
+	kernel_assert(!"Should not happen");
 }
 
 #undef BVH_FUNCTION_NAME
diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h
index 726bef1794c..060b3934a41 100644
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -17,26 +17,25 @@
 // TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and
 // 3-vector which might be faster.
 ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
-                                                           int node_addr,
-                                                           int child)
+                                                                int node_addr,
+                                                                int child)
 {
 	Transform space;
 	const int child_addr = node_addr + child * 3;
 	space.x = kernel_tex_fetch(__bvh_nodes, child_addr+1);
 	space.y = kernel_tex_fetch(__bvh_nodes, child_addr+2);
 	space.z = kernel_tex_fetch(__bvh_nodes, child_addr+3);
-	space.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
 	return space;
 }
 
 #if !defined(__KERNEL_SSE2__)
 ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
-                                                 const float3 P,
-                                                 const float3 idir,
-                                                 const float t,
-                                                 const int node_addr,
-                                                 const uint visibility,
-                                                 float dist[2])
+                                                      const float3 P,
+                                                      const float3 idir,
+                                                      const float t,
+                                                      const int node_addr,
+                                                      const uint visibility,
+                                                      float dist[2])
 {
 
 	/* fetch node data */
@@ -52,8 +51,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
 	float c0hiy = (node1.z - P.y) * idir.y;
 	float c0loz = (node2.x - P.z) * idir.z;
 	float c0hiz = (node2.z - P.z) * idir.z;
-	float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
-	float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+	float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
+	float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));
 
 	float c1lox = (node0.y - P.x) * idir.x;
 	float c1hix = (node0.w - P.x) * idir.x;
@@ -61,8 +60,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
 	float c1hiy = (node1.w - P.y) * idir.y;
 	float c1loz = (node2.y - P.z) * idir.z;
 	float c1hiz = (node2.w - P.z) * idir.z;
-	float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
-	float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+	float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
+	float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));
 
 	dist[0] = c0min;
 	dist[1] = c1min;
@@ -78,14 +77,14 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
-                                                        const float3 P,
-                                                        const float3 idir,
-                                                        const float t,
-                                                        const float difl,
-                                                        const float extmax,
-                                                        const int node_addr,
-                                                        const uint visibility,
-                                                        float dist[2])
+                                                             const float3 P,
+                                                             const float3 idir,
+                                                             const float t,
+                                                             const float difl,
+                                                             const float extmax,
+                                                             const int node_addr,
+                                                             const uint visibility,
+                                                             float dist[2])
 {
 
 	/* fetch node data */
@@ -101,8 +100,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
 	float c0hiy = (node1.z - P.y) * idir.y;
 	float c0loz = (node2.x - P.z) * idir.z;
 	float c0hiz = (node2.z - P.z) * idir.z;
-	float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
-	float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+	float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
+	float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));
 
 	float c1lox = (node0.y - P.x) * idir.x;
 	float c1hix = (node0.w - P.x) * idir.x;
@@ -110,8 +109,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
 	float c1hiy = (node1.w - P.y) * idir.y;
 	float c1loz = (node2.y - P.z) * idir.z;
 	float c1hiz = (node2.w - P.z) * idir.z;
-	float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
-	float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+	float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
+	float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));
 
 	if(difl != 0.0f) {
 		float hdiff = 1.0f + difl;
@@ -203,13 +202,13 @@ ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust(
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
-                                                   const float3 P,
-                                                   const float3 dir,
-                                                   const float3 idir,
-                                                   const float t,
-                                                   const int node_addr,
-                                                   const uint visibility,
-                                                   float dist[2])
+                                                        const float3 P,
+                                                        const float3 dir,
+                                                        const float3 idir,
+                                                        const float t,
+                                                        const int node_addr,
+                                                        const uint visibility,
+                                                        float dist[2])
 {
 	int mask = 0;
 	float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
@@ -233,15 +232,15 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
-                                                          const float3 P,
-                                                          const float3 dir,
-                                                          const float3 idir,
-                                                          const float t,
-                                                          const float difl,
-                                                          const float extmax,
-                                                          const int node_addr,
-                                                          const uint visibility,
-                                                          float dist[2])
+                                                               const float3 P,
+                                                               const float3 dir,
+                                                               const float3 idir,
+                                                               const float t,
+                                                               const float difl,
+                                                               const float extmax,
+                                                               const int node_addr,
+                                                               const uint visibility,
+                                                               float dist[2])
 {
 	int mask = 0;
 	float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
@@ -265,13 +264,13 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg
 }
 
 ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
-                                         const float3 P,
-                                         const float3 dir,
-                                         const float3 idir,
-                                         const float t,
-                                         const int node_addr,
-                                         const uint visibility,
-                                         float dist[2])
+                                              const float3 P,
+                                              const float3 dir,
+                                              const float3 idir,
+                                              const float t,
+                                              const int node_addr,
+                                              const uint visibility,
+                                              float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
@@ -296,15 +295,15 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
-                                                const float3 P,
-                                                const float3 dir,
-                                                const float3 idir,
-                                                const float t,
-                                                const float difl,
-                                                const float extmax,
-                                                const int node_addr,
-                                                const uint visibility,
-                                                float dist[2])
+                                                     const float3 P,
+                                                     const float3 dir,
+                                                     const float3 idir,
+                                                     const float t,
+                                                     const float difl,
+                                                     const float extmax,
+                                                     const int node_addr,
+                                                     const uint visibility,
+                                                     float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
@@ -442,19 +441,19 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
-                                                   const float3 P,
-                                                   const float3 dir,
-                                                   const ssef& isect_near,
-                                                   const ssef& isect_far,
-                                                   const int node_addr,
-                                                   const uint visibility,
-                                                   float dist[2])
+                                                        const float3 P,
+                                                        const float3 dir,
+                                                        const ssef& isect_near,
+                                                        const ssef& isect_far,
+                                                        const int node_addr,
+                                                        const uint visibility,
+                                                        float dist[2])
 {
 	Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
 	Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);
 
 	float3 aligned_dir0 = transform_direction(&space0, dir),
-	       aligned_dir1 = transform_direction(&space1, dir);;
+	       aligned_dir1 = transform_direction(&space1, dir);
 	float3 aligned_P0 = transform_point(&space0, P),
 	       aligned_P1 = transform_point(&space1, P);
 	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
@@ -483,8 +482,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
 	ssef tfar_y = max(lower_y, upper_y);
 	ssef tfar_z = max(lower_z, upper_z);
 
-	const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
-	const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+	const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+	const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
 	sseb vmask = tnear <= tfar;
 	dist[0] = tnear.f[0];
 	dist[1] = tnear.f[1];
@@ -503,20 +502,20 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
-                                                          const float3 P,
-                                                          const float3 dir,
-                                                          const ssef& isect_near,
-                                                          const ssef& isect_far,
-                                                          const float difl,
-                                                          const int node_addr,
-                                                          const uint visibility,
-                                                          float dist[2])
+                                                               const float3 P,
+                                                               const float3 dir,
+                                                               const ssef& isect_near,
+                                                               const ssef& isect_far,
+                                                               const float difl,
+                                                               const int node_addr,
+                                                               const uint visibility,
+                                                               float dist[2])
 {
 	Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
 	Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);
 
 	float3 aligned_dir0 = transform_direction(&space0, dir),
-	       aligned_dir1 = transform_direction(&space1, dir);;
+	       aligned_dir1 = transform_direction(&space1, dir);
 	float3 aligned_P0 = transform_point(&space0, P),
 	       aligned_P1 = transform_point(&space1, P);
 	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
@@ -545,8 +544,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg
 	ssef tfar_y = max(lower_y, upper_y);
 	ssef tfar_z = max(lower_z, upper_z);
 
-	const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
-	const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+	const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+	const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
 	sseb vmask;
 	if(difl != 0.0f) {
 		const float round_down = 1.0f - difl;
@@ -574,17 +573,17 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg
 }
 
 ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
-                                         const float3& P,
-                                         const float3& dir,
-                                         const ssef& isect_near,
-                                         const ssef& isect_far,
-                                         const ssef& tsplat,
-                                         const ssef Psplat[3],
-                                         const ssef idirsplat[3],
-                                         const shuffle_swap_t shufflexyz[3],
-                                         const int node_addr,
-                                         const uint visibility,
-                                         float dist[2])
+                                              const float3& P,
+                                              const float3& dir,
+                                              const ssef& isect_near,
+                                              const ssef& isect_far,
+                                              const ssef& tsplat,
+                                              const ssef Psplat[3],
+                                              const ssef idirsplat[3],
+                                              const shuffle_swap_t shufflexyz[3],
+                                              const int node_addr,
+                                              const uint visibility,
+                                              float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
@@ -612,19 +611,19 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
-                                                const float3& P,
-                                                const float3& dir,
-                                                const ssef& isect_near,
-                                                const ssef& isect_far,
-                                                const ssef& tsplat,
-                                                const ssef Psplat[3],
-                                                const ssef idirsplat[3],
-                                                const shuffle_swap_t shufflexyz[3],
-                                                const float difl,
-                                                const float extmax,
-                                                const int node_addr,
-                                                const uint visibility,
-                                                float dist[2])
+                                                     const float3& P,
+                                                     const float3& dir,
+                                                     const ssef& isect_near,
+                                                     const ssef& isect_far,
+                                                     const ssef& tsplat,
+                                                     const ssef Psplat[3],
+                                                     const ssef idirsplat[3],
+                                                     const shuffle_swap_t shufflexyz[3],
+                                                     const float difl,
+                                                     const float extmax,
+                                                     const int node_addr,
+                                                     const uint visibility,
+                                                     float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index df33a86bb18..cfc567ff9ca 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_shadow_all.h"
+#  include "kernel/bvh/qbvh_shadow_all.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -45,6 +45,7 @@ ccl_device_inline
 bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
                                  const Ray *ray,
                                  Intersection *isect_array,
+                                 const uint visibility,
                                  const uint max_hits,
                                  uint *num_hits)
 {
@@ -100,9 +101,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif  /* __KERNEL_SSE2__ */
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -121,7 +119,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               idir,
 				                               isect_t,
 				                               node_addr,
-				                               PATH_RAY_SHADOW,
+				                               visibility,
 				                               dist);
 #else // __KERNEL_SSE2__
 				traverse_mask = NODE_INTERSECT(kg,
@@ -136,7 +134,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               idirsplat,
 				                               shufflexyz,
 				                               node_addr,
-				                               PATH_RAY_SHADOW,
+				                               visibility,
 				                               dist);
 #endif // __KERNEL_SSE2__
 
@@ -188,7 +186,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					/* primitive intersection */
 					while(prim_addr < prim_addr2) {
 						kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
-
 						bool hit;
 
 						/* todo: specialized intersect functions which don't fill in
@@ -198,10 +195,10 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						switch(p_type) {
 							case PRIMITIVE_TRIANGLE: {
 								hit = triangle_intersect(kg,
-								                         &isect_precalc,
 								                         isect_array,
 								                         P,
-								                         PATH_RAY_SHADOW,
+								                         dir,
+								                         visibility,
 								                         object,
 								                         prim_addr);
 								break;
@@ -213,7 +210,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								                                P,
 								                                dir,
 								                                ray->time,
-								                                PATH_RAY_SHADOW,
+								                                visibility,
 								                                object,
 								                                prim_addr);
 								break;
@@ -224,30 +221,30 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 							case PRIMITIVE_MOTION_CURVE: {
 								const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-									hit = bvh_cardinal_curve_intersect(kg,
-									                                   isect_array,
-									                                   P,
-									                                   dir,
-									                                   PATH_RAY_SHADOW,
-									                                   object,
-									                                   prim_addr,
-									                                   ray->time,
-									                                   curve_type,
-									                                   NULL,
-									                                   0, 0);
+									hit = cardinal_curve_intersect(kg,
+									                               isect_array,
+									                               P,
+									                               dir,
+									                               visibility,
+									                               object,
+									                               prim_addr,
+									                               ray->time,
+									                               curve_type,
+									                               NULL,
+									                               0, 0);
 								}
 								else {
-									hit = bvh_curve_intersect(kg,
-									                          isect_array,
-									                          P,
-									                          dir,
-									                          PATH_RAY_SHADOW,
-									                          object,
-									                          prim_addr,
-									                          ray->time,
-									                          curve_type,
-									                          NULL,
-									                          0, 0);
+									hit = curve_intersect(kg,
+									                      isect_array,
+									                      P,
+									                      dir,
+									                      visibility,
+									                      object,
+									                      prim_addr,
+									                      ray->time,
+									                      curve_type,
+									                      NULL,
+									                      0, 0);
 								}
 								break;
 							}
@@ -279,7 +276,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								shader = __float_as_int(str.z);
 							}
 #endif
-							int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*SHADER_SIZE);
+							int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
 
 							/* if no transparent shadows, all light is blocked */
 							if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
@@ -309,12 +306,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 
 #  if BVH_FEATURE(BVH_MOTION)
-					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+					isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
 #  else
-					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+					isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
 #  endif
 
-					triangle_intersect_precalc(dir, &isect_precalc);
 					num_hits_in_instance = 0;
 					isect_array->t = isect_t;
 
@@ -354,22 +350,17 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
 #  endif
 
-				triangle_intersect_precalc(dir, &isect_precalc);
-
 				/* scale isect->t to adjust for instancing */
 				for(int i = 0; i < num_hits_in_instance; i++) {
 					(isect_array-i-1)->t *= t_fac;
 				}
 			}
 			else {
-				float ignore_t = FLT_MAX;
-
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
 #  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
 			}
 
 			isect_t = tmax;
@@ -400,27 +391,30 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          const Ray *ray,
                                          Intersection *isect_array,
+                                         const uint visibility,
                                          const uint max_hits,
                                          uint *num_hits)
 {
+	switch(kernel_data.bvh.bvh_layout) {
 #ifdef __QBVH__
-	if(kernel_data.bvh.use_qbvh) {
-		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
-		                                    ray,
-		                                    isect_array,
-		                                    max_hits,
-		                                    num_hits);
-	}
-	else
+		case BVH_LAYOUT_BVH4:
+			return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+			                                    ray,
+			                                    isect_array,
+			                                    visibility,
+			                                    max_hits,
+			                                    num_hits);
 #endif
-	{
-		kernel_assert(kernel_data.bvh.use_qbvh == false);
-		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
-		                                   ray,
-		                                   isect_array,
-		                                   max_hits,
-		                                   num_hits);
+		case BVH_LAYOUT_BVH2:
+			return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+			                                   ray,
+			                                   isect_array,
+			                                   visibility,
+			                                   max_hits,
+			                                   num_hits);
 	}
+	kernel_assert(!"Should not happen");
+	return false;
 }
 
 #undef BVH_FUNCTION_NAME
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index 80c8f31473a..551625eae78 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_traversal.h"
+#  include "kernel/bvh/qbvh_traversal.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -104,9 +104,6 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -238,23 +235,23 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								BVH_DEBUG_NEXT_INTERSECTION();
 								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								if(triangle_intersect(kg,
-								                      &isect_precalc,
 								                      isect,
 								                      P,
+								                      dir,
 								                      visibility,
 								                      object,
 								                      prim_addr))
 								{
 									/* shadow ray early termination */
 #if defined(__KERNEL_SSE2__)
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+									if(visibility & PATH_RAY_SHADOW_OPAQUE)
 										return true;
 									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
 #  if BVH_FEATURE(BVH_HAIR)
 									tfar = ssef(isect->t);
 #  endif
 #else
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+									if(visibility & PATH_RAY_SHADOW_OPAQUE)
 										return true;
 #endif
 								}
@@ -277,14 +274,14 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								{
 									/* shadow ray early termination */
 #  if defined(__KERNEL_SSE2__)
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+									if(visibility & PATH_RAY_SHADOW_OPAQUE)
 										return true;
 									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
 #    if BVH_FEATURE(BVH_HAIR)
 									tfar = ssef(isect->t);
 #    endif
 #  else
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+									if(visibility & PATH_RAY_SHADOW_OPAQUE)
 										return true;
 #  endif
 								}
@@ -301,44 +298,44 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
 								bool hit;
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-									hit = bvh_cardinal_curve_intersect(kg,
-									                                   isect,
-									                                   P,
-									                                   dir,
-									                                   visibility,
-									                                   object,
-									                                   prim_addr,
-									                                   ray->time,
-									                                   curve_type,
-									                                   lcg_state,
-									                                   difl,
-									                                   extmax);
+									hit = cardinal_curve_intersect(kg,
+									                               isect,
+									                               P,
+									                               dir,
+									                               visibility,
+									                               object,
+									                               prim_addr,
+									                               ray->time,
+									                               curve_type,
+									                               lcg_state,
+									                               difl,
+									                               extmax);
 								}
 								else {
-									hit = bvh_curve_intersect(kg,
-									                          isect,
-									                          P,
-									                          dir,
-									                          visibility,
-									                          object,
-									                          prim_addr,
-									                          ray->time,
-									                          curve_type,
-									                          lcg_state,
-									                          difl,
-									                          extmax);
+									hit = curve_intersect(kg,
+									                      isect,
+									                      P,
+									                      dir,
+									                      visibility,
+									                      object,
+									                      prim_addr,
+									                      ray->time,
+									                      curve_type,
+									                      lcg_state,
+									                      difl,
+									                      extmax);
 								}
 								if(hit) {
 									/* shadow ray early termination */
 #  if defined(__KERNEL_SSE2__)
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+									if(visibility & PATH_RAY_SHADOW_OPAQUE)
 										return true;
 									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
 #    if BVH_FEATURE(BVH_HAIR)
 									tfar = ssef(isect->t);
 #    endif
 #  else
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+									if(visibility & PATH_RAY_SHADOW_OPAQUE)
 										return true;
 #  endif
 								}
@@ -354,11 +351,10 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 
 #  if BVH_FEATURE(BVH_MOTION)
-					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+					isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+					isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
-					triangle_intersect_precalc(dir, &isect_precalc);
 
 #  if defined(__KERNEL_SSE2__)
 					Psplat[0] = ssef(P.x);
@@ -391,11 +387,10 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
 			/* instance pop */
 #  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
-			triangle_intersect_precalc(dir, &isect_precalc);
 
 #  if defined(__KERNEL_SSE2__)
 			Psplat[0] = ssef(P.x);
@@ -431,34 +426,34 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
 #endif
                                          )
 {
+	switch(kernel_data.bvh.bvh_layout) {
 #ifdef __QBVH__
-	if(kernel_data.bvh.use_qbvh) {
-		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
-		                                    ray,
-		                                    isect,
-		                                    visibility
-#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-		                                    , lcg_state,
-		                                    difl,
-		                                    extmax
-#endif
-		                                    );
-	}
-	else
-#endif
-	{
-		kernel_assert(kernel_data.bvh.use_qbvh == false);
-		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
-		                                   ray,
-		                                   isect,
-		                                   visibility
+		case BVH_LAYOUT_BVH4:
+			return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+			                                    ray,
+			                                    isect,
+			                                    visibility
+#  if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+			                                    , lcg_state,
+			                                    difl,
+			                                    extmax
+#  endif
+			                                    );
+#endif  /* __QBVH__ */
+		case BVH_LAYOUT_BVH2:
+			return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+			                                   ray,
+			                                   isect,
+			                                   visibility
 #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-		                                   , lcg_state,
-		                                   difl,
-		                                   extmax
+			                                   , lcg_state,
+			                                   difl,
+			                                   extmax
 #endif
-		                                   );
+			                                   );
 	}
+	kernel_assert(!"Should not happen");
+	return false;
 }
 
 #undef BVH_FUNCTION_NAME
diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h
index 57e5b8d736d..ce5fc7be33d 100644
--- a/intern/cycles/kernel/bvh/bvh_volume.h
+++ b/intern/cycles/kernel/bvh/bvh_volume.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_volume.h"
+#  include "kernel/bvh/qbvh_volume.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -97,9 +97,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -194,9 +191,9 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 									continue;
 								}
 								triangle_intersect(kg,
-								                   &isect_precalc,
 								                   isect,
 								                   P,
+								                   dir,
 								                   visibility,
 								                   object,
 								                   prim_addr);
@@ -238,13 +235,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					int object_flag = kernel_tex_fetch(__object_flag, object);
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
 #  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+						isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+						isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
-						triangle_intersect_precalc(dir, &isect_precalc);
-
 #  if defined(__KERNEL_SSE2__)
 						Psplat[0] = ssef(P.x);
 						Psplat[1] = ssef(P.y);
@@ -281,13 +276,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
 			/* instance pop */
 #  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 #  if defined(__KERNEL_SSE2__)
 			Psplat[0] = ssef(P.x);
 			Psplat[1] = ssef(P.y);
@@ -316,22 +309,22 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          Intersection *isect,
                                          const uint visibility)
 {
+	switch(kernel_data.bvh.bvh_layout) {
 #ifdef __QBVH__
-	if(kernel_data.bvh.use_qbvh) {
-		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
-		                                    ray,
-		                                    isect,
-		                                    visibility);
-	}
-	else
+		case BVH_LAYOUT_BVH4:
+			return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+			                                    ray,
+			                                    isect,
+			                                    visibility);
 #endif
-	{
-		kernel_assert(kernel_data.bvh.use_qbvh == false);
-		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
-		                                   ray,
-		                                   isect,
-		                                   visibility);
+		case BVH_LAYOUT_BVH2:
+			return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+			                                   ray,
+			                                   isect,
+			                                   visibility);
 	}
+	kernel_assert(!"Should not happen");
+	return false;
 }
 
 #undef BVH_FUNCTION_NAME
diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h
index 5a1accebaa0..f2379efc656 100644
--- a/intern/cycles/kernel/bvh/bvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_volume_all.h"
+#  include "kernel/bvh/qbvh_volume_all.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -101,9 +101,6 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif  /* __KERNEL_SSE2__ */
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -199,9 +196,9 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 									continue;
 								}
 								hit = triangle_intersect(kg,
-								                         &isect_precalc,
 								                         isect_array,
 								                         P,
+								                         dir,
 								                         visibility,
 								                         object,
 								                         prim_addr);
@@ -288,14 +285,12 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 					int object_flag = kernel_tex_fetch(__object_flag, object);
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
-
 #  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+						isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
 #  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+						isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
 #  endif
 
-						triangle_intersect_precalc(dir, &isect_precalc);
 						num_hits_in_instance = 0;
 						isect_array->t = isect_t;
 
@@ -341,20 +336,17 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #  else
 				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
 #  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
 				/* Scale isect->t to adjust for instancing. */
 				for(int i = 0; i < num_hits_in_instance; i++) {
 					(isect_array-i-1)->t *= t_fac;
 				}
 			}
 			else {
-				float ignore_t = FLT_MAX;
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
 #  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
 			}
 
 			isect_t = tmax;
@@ -389,24 +381,24 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          const uint max_hits,
                                          const uint visibility)
 {
+	switch(kernel_data.bvh.bvh_layout) {
 #ifdef __QBVH__
-	if(kernel_data.bvh.use_qbvh) {
-		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
-		                                    ray,
-		                                    isect_array,
-		                                    max_hits,
-		                                    visibility);
-	}
-	else
+		case BVH_LAYOUT_BVH4:
+			return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+			                                    ray,
+			                                    isect_array,
+			                                    max_hits,
+			                                    visibility);
 #endif
-	{
-		kernel_assert(kernel_data.bvh.use_qbvh == false);
-		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
-		                                   ray,
-		                                   isect_array,
-		                                   max_hits,
-		                                   visibility);
+		case BVH_LAYOUT_BVH2:
+			return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+			                                   ray,
+			                                   isect_array,
+			                                   max_hits,
+			                                   visibility);
 	}
+	kernel_assert(!"Should not happen");
+	return 0;
 }
 
 #undef BVH_FUNCTION_NAME
diff --git a/intern/cycles/kernel/bvh/qbvh_subsurface.h b/intern/cycles/kernel/bvh/qbvh_local.h
index 84dc4003133..2386fa1a1e8 100644
--- a/intern/cycles/kernel/bvh/qbvh_subsurface.h
+++ b/intern/cycles/kernel/bvh/qbvh_local.h
@@ -14,9 +14,10 @@
  * limitations under the License.
  */
 
-/* This is a template BVH traversal function for subsurface scattering, where
- * various features can be enabled/disabled. This way we can compile optimized
- * versions for each case without new features slowing things down.
+/* This is a template BVH traversal function for finding local intersections
+ * around the shading point, for subsurface scattering and bevel. We disable
+ * various features for performance, and for instanced objects avoid traversing
+ * other parts of the scene.
  *
  * BVH_MOTION: motion blur rendering
  *
@@ -30,8 +31,8 @@
 
 ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                              const Ray *ray,
-                                             SubsurfaceIntersection *ss_isect,
-                                             int subsurface_object,
+                                             LocalIntersection *local_isect,
+                                             int local_object,
                                              uint *lcg_state,
                                              int max_hits)
 {
@@ -49,7 +50,7 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 	/* Traversal variables in registers. */
 	int stack_ptr = 0;
-	int node_addr = kernel_tex_fetch(__object_node, subsurface_object);
+	int node_addr = kernel_tex_fetch(__object_node, local_object);
 
 	/* Ray parameters in registers. */
 	float3 P = ray->P;
@@ -58,24 +59,24 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	int object = OBJECT_NONE;
 	float isect_t = ray->t;
 
-	ss_isect->num_hits = 0;
+	local_isect->num_hits = 0;
 
-	const int object_flag = kernel_tex_fetch(__object_flag, subsurface_object);
+	const int object_flag = kernel_tex_fetch(__object_flag, local_object);
 	if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 #if BVH_FEATURE(BVH_MOTION)
 		Transform ob_itfm;
-		bvh_instance_motion_push(kg,
-		                         subsurface_object,
-		                         ray,
-		                         &P,
-		                         &dir,
-		                         &idir,
-		                         &isect_t,
-		                         &ob_itfm);
+		isect_t = bvh_instance_motion_push(kg,
+		                                   local_object,
+		                                   ray,
+		                                   &P,
+		                                   &dir,
+		                                   &idir,
+		                                   isect_t,
+		                                   &ob_itfm);
 #else
-		bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t);
+		isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t);
 #endif
-		object = subsurface_object;
+		object = local_object;
 	}
 
 #ifndef __KERNEL_SSE41__
@@ -105,9 +106,6 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* Traversal loop. */
 	do {
 		do {
@@ -252,15 +250,16 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						/* Intersect ray against primitive, */
 						for(; prim_addr < prim_addr2; prim_addr++) {
 							kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-							triangle_intersect_subsurface(kg,
-							                              &isect_precalc,
-							                              ss_isect,
-							                              P,
-							                              object,
-							                              prim_addr,
-							                              isect_t,
-							                              lcg_state,
-							                              max_hits);
+							triangle_intersect_local(kg,
+							                         local_isect,
+							                         P,
+							                         dir,
+							                         object,
+							                         local_object,
+							                         prim_addr,
+							                         isect_t,
+							                         lcg_state,
+							                         max_hits);
 						}
 						break;
 					}
@@ -269,16 +268,17 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						/* Intersect ray against primitive. */
 						for(; prim_addr < prim_addr2; prim_addr++) {
 							kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-							motion_triangle_intersect_subsurface(kg,
-							                                     ss_isect,
-							                                     P,
-							                                     dir,
-							                                     ray->time,
-							                                     object,
-							                                     prim_addr,
-							                                     isect_t,
-							                                     lcg_state,
-							                                     max_hits);
+							motion_triangle_intersect_local(kg,
+							                                local_isect,
+							                                P,
+							                                dir,
+							                                ray->time,
+							                                object,
+							                                local_object,
+							                                prim_addr,
+							                                isect_t,
+							                                lcg_state,
+							                                max_hits);
 						}
 						break;
 					}
diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h
index 6d22f0b0d6a..3036efd4198 100644
--- a/intern/cycles/kernel/bvh/qbvh_nodes.h
+++ b/intern/cycles/kernel/bvh/qbvh_nodes.h
@@ -126,8 +126,8 @@ ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg
 	const sseb vmask = cast(tnear) > cast(tfar);
 	int mask = (int)movemask(vmask)^0xf;
 #else
-	const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
-	const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+	const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+	const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
 	const sseb vmask = tnear <= tfar;
 	int mask = (int)movemask(vmask);
 #endif
@@ -174,8 +174,8 @@ ccl_device_inline int qbvh_aligned_node_intersect_robust(
 
 	const float round_down = 1.0f - difl;
 	const float round_up = 1.0f + difl;
-	const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
-	const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+	const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+	const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
 	const sseb vmask = round_down*tnear <= round_up*tfar;
 	*dist = tnear;
 	return (int)movemask(vmask);
diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
index 607295f9ed5..46fd178aed6 100644
--- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
@@ -33,6 +33,7 @@
 ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                              const Ray *ray,
                                              Intersection *isect_array,
+                                             const uint visibility,
                                              const uint max_hits,
                                              uint *num_hits)
 {
@@ -96,19 +97,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* Traversal loop. */
 	do {
 		do {
 			/* Traverse internal nodes. */
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+				(void)inodes;
 
 				if(false
 #ifdef __VISIBILITY_FLAG__
-				   || ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0)
+				   || ((__float_as_uint(inodes.x) & visibility) == 0)
 #endif
 #if BVH_FEATURE(BVH_MOTION)
 				   || UNLIKELY(ray->time < inodes.y)
@@ -245,7 +244,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			if(node_addr < 0) {
 				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
 #ifdef __VISIBILITY_FLAG__
-				if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) {
+				if((__float_as_uint(leaf.z) & visibility) == 0) {
 					/* Pop. */
 					node_addr = traversal_stack[stack_ptr].addr;
 					--stack_ptr;
@@ -269,7 +268,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					/* Primitive intersection. */
 					while(prim_addr < prim_addr2) {
 						kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
-
 						bool hit;
 
 						/* todo: specialized intersect functions which don't fill in
@@ -279,10 +277,10 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						switch(p_type) {
 							case PRIMITIVE_TRIANGLE: {
 								hit = triangle_intersect(kg,
-								                         &isect_precalc,
 								                         isect_array,
 								                         P,
-								                         PATH_RAY_SHADOW,
+								                         dir,
+								                         visibility,
 								                         object,
 								                         prim_addr);
 								break;
@@ -294,7 +292,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								                                P,
 								                                dir,
 								                                ray->time,
-								                                PATH_RAY_SHADOW,
+								                                visibility,
 								                                object,
 								                                prim_addr);
 								break;
@@ -305,30 +303,30 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 							case PRIMITIVE_MOTION_CURVE: {
 								const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-									hit = bvh_cardinal_curve_intersect(kg,
-									                                   isect_array,
-									                                   P,
-									                                   dir,
-									                                   PATH_RAY_SHADOW,
-									                                   object,
-									                                   prim_addr,
-									                                   ray->time,
-									                                   curve_type,
-									                                   NULL,
-									                                   0, 0);
+									hit = cardinal_curve_intersect(kg,
+									                               isect_array,
+									                               P,
+									                               dir,
+									                               visibility,
+									                               object,
+									                               prim_addr,
+									                               ray->time,
+									                               curve_type,
+									                               NULL,
+									                               0, 0);
 								}
 								else {
-									hit = bvh_curve_intersect(kg,
-									                          isect_array,
-									                          P,
-									                          dir,
-									                          PATH_RAY_SHADOW,
-									                          object,
-									                          prim_addr,
-									                          ray->time,
-									                          curve_type,
-									                          NULL,
-									                          0, 0);
+									hit = curve_intersect(kg,
+									                      isect_array,
+									                      P,
+									                      dir,
+									                      visibility,
+									                      object,
+									                      prim_addr,
+									                      ray->time,
+									                      curve_type,
+									                      NULL,
+									                      0, 0);
 								}
 								break;
 							}
@@ -360,7 +358,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								shader = __float_as_int(str.z);
 							}
 #endif
-							int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*SHADER_SIZE);
+							int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
 
 							/* if no transparent shadows, all light is blocked */
 							if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
@@ -390,9 +388,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 
 #  if BVH_FEATURE(BVH_MOTION)
-					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+					isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
 #  else
-					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+					isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
 #  endif
 
 					num_hits_in_instance = 0;
@@ -414,8 +412,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-					triangle_intersect_precalc(dir, &isect_precalc);
-
 					++stack_ptr;
 					kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
 					traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
@@ -445,11 +441,10 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 				}
 			}
 			else {
-				float ignore_t = FLT_MAX;
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
 #  endif
 			}
 
@@ -472,8 +467,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 			object = OBJECT_NONE;
 			node_addr = traversal_stack[stack_ptr].addr;
 			--stack_ptr;
diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h
index 10ae7bee852..335a4afd47a 100644
--- a/intern/cycles/kernel/bvh/qbvh_traversal.h
+++ b/intern/cycles/kernel/bvh/qbvh_traversal.h
@@ -106,15 +106,13 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* Traversal loop. */
 	do {
 		do {
 			/* Traverse internal nodes. */
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+				(void)inodes;
 
 				if(UNLIKELY(node_dist > isect->t)
 #if BVH_FEATURE(BVH_MOTION)
@@ -122,8 +120,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 				   || UNLIKELY(ray->time > inodes.z)
 #endif
 #ifdef __VISIBILITY_FLAG__
-				   || (__float_as_uint(inodes.x) & visibility) == 0)
+				   || (__float_as_uint(inodes.x) & visibility) == 0
 #endif
+				 )
 				{
 					/* Pop. */
 					node_addr = traversal_stack[stack_ptr].addr;
@@ -333,15 +332,15 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								BVH_DEBUG_NEXT_INTERSECTION();
 								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								if(triangle_intersect(kg,
-								                      &isect_precalc,
 								                      isect,
 								                      P,
+								                      dir,
 								                      visibility,
 								                      object,
 								                      prim_addr)) {
 									tfar = ssef(isect->t);
 									/* Shadow ray early termination. */
-									if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+									if(visibility & PATH_RAY_SHADOW_OPAQUE) {
 										return true;
 									}
 								}
@@ -363,7 +362,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								                             prim_addr)) {
 									tfar = ssef(isect->t);
 									/* Shadow ray early termination. */
-									if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+									if(visibility & PATH_RAY_SHADOW_OPAQUE) {
 										return true;
 									}
 								}
@@ -380,37 +379,37 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
 								bool hit;
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-									hit = bvh_cardinal_curve_intersect(kg,
-									                                   isect,
-									                                   P,
-									                                   dir,
-									                                   visibility,
-									                                   object,
-									                                   prim_addr,
-									                                   ray->time,
-									                                   curve_type,
-									                                   lcg_state,
-									                                   difl,
-									                                   extmax);
+									hit = cardinal_curve_intersect(kg,
+									                               isect,
+									                               P,
+									                               dir,
+									                               visibility,
+									                               object,
+									                               prim_addr,
+									                               ray->time,
+									                               curve_type,
+									                               lcg_state,
+									                               difl,
+									                               extmax);
 								}
 								else {
-									hit = bvh_curve_intersect(kg,
-									                          isect,
-									                          P,
-									                          dir,
-									                          visibility,
-									                          object,
-									                          prim_addr,
-									                          ray->time,
-									                          curve_type,
-									                          lcg_state,
-									                          difl,
-									                          extmax);
+									hit = curve_intersect(kg,
+									                      isect,
+									                      P,
+									                      dir,
+									                      visibility,
+									                      object,
+									                      prim_addr,
+									                      ray->time,
+									                      curve_type,
+									                      lcg_state,
+									                      difl,
+									                      extmax);
 								}
 								if(hit) {
 									tfar = ssef(isect->t);
 									/* Shadow ray early termination. */
-									if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+									if(visibility & PATH_RAY_SHADOW_OPAQUE) {
 										return true;
 									}
 								}
@@ -447,8 +446,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-					triangle_intersect_precalc(dir, &isect_precalc);
-
 					++stack_ptr;
 					kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
 					traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
@@ -468,9 +465,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 			/* Instance pop. */
 #  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
 			qbvh_near_far_idx_calc(idir,
@@ -489,8 +486,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 			object = OBJECT_NONE;
 			node_addr = traversal_stack[stack_ptr].addr;
 			node_dist = traversal_stack[stack_ptr].dist;
diff --git a/intern/cycles/kernel/bvh/qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h
index dc6627e2dbb..192ce009524 100644
--- a/intern/cycles/kernel/bvh/qbvh_volume.h
+++ b/intern/cycles/kernel/bvh/qbvh_volume.h
@@ -91,9 +91,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* Traversal loop. */
 	do {
 		do {
@@ -266,7 +263,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 									continue;
 								}
 								/* Intersect ray against primitive. */
-								triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, prim_addr);
+								triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr);
 							}
 							break;
 						}
@@ -295,9 +292,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					int object_flag = kernel_tex_fetch(__object_flag, object);
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
 #  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+						isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+						isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
 						qbvh_near_far_idx_calc(idir,
@@ -316,8 +313,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-						triangle_intersect_precalc(dir, &isect_precalc);
-
 						++stack_ptr;
 						kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
 						traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
@@ -341,9 +336,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 			/* Instance pop. */
 #  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
 			qbvh_near_far_idx_calc(idir,
@@ -362,8 +357,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 			object = OBJECT_NONE;
 			node_addr = traversal_stack[stack_ptr].addr;
 			--stack_ptr;
diff --git a/intern/cycles/kernel/bvh/qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h
index ff1fa92af6e..ac5f58a9a51 100644
--- a/intern/cycles/kernel/bvh/qbvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h
@@ -95,9 +95,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* Traversal loop. */
 	do {
 		do {
@@ -271,7 +268,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 									continue;
 								}
 								/* Intersect ray against primitive. */
-								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, prim_addr);
+								hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
 								if(hit) {
 									/* Move on to next entry in intersections array. */
 									isect_array++;
@@ -346,9 +343,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					int object_flag = kernel_tex_fetch(__object_flag, object);
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
 #  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+						isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
 #  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+						isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
 #  endif
 
 						qbvh_near_far_idx_calc(idir,
@@ -367,7 +364,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-						triangle_intersect_precalc(dir, &isect_precalc);
 						num_hits_in_instance = 0;
 						isect_array->t = isect_t;
 
@@ -406,11 +402,10 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 				}
 			}
 			else {
-				float ignore_t = FLT_MAX;
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
 #  endif
 			}
 
@@ -433,8 +428,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 			object = OBJECT_NONE;
 			node_addr = traversal_stack[stack_ptr].addr;
 			--stack_ptr;
diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h
index b7abc1ec507..b903aeb8073 100644
--- a/intern/cycles/kernel/closure/alloc.h
+++ b/intern/cycles/kernel/closure/alloc.h
@@ -20,17 +20,16 @@ ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType ty
 {
 	kernel_assert(size <= sizeof(ShaderClosure));
 
-	int num_closure = ccl_fetch(sd, num_closure);
-	int num_closure_extra = ccl_fetch(sd, num_closure_extra);
-	if(num_closure + num_closure_extra >= MAX_CLOSURE)
+	if(sd->num_closure_left == 0)
 		return NULL;
 
-	ShaderClosure *sc = &ccl_fetch(sd, closure)[num_closure];
+	ShaderClosure *sc = &sd->closure[sd->num_closure];
 
 	sc->type = type;
 	sc->weight = weight;
 
-	ccl_fetch(sd, num_closure)++;
+	sd->num_closure++;
+	sd->num_closure_left--;
 
 	return sc;
 }
@@ -44,25 +43,23 @@ ccl_device ccl_addr_space void *closure_alloc_extra(ShaderData *sd, int size)
 	 * This lets us keep the same fast array iteration over closures, as we
 	 * found linked list iteration and iteration with skipping to be slower. */
 	int num_extra = ((size + sizeof(ShaderClosure) - 1) / sizeof(ShaderClosure));
-	int num_closure = ccl_fetch(sd, num_closure);
-	int num_closure_extra = ccl_fetch(sd, num_closure_extra) + num_extra;
 
-	if(num_closure + num_closure_extra > MAX_CLOSURE) {
-		/* Remove previous closure. */
-		ccl_fetch(sd, num_closure)--;
-		ccl_fetch(sd, num_closure_extra)++;
+	if(num_extra > sd->num_closure_left) {
+		/* Remove previous closure if it was allocated. */
+		sd->num_closure--;
+		sd->num_closure_left++;
 		return NULL;
 	}
 
-	ccl_fetch(sd, num_closure_extra) = num_closure_extra;
-	return (ccl_addr_space void*)(ccl_fetch(sd, closure) + MAX_CLOSURE - num_closure_extra);
+	sd->num_closure_left -= num_extra;
+	return (ccl_addr_space void*)(sd->closure + sd->num_closure + sd->num_closure_left);
 }
 
 ccl_device_inline ShaderClosure *bsdf_alloc(ShaderData *sd, int size, float3 weight)
 {
 	ShaderClosure *sc = closure_alloc(sd, size, CLOSURE_NONE_ID, weight);
 
-	if(!sc)
+	if(sc == NULL)
 		return NULL;
 
 	float sample_weight = fabsf(average(weight));
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 7e4d5fe2e37..d8ff69ca241 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -14,126 +14,178 @@
  * limitations under the License.
  */
 
-#include "../closure/bsdf_ashikhmin_velvet.h"
-#include "../closure/bsdf_diffuse.h"
-#include "../closure/bsdf_oren_nayar.h"
-#include "../closure/bsdf_phong_ramp.h"
-#include "../closure/bsdf_diffuse_ramp.h"
-#include "../closure/bsdf_microfacet.h"
-#include "../closure/bsdf_microfacet_multi.h"
-#include "../closure/bsdf_reflection.h"
-#include "../closure/bsdf_refraction.h"
-#include "../closure/bsdf_transparent.h"
-#include "../closure/bsdf_ashikhmin_shirley.h"
-#include "../closure/bsdf_toon.h"
-#include "../closure/bsdf_hair.h"
-#ifdef __SUBSURFACE__
-#  include "../closure/bssrdf.h"
-#endif
-#ifdef __VOLUME__
-#  include "../closure/volume.h"
-#endif
+#include "kernel/closure/bsdf_ashikhmin_velvet.h"
+#include "kernel/closure/bsdf_diffuse.h"
+#include "kernel/closure/bsdf_oren_nayar.h"
+#include "kernel/closure/bsdf_phong_ramp.h"
+#include "kernel/closure/bsdf_diffuse_ramp.h"
+#include "kernel/closure/bsdf_microfacet.h"
+#include "kernel/closure/bsdf_microfacet_multi.h"
+#include "kernel/closure/bsdf_reflection.h"
+#include "kernel/closure/bsdf_refraction.h"
+#include "kernel/closure/bsdf_transparent.h"
+#include "kernel/closure/bsdf_ashikhmin_shirley.h"
+#include "kernel/closure/bsdf_toon.h"
+#include "kernel/closure/bsdf_hair.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bsdf_principled_sheen.h"
+#include "kernel/closure/bssrdf.h"
+#include "kernel/closure/volume.h"
 
 CCL_NAMESPACE_BEGIN
 
+/* Squared roughness of a closure's specular lobe: 0 for singular closures,
+ * alpha_x*alpha_y for microfacet closures, and 1 for everything else. */
+ccl_device_inline float bsdf_get_specular_roughness_squared(const ShaderClosure *sc)
+{
+	if(CLOSURE_IS_BSDF_SINGULAR(sc->type)) {
+		return 0.0f;
+	}
+
+	if(CLOSURE_IS_BSDF_MICROFACET(sc->type)) {
+		const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
+		return bsdf->alpha_x*bsdf->alpha_y;
+	}
+
+	return 1.0f;
+}
+
+ccl_device_inline float bsdf_get_roughness_squared(const ShaderClosure *sc)
+{
+	/* Variant that also handles diffuse closures. Mainly for baking the
+	 * Principled BSDF: with specular and metallic at zero, the specified
+	 * roughness parameter would otherwise not be baked. */
+	if(sc->type == CLOSURE_BSDF_OREN_NAYAR_ID) {
+		const OrenNayarBsdf *bsdf = (const OrenNayarBsdf*)sc;
+		return sqr(sqr(bsdf->roughness));
+	}
+
+	if(sc->type == CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID) {
+		const PrincipledDiffuseBsdf *bsdf = (const PrincipledDiffuseBsdf*)sc;
+		return sqr(sqr(bsdf->roughness));
+	}
+
+	if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
+		return 0.0f;
+	}
+
+	return bsdf_get_specular_roughness_squared(sc);
+}
+
 ccl_device_forceinline int bsdf_sample(KernelGlobals *kg,
-                                  ShaderData *sd,
-                                  const ShaderClosure *sc,
-                                  float randu,
-                                  float randv,
-                                  float3 *eval,
-                                  float3 *omega_in,
-                                  differential3 *domega_in,
-                                  float *pdf)
+                                       ShaderData *sd,
+                                       const ShaderClosure *sc,
+                                       float randu,
+                                       float randv,
+                                       float3 *eval,
+                                       float3 *omega_in,
+                                       differential3 *domega_in,
+                                       float *pdf)
 {
 	int label;
 
 	switch(sc->type) {
 		case CLOSURE_BSDF_DIFFUSE_ID:
 		case CLOSURE_BSDF_BSSRDF_ID:
-			label = bsdf_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #ifdef __SVM__
 		case CLOSURE_BSDF_OREN_NAYAR_ID:
-			label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_oren_nayar_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #ifdef __OSL__
 		case CLOSURE_BSDF_PHONG_RAMP_ID:
-			label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_phong_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-			label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_diffuse_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 		case CLOSURE_BSDF_TRANSLUCENT_ID:
-			label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_translucent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_REFLECTION_ID:
-			label = bsdf_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_REFRACTION_ID:
-			label = bsdf_refraction_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_refraction_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_TRANSPARENT_ID:
-			label = bsdf_transparent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_transparent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_GGX_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-			label = bsdf_microfacet_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
-			label = bsdf_microfacet_multi_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
-			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state));
+		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
+			label = bsdf_microfacet_multi_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state);
 			break;
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
-			label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
-			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state));
+		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
+			label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state);
 			break;
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-			label = bsdf_microfacet_beckmann_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-			label = bsdf_ashikhmin_shirley_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-			label = bsdf_ashikhmin_velvet_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-			label = bsdf_diffuse_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_diffuse_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_GLOSSY_TOON_ID:
-			label = bsdf_glossy_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-			label = bsdf_hair_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-			label = bsdf_hair_transmission_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+			break;
+#ifdef __PRINCIPLED__
+		case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
+		case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
+			label = bsdf_principled_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+			break;
+		case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
+			label = bsdf_principled_sheen_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
+#endif  /* __PRINCIPLED__ */
 #endif
 #ifdef __VOLUME__
 		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-			label = volume_henyey_greenstein_sample(sc, ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+			label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 		default:
@@ -141,6 +193,17 @@ ccl_device_forceinline int bsdf_sample(KernelGlobals *kg,
 			break;
 	}
 
+	/* Test if BSDF sample should be treated as transparent for background. */
+	if(label & LABEL_TRANSMIT) {
+		float threshold_squared = kernel_data.background.transparent_roughness_squared_threshold;
+
+		if(threshold_squared >= 0.0f) {
+			if(bsdf_get_specular_roughness_squared(sc) <= threshold_squared) {
+				label |= LABEL_TRANSMIT_TRANSPARENT;
+			}
+		}
+	}
+
 	return label;
 }
 
@@ -157,75 +220,89 @@ float3 bsdf_eval(KernelGlobals *kg,
 {
 	float3 eval;
 
-	if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) {
+	if(dot(sd->Ng, omega_in) >= 0.0f) {
 		switch(sc->type) {
 			case CLOSURE_BSDF_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_ID:
-				eval = bsdf_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 #ifdef __SVM__
 			case CLOSURE_BSDF_OREN_NAYAR_ID:
-				eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 #ifdef __OSL__
 			case CLOSURE_BSDF_PHONG_RAMP_ID:
-				eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_phong_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-				eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 #endif
 			case CLOSURE_BSDF_TRANSLUCENT_ID:
-				eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_translucent_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFLECTION_ID:
-				eval = bsdf_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFRACTION_ID:
-				eval = bsdf_refraction_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_refraction_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSPARENT_ID:
-				eval = bsdf_transparent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
+			case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+			case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-				eval = bsdf_microfacet_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
-				eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
+				eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state);
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
-				eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
+				eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state);
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-				eval = bsdf_microfacet_beckmann_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-				eval = bsdf_diffuse_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_toon_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
-				eval = bsdf_glossy_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-				eval = bsdf_hair_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-				eval = bsdf_hair_transmission_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf);
+				break;
+#ifdef __PRINCIPLED__
+			case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
+			case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
+				eval = bsdf_principled_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
+			case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
+				eval = bsdf_principled_sheen_eval_reflect(sc, sd->I, omega_in, pdf);
+				break;
+#endif  /* __PRINCIPLED__ */
 #endif
 #ifdef __VOLUME__
 			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-				eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
 				break;
 #endif
 			default:
@@ -237,63 +314,77 @@ float3 bsdf_eval(KernelGlobals *kg,
 		switch(sc->type) {
 			case CLOSURE_BSDF_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_ID:
-				eval = bsdf_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 #ifdef __SVM__
 			case CLOSURE_BSDF_OREN_NAYAR_ID:
-				eval = bsdf_oren_nayar_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSLUCENT_ID:
-				eval = bsdf_translucent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_translucent_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFLECTION_ID:
-				eval = bsdf_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFRACTION_ID:
-				eval = bsdf_refraction_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_refraction_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSPARENT_ID:
-				eval = bsdf_transparent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
+			case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+			case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-				eval = bsdf_microfacet_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
-				eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
+				eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state);
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
-				eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
+				eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state);
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-				eval = bsdf_microfacet_beckmann_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-				eval = bsdf_diffuse_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_toon_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
-				eval = bsdf_glossy_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-				eval = bsdf_hair_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-				eval = bsdf_hair_transmission_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf);
+				break;
+#ifdef __PRINCIPLED__
+			case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
+			case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
+				eval = bsdf_principled_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
+			case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
+				eval = bsdf_principled_sheen_eval_transmit(sc, sd->I, omega_in, pdf);
+				break;
+#endif  /* __PRINCIPLED__ */
 #endif
 #ifdef __VOLUME__
 			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-				eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
 				break;
 #endif
 			default:
@@ -311,11 +402,16 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
 #ifdef __SVM__
 	switch(sc->type) {
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
+		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
+		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
 			bsdf_microfacet_multi_ggx_blur(sc, roughness);
 			break;
 		case CLOSURE_BSDF_MICROFACET_GGX_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
 			bsdf_microfacet_ggx_blur(sc, roughness);
 			break;
@@ -349,10 +445,15 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
 		case CLOSURE_BSDF_REFLECTION_ID:
 		case CLOSURE_BSDF_REFRACTION_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
+		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
+		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
@@ -367,6 +468,11 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
 			return bsdf_hair_merge(a, b);
+#ifdef __PRINCIPLED__
+		case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
+		case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
+			return bsdf_principled_diffuse_merge(a, b);
+#endif
 #ifdef __VOLUME__
 		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
 			return volume_henyey_greenstein_merge(a, b);
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index 7e0f5a7ec75..a5ba2cb2972 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -40,7 +40,6 @@ typedef ccl_addr_space struct VelvetBsdf {
 
 	float sigma;
 	float invsigma2;
-	float3 N;
 } VelvetBsdf;
 
 ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf)
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index dcd187f9305..ec6f1f20996 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -37,7 +37,6 @@ CCL_NAMESPACE_BEGIN
 
 typedef ccl_addr_space struct DiffuseBsdf {
 	SHADER_CLOSURE_BASE;
-	float3 N;
 } DiffuseBsdf;
 
 /* DIFFUSE */
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index 2d982a95fe4..24f40af46a3 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
 typedef ccl_addr_space struct DiffuseRampBsdf {
 	SHADER_CLOSURE_BASE;
 
-	float3 N;
 	float3 *colors;
 } DiffuseRampBsdf;
 
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index 4a1316fd2a9..2dd59354058 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -36,7 +36,8 @@
 CCL_NAMESPACE_BEGIN
 
 typedef ccl_addr_space struct MicrofacetExtra {
-	float3 color;
+	float3 color, cspec0;  /* cspec0 is the color handed to interpolate_fresnel_color() by the Fresnel/clearcoat GGX closures */
+	float clearcoat;  /* the clearcoat GGX closure scales its sample weight and eval by 0.25 * clearcoat */
 } MicrofacetExtra;
 
 typedef ccl_addr_space struct MicrofacetBsdf {
@@ -45,7 +46,6 @@ typedef ccl_addr_space struct MicrofacetBsdf {
 	float alpha_x, alpha_y, ior;
 	MicrofacetExtra *extra;
 	float3 T;
-	float3 N;
 } MicrofacetBsdf;
 
 /* Beckmann and GGX microfacet importance sampling. */
@@ -233,6 +233,36 @@ ccl_device_forceinline float3 microfacet_sample_stretched(
 	return normalize(make_float3(-slope_x, -slope_y, 1.0f));
 } 
 
+/* Reflection color (Fresnel tint) of a single-scatter GGX closure.
+ *
+ * For the GGX_FRESNEL, GGX_CLEARCOAT and GGX_ANISO_FRESNEL closure types the
+ * result is interpolate_fresnel_color(L, H, ior, F0, cspec0), with
+ * F0 = fresnel_dielectric_cos(1.0f, bsdf->ior). Every other closure type gets
+ * plain white, i.e. no tint. L and H are only forwarded to the interpolation;
+ * bsdf->extra is dereferenced only on the tinted path. */
+ccl_device_forceinline float3 reflection_color(const MicrofacetBsdf *bsdf, float3 L, float3 H) {
+	float3 F = make_float3(1.0f, 1.0f, 1.0f);
+	bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID
+	                   || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID
+	                   || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID);
+
+	if(use_fresnel) {
+		float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+
+		F = interpolate_fresnel_color(L, H, bsdf->ior, F0, bsdf->extra->cspec0);
+	}
+
+	return F;
+}
+/* GTR1 distribution term; the GGX eval/sample code uses it as D for the clearcoat closure. */
+ccl_device_forceinline float D_GTR1(float NdotH, float alpha)
+{
+	if(alpha >= 1.0f) return M_1_PI_F;  /* alpha2 == 1 would give 0/0 below, since logf(1) == 0 */
+	float alpha2 = alpha*alpha;
+	float t = 1.0f + (alpha2 - 1.0f) * NdotH*NdotH;
+	return (alpha2 - 1.0f) / (M_PI_F * logf(alpha2) * t);
+}
+
 /* GGX microfacet with Smith shadow-masking from:
  *
  * Microfacet Models for Refraction through Rough Surfaces
@@ -248,14 +278,52 @@ ccl_device_forceinline float3 microfacet_sample_stretched(
 
 ccl_device int bsdf_microfacet_ggx_setup(MicrofacetBsdf *bsdf)
 {
+	bsdf->extra = NULL;
+
 	bsdf->alpha_x = saturate(bsdf->alpha_x);
 	bsdf->alpha_y = bsdf->alpha_x;
-	
+
 	bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ID;
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
+ccl_device int bsdf_microfacet_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+	bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);  /* assumes the caller has allocated bsdf->extra — TODO confirm */
+	bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+	bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
+	/* Isotropic GGX with Fresnel tint: scale the sample weight by average() of the Fresnel color, with bsdf->N standing in for the half vector. */
+	float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+	float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+	bsdf->sample_weight *= F;
+	/* Single roughness: alpha_y copies the saturated alpha_x. */
+	bsdf->alpha_x = saturate(bsdf->alpha_x);
+	bsdf->alpha_y = bsdf->alpha_x;
+
+	bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID;
+
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
+}
+/* Setup for CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID; returns SD_BSDF|SD_BSDF_HAS_EVAL. */
+ccl_device int bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+	bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);  /* assumes the caller has allocated bsdf->extra — TODO confirm */
+	bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+	bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
+	/* Fresnel-based sample weight as in the Fresnel GGX setup, additionally scaled by 0.25 * clearcoat. */
+	float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+	float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+	bsdf->sample_weight *= 0.25f * bsdf->extra->clearcoat * F;
+	/* Single roughness: alpha_y copies the saturated alpha_x. */
+	bsdf->alpha_x = saturate(bsdf->alpha_x);
+	bsdf->alpha_y = bsdf->alpha_x;
+
+	bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID;
+
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
+}
+
 ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosure *b)
 {
 	const MicrofacetBsdf *bsdf_a = (const MicrofacetBsdf*)a;
@@ -266,23 +334,47 @@ ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosur
 	       (bsdf_a->alpha_y == bsdf_b->alpha_y) &&
 	       (isequal_float3(bsdf_a->T, bsdf_b->T)) &&
 	       (bsdf_a->ior == bsdf_b->ior) &&
-	       ((!bsdf_a->extra && !bsdf_b->extra) ||
+	       ((bsdf_a->extra == NULL && bsdf_b->extra == NULL) ||
 	        ((bsdf_a->extra && bsdf_b->extra) &&
-	         (isequal_float3(bsdf_a->extra->color, bsdf_b->extra->color))));
+	         (isequal_float3(bsdf_a->extra->color, bsdf_b->extra->color)) &&
+	         (isequal_float3(bsdf_a->extra->cspec0, bsdf_b->extra->cspec0)) &&
+	         (bsdf_a->extra->clearcoat == bsdf_b->extra->clearcoat)));
 }
 
 ccl_device int bsdf_microfacet_ggx_aniso_setup(MicrofacetBsdf *bsdf)
 {
+	bsdf->extra = NULL;
+
 	bsdf->alpha_x = saturate(bsdf->alpha_x);
 	bsdf->alpha_y = saturate(bsdf->alpha_y);
-	
+
 	bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID;
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
+ccl_device int bsdf_microfacet_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+	bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);  /* assumes the caller has allocated bsdf->extra — TODO confirm */
+	bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+	bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
+	/* Anisotropic GGX with Fresnel tint: scale the sample weight by average() of the Fresnel color, with bsdf->N standing in for the half vector. */
+	float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+	float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+	bsdf->sample_weight *= F;
+	/* Anisotropic: alpha_x and alpha_y are saturated independently. */
+	bsdf->alpha_x = saturate(bsdf->alpha_x);
+	bsdf->alpha_y = saturate(bsdf->alpha_y);
+
+	bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID;
+
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
+}
+
 ccl_device int bsdf_microfacet_ggx_refraction_setup(MicrofacetBsdf *bsdf)
 {
+	bsdf->extra = NULL;
+
 	bsdf->alpha_x = saturate(bsdf->alpha_x);
 	bsdf->alpha_y = bsdf->alpha_x;
 
@@ -327,7 +419,18 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
 			float cosThetaM2 = cosThetaM * cosThetaM;
 			float cosThetaM4 = cosThetaM2 * cosThetaM2;
 			float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2;
-			D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
+
+			if(bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) {
+				/* use GTR1 for clearcoat */
+				D = D_GTR1(cosThetaM, bsdf->alpha_x);
+
+				/* the alpha value for clearcoat is a fixed 0.25 => alpha2 = 0.25 * 0.25 */
+				alpha2 = 0.0625f;
+			}
+			else {
+				/* use GTR2 otherwise */
+				D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
+			}
 
 			/* eq. 34: now calculate G1(i,m) and G1(o,m) */
 			G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
@@ -374,7 +477,13 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
 
 		/* eq. 20 */
 		float common = D * 0.25f / cosNO;
-		float out = G * common;
+
+		float3 F = reflection_color(bsdf, omega_in, m);
+		if(bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) {
+			F *= 0.25f * bsdf->extra->clearcoat;
+		}
+
+		float3 out = F * G * common;
 
 		/* eq. 2 in distribution of visible normals sampling
 		 * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */
@@ -384,7 +493,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
 		 * pdf = pm * 0.25 / dot(m, I); */
 		*pdf = G1o * common;
 
-		return make_float3(out, out, out);
+		return out;
 	}
 
 	return make_float3(0.0f, 0.0f, 0.0f);
@@ -489,6 +598,16 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 						/* some high number for MIS */
 						*pdf = 1e6f;
 						*eval = make_float3(1e6f, 1e6f, 1e6f);
+
+						bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID
+						                   || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID
+						                   || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID);
+
+						/* if fresnel is used, calculate the color with reflection_color(...) */
+						if(use_fresnel) {
+							*eval *= reflection_color(bsdf, *omega_in, m);
+						}
+
 						label = LABEL_REFLECT | LABEL_SINGULAR;
 					}
 					else {
@@ -502,13 +621,27 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 							float cosThetaM2 = cosThetaM * cosThetaM;
 							float cosThetaM4 = cosThetaM2 * cosThetaM2;
 							float tanThetaM2 = 1/(cosThetaM2) - 1;
-							D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
 
 							/* eval BRDF*cosNI */
 							float cosNI = dot(N, *omega_in);
 
+							if(bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) {
+								/* use GTR1 for clearcoat */
+								D = D_GTR1(cosThetaM, bsdf->alpha_x);
+
+								/* the alpha value for clearcoat is a fixed 0.25 => alpha2 = 0.25 * 0.25 */
+								alpha2 = 0.0625f;
+
+								/* recalculate G1o */
+								G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
+							}
+							else {
+								/* use GTR2 otherwise */
+								D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
+							}
+
 							/* eq. 34: now calculate G1(i,m) */
-							G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); 
+							G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI)));
 						}
 						else {
 							/* anisotropic distribution */
@@ -538,10 +671,15 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 
 						/* see eval function for derivation */
 						float common = (G1o * D) * 0.25f / cosNO;
-						float out = G1i * common;
 						*pdf = common;
 
-						*eval = make_float3(out, out, out);
+						float3 F = reflection_color(bsdf, *omega_in, m);
+
+						*eval = G1i * common * F;
+					}
+
+					if(bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) {
+						*eval *= 0.25f * bsdf->extra->clearcoat;
 					}
 
 #ifdef __RAY_DIFFERENTIALS__
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index cea59adfebe..2f2c35d5d1f 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -40,20 +40,20 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
 }
 
 /* Sample slope distribution (based on page 14 of the supplemental implementation). */
-ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
+ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float randx, const float randy)
 {
-	if(cosI > 0.9999f || cosI < 1e-6f) {
-		const float r = sqrtf(randU.x / (1.0f - randU.x));
-		const float phi = M_2PI_F * randU.y;
+	if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) {
+		const float r = sqrtf(randx / max(1.0f - randx, 1e-7f));
+		const float phi = M_2PI_F * randy;
 		return make_float2(r*cosf(phi), r*sinf(phi));
 	}
 
-	const float sinI = sqrtf(1.0f - cosI*cosI);
+	const float sinI = safe_sqrtf(1.0f - cosI*cosI);
 	const float tanI = sinI/cosI;
 	const float projA = 0.5f * (cosI + 1.0f);
 	if(projA < 0.0001f)
 		return make_float2(0.0f, 0.0f);
-	const float A = 2.0f*randU.x*projA / cosI - 1.0f;
+	const float A = 2.0f*randx*projA / cosI - 1.0f;
 	float tmp = A*A-1.0f;
 	if(fabsf(tmp) < 1e-7f)
 		return make_float2(0.0f, 0.0f);
@@ -64,26 +64,26 @@ ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 ran
 	const float slopeX = (A < 0.0f || slopeX2 > 1.0f/tanI)? (tanI*tmp - D) : slopeX2;
 
 	float U2;
-	if(randU.y >= 0.5f)
-		U2 = 2.0f*(randU.y - 0.5f);
+	if(randy >= 0.5f)
+		U2 = 2.0f*(randy - 0.5f);
 	else
-		U2 = 2.0f*(0.5f - randU.y);
+		U2 = 2.0f*(0.5f - randy);
 	const float z = (U2*(U2*(U2*0.27385f-0.73369f)+0.46341f)) / (U2*(U2*(U2*0.093073f+0.309420f)-1.0f)+0.597999f);
 	const float slopeY = z * sqrtf(1.0f + slopeX*slopeX);
 
-	if(randU.y >= 0.5f)
+	if(randy >= 0.5f)
 		return make_float2(slopeX, slopeY);
 	else
 		return make_float2(slopeX, -slopeY);
 }
 
 /* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). */
-ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU)
+ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float randx, const float randy)
 {
 	const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z));
-	const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU);
+	const float2 slope_11 = mf_sampleP22_11(wi_11.z, randx, randy);
 
-	const float2 cossin_phi = normalize(make_float2(wi_11.x, wi_11.y));
+	const float3 cossin_phi = safe_normalize(make_float3(wi_11.x, wi_11.y, 0.0f));
 	const float slope_x = alpha.x*(cossin_phi.x * slope_11.x - cossin_phi.y * slope_11.y);
 	const float slope_y = alpha.y*(cossin_phi.y * slope_11.x + cossin_phi.x * slope_11.y);
 
@@ -91,18 +91,15 @@ ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha
 	return normalize(make_float3(-slope_x, -slope_y, 1.0f));
 }
 
-/* === Phase functions: Glossy, Diffuse and Glass === */
+/* === Phase functions: Glossy and Glass === */
 
-/* Phase function for reflective materials, either without a fresnel term (for compatibility) or with the conductive fresnel term. */
-ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm)
+/* Phase function for reflective materials. */
+ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *weight, const float3 wm)
 {
-	if(n && k)
-		*weight *= fresnel_conductor(dot(wi, wm), *n, *k);
-
 	return -wi + 2.0f * wm * dot(wi, wm);
 }
 
-ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k)
+ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha)
 {
 	if(w.z > 0.9999f)
 		return make_float3(0.0f, 0.0f, 0.0f);
@@ -123,30 +120,9 @@ ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float l
 	else
 		phase *= D_ggx_aniso(wh, alpha);
 
-	if(n && k) {
-		/* Apply conductive fresnel term. */
-		return phase * fresnel_conductor(dotW_WH, *n, *k);
-	}
-
 	return make_float3(phase, phase, phase);
 }
 
-/* Phase function for rough lambertian diffuse surfaces. */
-ccl_device_forceinline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv)
-{
-	float3 tm, bm;
-	make_orthonormals(wm, &tm, &bm);
-
-	float2 disk = concentric_sample_disk(randu, randv);
-	return disk.x*tm + disk.y*bm + safe_sqrtf(1.0f - disk.x*disk.x - disk.y*disk.y)*wm;
-}
-
-ccl_device_forceinline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm)
-{
-	const float v = max(0.0f, dot(w, wm)) * M_1_PI_F;
-	return make_float3(v, v, v);
-}
-
 /* Phase function for dielectric transmissive materials, including both reflection and refraction according to the dielectric fresnel term. */
 ccl_device_forceinline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside)
 {
@@ -269,40 +245,69 @@ ccl_device_forceinline float mf_ggx_albedo(float r)
 	return saturate(albedo);
 }
 
+ccl_device_inline float mf_ggx_transmission_albedo(float a, float ior)
+{
+	if(ior < 1.0f) {  /* work with the reciprocal so that ior >= 1 below */
+		ior = 1.0f/ior;
+	}
+	a = saturate(a);
+	ior = clamp(ior, 1.0f, 3.0f);  /* the expressions below are only ever evaluated for ior in [1, 3] */
+	float I_1 = 0.0476898f*expf(-0.978352f*(ior-0.65657f)*(ior-0.65657f)) - 0.033756f*ior + 0.993261f;
+	float R_1 = (((0.116991f*a - 0.270369f)*a + 0.0501366f)*a - 0.00411511f)*a + 1.00008f;
+	float I_2 = (((-2.08704f*ior + 26.3298f)*ior - 127.906f)*ior + 292.958f)*ior - 287.946f + 199.803f/(ior*ior) - 101.668f/(ior*ior*ior);
+	float R_2 = ((((5.3725f*a -24.9307f)*a + 22.7437f)*a - 3.40751f)*a + 0.0986325f)*a + 0.00493504f;
+	/* NOTE(review): the coefficients look like a numerical fit in (a, ior); their origin is not visible here. mf_glass_pdf() uses the result as its refraction blend weight. */
+	return saturate(1.0f + I_2*R_2*0.0019127f - (1.0f - I_1)*(1.0f - R_1)*9.3205f);
+}
+
 ccl_device_forceinline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha)
 {
 	float D = D_ggx(normalize(wi+wo), alpha);
 	float lambda = mf_lambda(wi, make_float2(alpha, alpha));
+	float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f);  /* denominator floored at 1e-7 to avoid dividing by zero */
+	/* Second lobe is wo.z / pi; the two lobes are blended by mf_ggx_albedo(alpha) in the return below. */
+	float multiscatter = wo.z * M_1_PI_F;
+
 	float albedo = mf_ggx_albedo(alpha);
-	return 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f) + (1.0f - albedo) * wo.z;
+	return albedo*singlescatter + (1.0f - albedo)*multiscatter;
 }
 
 ccl_device_forceinline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha)
 {
-	return 0.25f * D_ggx_aniso(normalize(wi+wo), alpha) / ((1.0f + mf_lambda(wi, alpha)) * wi.z) + (1.0f - mf_ggx_albedo(sqrtf(alpha.x*alpha.y))) * wo.z;
-}
+	float D = D_ggx_aniso(normalize(wi+wo), alpha);
+	float lambda = mf_lambda(wi, alpha);
+	float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f);  /* denominator floored at 1e-7 to avoid dividing by zero */
 
-ccl_device_forceinline float mf_diffuse_pdf(const float3 wo)
-{
-	return M_1_PI_F * wo.z;
+	float multiscatter = wo.z * M_1_PI_F;
+	/* The albedo lookup takes a single roughness, so it is given sqrtf(alpha.x*alpha.y). */
+	float albedo = mf_ggx_albedo(sqrtf(alpha.x*alpha.y));
+	return albedo*singlescatter + (1.0f - albedo)*multiscatter;
 }
 
 ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta)
 {
-	float3 wh;
-	float fresnel;
-	if(wi.z*wo.z > 0.0f) {
-		wh = normalize(wi + wo);
-		fresnel = fresnel_dielectric_cos(dot(wi, wh), eta);
-	}
-	else {
-		wh = normalize(wi + wo*eta);
-		fresnel = 1.0f - fresnel_dielectric_cos(dot(wi, wh), eta);
-	}
+	bool reflective = (wi.z*wo.z > 0.0f);  /* wi and wo have the same sign of z */
+	/* Half vector from wi + wo (reflection) or wi + wo*eta (refraction); its pre-normalization length is kept for the refraction term. */
+	float wh_len;
+	float3 wh = normalize_len(wi + (reflective? wo : (wo*eta)), &wh_len);
 	if(wh.z < 0.0f)
 		wh = -wh;
 	float3 r_wi = (wi.z < 0.0f)? -wi: wi;
-	return fresnel * max(0.0f, dot(r_wi, wh)) * D_ggx(wh, alpha) / ((1.0f + mf_lambda(r_wi, make_float2(alpha, alpha))) * r_wi.z) + fabsf(wo.z);
+	float lambda = mf_lambda(r_wi, make_float2(alpha, alpha));
+	float D = D_ggx(wh, alpha);
+	float fresnel = fresnel_dielectric_cos(dot(r_wi, wh), eta);
+	/* Both branches blend a single-scatter term with |wo.z| / pi; reflection is weighted by fresnel, refraction by 1 - fresnel. */
+	float multiscatter = fabsf(wo.z * M_1_PI_F);
+	if(reflective) {
+		float singlescatter = 0.25f * D / max((1.0f + lambda) * r_wi.z, 1e-7f);
+		float albedo = mf_ggx_albedo(alpha);
+		return fresnel * (albedo*singlescatter + (1.0f - albedo)*multiscatter);
+	}
+	else {
+		float singlescatter = fabsf(dot(r_wi, wh)*dot(wo, wh) * D * eta*eta / max((1.0f + lambda) * r_wi.z * wh_len*wh_len, 1e-7f));
+		float albedo = mf_ggx_transmission_albedo(alpha, eta);
+		return (1.0f - fresnel) * (albedo*singlescatter + (1.0f - albedo)*multiscatter);
+	}
 }
 
 /* === Actual random walk implementations, one version of mf_eval and mf_sample per phase function. === */
@@ -313,18 +318,11 @@ ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, cons
 
 #define MF_PHASE_FUNCTION glass
 #define MF_MULTI_GLASS
-#include "bsdf_microfacet_multi_impl.h"
-
-/* The diffuse phase function is not implemented as a node yet. */
-#if 0
-#define MF_PHASE_FUNCTION diffuse
-#define MF_MULTI_DIFFUSE
-#include "bsdf_microfacet_multi_impl.h"
-#endif
+#include "kernel/closure/bsdf_microfacet_multi_impl.h"
 
 #define MF_PHASE_FUNCTION glossy
 #define MF_MULTI_GLOSSY
-#include "bsdf_microfacet_multi_impl.h"
+#include "kernel/closure/bsdf_microfacet_multi_impl.h"
 
 ccl_device void bsdf_microfacet_multi_ggx_blur(ShaderClosure *sc, float roughness)
 {
@@ -345,8 +343,9 @@ ccl_device int bsdf_microfacet_multi_ggx_common_setup(MicrofacetBsdf *bsdf)
 	bsdf->extra->color.x = saturate(bsdf->extra->color.x);
 	bsdf->extra->color.y = saturate(bsdf->extra->color.y);
 	bsdf->extra->color.z = saturate(bsdf->extra->color.z);
-
-	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
+	bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
+	bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+	bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG;
 }
@@ -356,6 +355,22 @@ ccl_device int bsdf_microfacet_multi_ggx_aniso_setup(MicrofacetBsdf *bsdf)
 	if(is_zero(bsdf->T))
 		bsdf->T = make_float3(1.0f, 0.0f, 0.0f);
 
+	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
+
+	return bsdf_microfacet_multi_ggx_common_setup(bsdf);
+}
+
+ccl_device int bsdf_microfacet_multi_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+	if(is_zero(bsdf->T))  /* fall back to a fixed tangent when T is zero */
+		bsdf->T = make_float3(1.0f, 0.0f, 0.0f);
+	/* Multi-scatter GGX with Fresnel tint; alpha_x/alpha_y are left as given, and the same type id as the isotropic Fresnel setup is used. */
+	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID;
+	/* NOTE(review): cspec0 is read here before common_setup() below saturates it, whereas the single-scatter Fresnel setups saturate first — confirm intended. */
+	float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+	float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+	bsdf->sample_weight *= F;
+
 	return bsdf_microfacet_multi_ggx_common_setup(bsdf);
 }
 
@@ -363,6 +378,30 @@ ccl_device int bsdf_microfacet_multi_ggx_setup(MicrofacetBsdf *bsdf)
 {
 	bsdf->alpha_y = bsdf->alpha_x;
 
+	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
+
+	return bsdf_microfacet_multi_ggx_common_setup(bsdf);
+}
+/* Isotropic multi-scatter GGX with Fresnel tint; returns the flags produced by the common setup. */
+ccl_device int bsdf_microfacet_multi_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+	bsdf->alpha_y = bsdf->alpha_x;
+
+	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID;
+	/* NOTE(review): cspec0 is read here before common_setup() below saturates it, whereas the single-scatter Fresnel setups saturate first — confirm intended. */
+	float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+	float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+	bsdf->sample_weight *= F;
+
+	return bsdf_microfacet_multi_ggx_common_setup(bsdf);
+}
+
+ccl_device int bsdf_microfacet_multi_ggx_refraction_setup(MicrofacetBsdf *bsdf)
+{
+	bsdf->alpha_y = bsdf->alpha_x;
+
+	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
+
 	return bsdf_microfacet_multi_ggx_common_setup(bsdf);
 }
 
@@ -378,6 +417,8 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc
 		return make_float3(0.0f, 0.0f, 0.0f);
 	}
 
+	bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID);
+
 	bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y);
 	float3 X, Y, Z;
 	Z = bsdf->N;
@@ -393,7 +434,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc
 		*pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y));
 	else
 		*pdf = mf_ggx_pdf(localI, localO, bsdf->alpha_x);
-	return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL);
+	return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
 }
 
 ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state)
@@ -407,9 +448,15 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC
 		*omega_in = 2*dot(Z, I)*Z - I;
 		*pdf = 1e6f;
 		*eval = make_float3(1e6f, 1e6f, 1e6f);
+#ifdef __RAY_DIFFERENTIALS__
+		*domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx;
+		*domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy;
+#endif
 		return LABEL_REFLECT|LABEL_SINGULAR;
 	}
 
+	bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID);
+
 	bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y);
 	if(is_aniso)
 		make_orthonormals_tangent(Z, bsdf->T, &X, &Y);
@@ -419,7 +466,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC
 	float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z));
 	float3 localO;
 
-	*eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL);
+	*eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
 	if(is_aniso)
 		*pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y));
 	else
@@ -427,6 +474,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC
 	*eval *= *pdf;
 
 	*omega_in = X*localO.x + Y*localO.y + Z*localO.z;
+
 #ifdef __RAY_DIFFERENTIALS__
 	*domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx;
 	*domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy;
@@ -450,6 +498,27 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_setup(MicrofacetBsdf *bsdf)
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG;
 }
 
+ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+	/* Tag the closure up front; nothing below reads bsdf->type. */
+	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID;
+	/* Sanitize inputs: alpha_y mirrors the clamped alpha_x, ior stays >= 0. */
+	bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f);
+	bsdf->alpha_y = bsdf->alpha_x;
+	bsdf->ior = max(0.0f, bsdf->ior);
+	/* Saturate base color and specular tint (cspec0) channel by channel. */
+	bsdf->extra->color.x = saturate(bsdf->extra->color.x);
+	bsdf->extra->color.y = saturate(bsdf->extra->color.y);
+	bsdf->extra->color.z = saturate(bsdf->extra->color.z);
+	bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
+	bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+	bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
+	/* Scale the sampling weight by the channel-averaged tinted fresnel along sd->I. */
+	const float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+	bsdf->sample_weight *= average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG;
+}
+
 ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) {
 	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
 
@@ -465,7 +534,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClos
 	float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z));
 
 	*pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior);
-	return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior);
+	return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, false, bsdf->extra->color);
 }
 
 ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) {
@@ -475,6 +544,8 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu
 		return make_float3(0.0f, 0.0f, 0.0f);
 	}
 
+	bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID);
+
 	float3 X, Y, Z;
 	Z = bsdf->N;
 	make_orthonormals(Z, &X, &Y);
@@ -483,7 +554,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu
 	float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z));
 
 	*pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior);
-	return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior);
+	return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
 }
 
 ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state)
@@ -525,12 +596,14 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const S
 		}
 	}
 
+	bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID);
+
 	make_orthonormals(Z, &X, &Y);
 
 	float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z));
 	float3 localO;
 
-	*eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior);
+	*eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
 	*pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior);
 	*eval *= *pdf;
 
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
index 8054fa8e849..e73915dbda7 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
@@ -26,19 +26,16 @@
  * the balance heuristic isn't necessarily optimal anymore.
  */
 ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
-        float3 wi,
-        float3 wo,
-        const bool wo_outside,
-        const float3 color,
-        const float alpha_x,
-        const float alpha_y,
-         ccl_addr_space uint *lcg_state
-#ifdef MF_MULTI_GLASS
-        , const float eta
-#elif defined(MF_MULTI_GLOSSY)
-        , float3 *n, float3 *k
-#endif
-)
+	float3 wi,
+	float3 wo,
+	const bool wo_outside,
+	const float3 color,
+	const float alpha_x,
+	const float alpha_y,
+	ccl_addr_space uint *lcg_state,
+	const float eta,
+	bool use_fresnel,
+	const float3 cspec0)
 {
 	/* Evaluating for a shallower incoming direction produces less noise, and the properties of the BSDF guarantee reciprocity. */
 	bool swapped = false;
@@ -71,50 +68,57 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 
 	/* Analytically compute single scattering for lower noise. */
 	float3 eval;
+	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+	const float3 wh = normalize(wi+wo);
 #ifdef MF_MULTI_GLASS
 	eval = mf_eval_phase_glass(-wi, lambda_r, wo, wo_outside, alpha, eta);
 	if(wo_outside)
 		eval *= -lambda_r / (shadowing_lambda - lambda_r);
 	else
 		eval *= -lambda_r * beta(-lambda_r, shadowing_lambda+1.0f);
-#elif defined(MF_MULTI_DIFFUSE)
-	/* Diffuse has no special closed form for the single scattering bounce */
-	eval = make_float3(0.0f, 0.0f, 0.0f);
 #else /* MF_MULTI_GLOSSY */
-	const float3 wh = normalize(wi+wo);
 	const float G2 = 1.0f / (1.0f - (lambda_r + 1.0f) + shadowing_lambda);
 	float val = G2 * 0.25f / wi.z;
 	if(alpha.x == alpha.y)
 		val *= D_ggx(wh, alpha.x);
 	else
 		val *= D_ggx_aniso(wh, alpha);
-	if(n && k) {
-		eval = fresnel_conductor(dot(wh, wi), *n, *k) * val;
-	}
-	else {
-		eval = make_float3(val, val, val);
-	}
+	eval = make_float3(val, val, val);
 #endif
 
+	float F0 = fresnel_dielectric_cos(1.0f, eta);
+	if(use_fresnel) {
+		throughput = interpolate_fresnel_color(wi, wh, eta, F0, cspec0);
+
+		eval *= throughput;
+	}
+
 	float3 wr = -wi;
 	float hr = 1.0f;
 	float C1_r = 1.0f;
 	float G1_r = 0.0f;
 	bool outside = true;
-	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
 
 	for(int order = 0; order < 10; order++) {
-		/* Sample microfacet height and normal */
-		if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state)))
+		/* Sample microfacet height. */
+		float height_rand = lcg_step_float_addrspace(lcg_state);
+		if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand))
 			break;
-		float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state),
-		                                                   lcg_step_float_addrspace(lcg_state)));
-
-#ifdef MF_MULTI_DIFFUSE
-		if(order == 0) {
-			/* Compute single-scattering for diffuse. */
-			const float G2_G1 = -lambda_r / (shadowing_lambda - lambda_r);
-			eval += throughput * G2_G1 * mf_eval_phase_diffuse(wo, wm);
+		/* Sample microfacet normal. */
+		float vndf_rand_y = lcg_step_float_addrspace(lcg_state);
+		float vndf_rand_x = lcg_step_float_addrspace(lcg_state);
+		float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y);
+
+#ifdef MF_MULTI_GLASS
+		if(order == 0 && use_fresnel) {
+			/* Evaluate amount of scattering towards wo on this microfacet. */
+			float3 phase;
+			if(outside)
+				phase = mf_eval_phase_glass(wr, lambda_r, wo, wo_outside, alpha, eta);
+			else
+				phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f / eta);
+
+			eval = throughput * phase * mf_G1(wo_outside ? wo : -wo, mf_C1((outside == wo_outside) ? hr : -hr), shadowing_lambda);
 		}
 #endif
 		if(order > 0) {
@@ -125,10 +129,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 				phase = mf_eval_phase_glass(wr, lambda_r,  wo,  wo_outside, alpha, eta);
 			else
 				phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f/eta);
-#elif defined(MF_MULTI_DIFFUSE)
-			phase = mf_eval_phase_diffuse(wo, wm);
 #else /* MF_MULTI_GLOSSY */
-			phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha, n, k) * throughput;
+			phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha) * throughput;
 #endif
 			eval += throughput * phase * mf_G1(wo_outside? wo: -wo, mf_C1((outside == wo_outside)? hr: -hr), shadowing_lambda);
 		}
@@ -136,23 +138,32 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 			/* Bounce from the microfacet. */
 #ifdef MF_MULTI_GLASS
 			bool next_outside;
-			wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside);
+			float3 wi_prev = -wr;
+			float phase_rand = lcg_step_float_addrspace(lcg_state);
+			wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside);
 			if(!next_outside) {
 				outside = !outside;
 				wr = -wr;
 				hr = -hr;
 			}
-#elif defined(MF_MULTI_DIFFUSE)
-			wr = mf_sample_phase_diffuse(wm,
-			                             lcg_step_float_addrspace(lcg_state),
-			                             lcg_step_float_addrspace(lcg_state));
+
+			if(use_fresnel && !next_outside) {
+				throughput *= color;
+			}
+			else if(use_fresnel && order > 0) {
+				throughput *= interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
+			}
 #else /* MF_MULTI_GLOSSY */
-			wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm);
+			if(use_fresnel && order > 0) {
+				throughput *= interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
+			}
+			wr = mf_sample_phase_glossy(-wr, &throughput, wm);
 #endif
 
 			lambda_r = mf_lambda(wr, alpha);
 
-			throughput *= color;
+			if(!use_fresnel)
+				throughput *= color;
 
 			C1_r = mf_C1(hr);
 			G1_r = mf_G1(wr, C1_r, lambda_r);
@@ -168,13 +179,16 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
  * escaped the surface in wo. The function returns the throughput between wi and wo.
  * Without reflection losses due to coloring or fresnel absorption in conductors, the sampling is optimal.
  */
-ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state
-#ifdef MF_MULTI_GLASS
-	, const float eta
-#elif defined(MF_MULTI_GLOSSY)
-	, float3 *n, float3 *k
-#endif
-)
+ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(
+	float3 wi,
+	float3 *wo,
+	const float3 color,
+	const float alpha_x,
+	const float alpha_y,
+	ccl_addr_space uint *lcg_state,
+	const float eta,
+	bool use_fresnel,
+	const float3 cspec0)
 {
 	const float2 alpha = make_float2(alpha_x, alpha_y);
 
@@ -186,37 +200,64 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3
 	float G1_r = 0.0f;
 	bool outside = true;
 
+	float F0 = fresnel_dielectric_cos(1.0f, eta);
+	if(use_fresnel) {
+		throughput = interpolate_fresnel_color(wi, normalize(wi + wr), eta, F0, cspec0);
+	}
+
 	int order;
 	for(order = 0; order < 10; order++) {
 		/* Sample microfacet height. */
-		if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) {
+		float height_rand = lcg_step_float_addrspace(lcg_state);
+		if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) {
 			/* The random walk has left the surface. */
 			*wo = outside? wr: -wr;
 			return throughput;
 		}
 		/* Sample microfacet normal. */
-		float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state),
-		                                                   lcg_step_float_addrspace(lcg_state)));
+		float vndf_rand_y = lcg_step_float_addrspace(lcg_state);
+		float vndf_rand_x = lcg_step_float_addrspace(lcg_state);
+		float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y);
 
 		/* First-bounce color is already accounted for in mix weight. */
-		if(order > 0)
+		if(!use_fresnel && order > 0)
 			throughput *= color;
 
 		/* Bounce from the microfacet. */
 #ifdef MF_MULTI_GLASS
 		bool next_outside;
-		wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside);
+		float3 wi_prev = -wr;
+		float phase_rand = lcg_step_float_addrspace(lcg_state);
+		wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside);
 		if(!next_outside) {
 			hr = -hr;
 			wr = -wr;
 			outside = !outside;
 		}
-#elif defined(MF_MULTI_DIFFUSE)
-		wr = mf_sample_phase_diffuse(wm,
-		                             lcg_step_float_addrspace(lcg_state),
-		                             lcg_step_float_addrspace(lcg_state));
+
+		if(use_fresnel) {
+			if(!next_outside) {
+				throughput *= color;
+			}
+			else {
+				float3 t_color = interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
+
+				if(order == 0)
+					throughput = t_color;
+				else
+					throughput *= t_color;
+			}
+		}
 #else /* MF_MULTI_GLOSSY */
-		wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm);
+		if(use_fresnel) {
+			float3 t_color = interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
+
+			if(order == 0)
+				throughput = t_color;
+			else
+				throughput *= t_color;
+		}
+		wr = mf_sample_phase_glossy(-wr, &throughput, wm);
 #endif
 
 		/* Update random walk parameters. */
@@ -228,6 +269,5 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3
 }
 
 #undef MF_MULTI_GLASS
-#undef MF_MULTI_DIFFUSE
 #undef MF_MULTI_GLOSSY
 #undef MF_PHASE_FUNCTION
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index cb342a026ef..6b770fc0c16 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -22,7 +22,6 @@ CCL_NAMESPACE_BEGIN
 typedef ccl_addr_space struct OrenNayarBsdf {
 	SHADER_CLOSURE_BASE;
 
-	float3 N;
 	float roughness;
 	float a;
 	float b;
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index e152a8780db..420f94755ee 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
 typedef ccl_addr_space struct PhongRampBsdf {
 	SHADER_CLOSURE_BASE;
 
-	float3 N;
 	float exponent;
 	float3 *colors;
 } PhongRampBsdf;
diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
new file mode 100644
index 00000000000..f8ca64293b0
--- /dev/null
+++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BSDF_PRINCIPLED_DIFFUSE_H__
+#define __BSDF_PRINCIPLED_DIFFUSE_H__
+
+/* DISNEY PRINCIPLED DIFFUSE BRDF
+ *
+ * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012)
+ */
+
+CCL_NAMESPACE_BEGIN
+
+typedef ccl_addr_space struct PrincipledDiffuseBsdf {
+	SHADER_CLOSURE_BASE;
+	/* Scales the LdotH^2 contribution to Fd90 in calculate_principled_diffuse_brdf(). */
+	float roughness;
+} PrincipledDiffuseBsdf;
+
+ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf,
+	float3 N, float3 V, float3 L, float3 H, float *pdf)
+{
+	/* Both cosines are clamped to zero, so no back-facing early-out is
+	 * needed: a `< 0` test on the clamped values could never fire. As a
+	 * consequence *pdf is never modified here; the caller's value stands. */
+	const float NdotL = max(dot(N, L), 0.0f);
+	const float NdotV = max(dot(N, V), 0.0f);
+	const float LdotH = dot(L, H);
+
+	/* Schlick weights for the incoming and outgoing directions. */
+	const float FL = schlick_fresnel(NdotL);
+	const float FV = schlick_fresnel(NdotV);
+
+	/* Each factor blends from 1 (weight 0) to Fd90 (weight 1). */
+	const float Fd90 = 0.5f + 2.0f * LdotH*LdotH * bsdf->roughness;
+	const float Fd = (1.0f * (1.0f - FL) + Fd90 * FL) * (1.0f * (1.0f - FV) + Fd90 * FV);
+
+	/* The cosine term NdotL is folded into the returned value. */
+	const float value = M_1_PI_F * NdotL * Fd;
+	return make_float3(value, value, value);
+}
+
+ccl_device int bsdf_principled_diffuse_setup(PrincipledDiffuseBsdf *bsdf)
+{
+	bsdf->type = CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID; /* mark the closure as principled diffuse */
+	return SD_BSDF|SD_BSDF_HAS_EVAL; /* shader-data flags reported to the caller */
+}
+
+ccl_device bool bsdf_principled_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b)
+{
+	/* Reports true only when both the normal and the roughness match. */
+	const PrincipledDiffuseBsdf *lhs = (const PrincipledDiffuseBsdf*)a;
+	const PrincipledDiffuseBsdf *rhs = (const PrincipledDiffuseBsdf*)b;
+	return (isequal_float3(lhs->N, rhs->N) && lhs->roughness == rhs->roughness);
+}
+
+ccl_device float3 bsdf_principled_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I,
+	const float3 omega_in, float *pdf)
+{
+	const PrincipledDiffuseBsdf *bsdf = (const PrincipledDiffuseBsdf *)sc;
+	const float3 N = bsdf->N;
+	const float cos_in = dot(N, omega_in);
+
+	/* Light arriving from below the shading normal contributes nothing. */
+	if(!(cos_in > 0.0f)) {
+		*pdf = 0.0f;
+		return make_float3(0.0f, 0.0f, 0.0f);
+	}
+
+	/* Cosine-weighted pdf; cos_in is strictly positive at this point. */
+	*pdf = cos_in * M_1_PI_F;
+	/* I is the outgoing direction, omega_in the incoming one. */
+	const float3 H = normalize(omega_in + I);
+	return calculate_principled_diffuse_brdf(bsdf, N, I, omega_in, H, pdf);
+}
+
+ccl_device float3 bsdf_principled_diffuse_eval_transmit(const ShaderClosure *sc, const float3 I,
+	const float3 omega_in, float *pdf)
+{
+	return make_float3(0.0f, 0.0f, 0.0f); /* always black; *pdf is left untouched */
+}
+
+ccl_device int bsdf_principled_diffuse_sample(const ShaderClosure *sc,
+	float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv,
+	float3 *eval, float3 *omega_in, float3 *domega_in_dx,
+	float3 *domega_in_dy, float *pdf)
+{
+	const PrincipledDiffuseBsdf *bsdf = (const PrincipledDiffuseBsdf *)sc;
+	const float3 N = bsdf->N;
+
+	/* Pick a direction around the shading normal; omega_in and pdf are the outputs. */
+	sample_cos_hemisphere(N, randu, randv, omega_in, pdf);
+
+	/* Directions not above the geometric normal get a zero pdf; *eval stays unset. */
+	if(!(dot(Ng, *omega_in) > 0)) {
+		*pdf = 0.0f;
+		return LABEL_REFLECT|LABEL_DIFFUSE;
+	}
+
+	const float3 H = normalize(I + *omega_in);
+	*eval = calculate_principled_diffuse_brdf(bsdf, N, I, *omega_in, H, pdf);
+
+#ifdef __RAY_DIFFERENTIALS__
+	// TODO: find a better approximation for the diffuse bounce
+	*domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx);
+	*domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy);
+#endif
+	return LABEL_REFLECT|LABEL_DIFFUSE;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __BSDF_PRINCIPLED_DIFFUSE_H__ */
+
+
diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
new file mode 100644
index 00000000000..f4476bfecd0
--- /dev/null
+++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BSDF_PRINCIPLED_SHEEN_H__
+#define __BSDF_PRINCIPLED_SHEEN_H__
+
+/* DISNEY PRINCIPLED SHEEN BRDF
+ *
+ * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012)
+ */
+
+CCL_NAMESPACE_BEGIN
+
+typedef ccl_addr_space struct PrincipledSheenBsdf {
+	SHADER_CLOSURE_BASE; /* the only members: this closure declares no fields of its own */
+} PrincipledSheenBsdf;
+
+ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf,
+	float3 N, float3 V, float3 L, float3 H, float *pdf)
+{
+	const float cos_light = dot(N, L);
+	const float cos_view = dot(N, V);
+
+	/* Either direction below the normal: clear the pdf and return black. */
+	const bool back_facing = (cos_light < 0 || cos_view < 0);
+	if(back_facing) {
+		*pdf = 0.0f;
+		return make_float3(0.0f, 0.0f, 0.0f);
+	}
+
+	/* Schlick weight at the half vector, scaled by the light cosine. */
+	const float sheen = schlick_fresnel(dot(L, H)) * cos_light;
+	return make_float3(sheen, sheen, sheen);
+}
+
+ccl_device int bsdf_principled_sheen_setup(PrincipledSheenBsdf *bsdf)
+{
+	bsdf->type = CLOSURE_BSDF_PRINCIPLED_SHEEN_ID; /* mark the closure as principled sheen */
+	return SD_BSDF|SD_BSDF_HAS_EVAL; /* shader-data flags reported to the caller */
+}
+
+ccl_device float3 bsdf_principled_sheen_eval_reflect(const ShaderClosure *sc, const float3 I,
+	const float3 omega_in, float *pdf)
+{
+	const PrincipledSheenBsdf *bsdf = (const PrincipledSheenBsdf *)sc;
+	const float3 N = bsdf->N;
+	const float cos_in = dot(N, omega_in);
+
+	/* Light arriving from below the shading normal contributes nothing. */
+	if(!(cos_in > 0.0f)) {
+		*pdf = 0.0f;
+		return make_float3(0.0f, 0.0f, 0.0f);
+	}
+
+	/* Cosine-weighted pdf; the BRDF helper may still reset it to zero. */
+	*pdf = cos_in * M_1_PI_F;
+	/* I is the outgoing direction, omega_in the incoming one. */
+	const float3 H = normalize(omega_in + I);
+	return calculate_principled_sheen_brdf(bsdf, N, I, omega_in, H, pdf);
+}
+
+ccl_device float3 bsdf_principled_sheen_eval_transmit(const ShaderClosure *sc, const float3 I,
+	const float3 omega_in, float *pdf)
+{
+	return make_float3(0.0f, 0.0f, 0.0f); /* always black; *pdf is left untouched */
+}
+
+ccl_device int bsdf_principled_sheen_sample(const ShaderClosure *sc,
+	float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv,
+	float3 *eval, float3 *omega_in, float3 *domega_in_dx,
+	float3 *domega_in_dy, float *pdf)
+{
+	const PrincipledSheenBsdf *bsdf = (const PrincipledSheenBsdf *)sc;
+	const float3 N = bsdf->N;
+
+	/* Pick a direction around the shading normal; omega_in and pdf are the outputs. */
+	sample_cos_hemisphere(N, randu, randv, omega_in, pdf);
+
+	/* Directions not above the geometric normal get a zero pdf; *eval stays unset. */
+	if(!(dot(Ng, *omega_in) > 0)) {
+		*pdf = 0.0f;
+		return LABEL_REFLECT|LABEL_DIFFUSE;
+	}
+
+	const float3 H = normalize(I + *omega_in);
+	*eval = calculate_principled_sheen_brdf(bsdf, N, I, *omega_in, H, pdf);
+
+#ifdef __RAY_DIFFERENTIALS__
+	// TODO: find a better approximation for the diffuse bounce
+	*domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx);
+	*domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy);
+#endif
+	return LABEL_REFLECT|LABEL_DIFFUSE;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __BSDF_PRINCIPLED_SHEEN_H__ */
+
+
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index 28e775bcbc8..d8b6d8ddead 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN
 typedef ccl_addr_space struct ToonBsdf {
 	SHADER_CLOSURE_BASE;
 
-	float3 N;
 	float size;
 	float smooth;
 } ToonBsdf;
diff --git a/intern/cycles/kernel/closure/bsdf_transparent.h b/intern/cycles/kernel/closure/bsdf_transparent.h
index 3c2fd8004df..79ee9dc4537 100644
--- a/intern/cycles/kernel/closure/bsdf_transparent.h
+++ b/intern/cycles/kernel/closure/bsdf_transparent.h
@@ -35,10 +35,39 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device int bsdf_transparent_setup(ShaderClosure *sc)
+ccl_device void bsdf_transparent_setup(ShaderData *sd, const float3 weight, int path_flag)
 {
-	sc->type = CLOSURE_BSDF_TRANSPARENT_ID;
-	return SD_BSDF|SD_TRANSPARENT;
+	if(sd->flag & SD_TRANSPARENT) {
+		sd->closure_transparent_extinction += weight;
+
+		for(int i = 0; i < sd->num_closure; i++) {
+			ShaderClosure *sc = &sd->closure[i];
+
+			if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) {
+				sc->weight += weight;
+				sc->sample_weight += fabsf(average(weight));
+				break;
+			}
+		}
+	}
+	else {
+		sd->flag |= SD_BSDF|SD_TRANSPARENT;
+		sd->closure_transparent_extinction = weight;
+
+		if(path_flag & PATH_RAY_TERMINATE) {
+			/* In this case the number of closures is set to zero to disable
+			 * all others, but we still want to get transparency so increase
+			 * the number just for this. */
+			sd->num_closure_left = 1;
+		}
+
+		ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
+
+		if(bsdf) {
+			bsdf->N = sd->N;
+			bsdf->type = CLOSURE_BSDF_TRANSPARENT_ID;
+		}
+	}
 }
 
 ccl_device float3 bsdf_transparent_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h
index b0c5280b6cb..3dc15d5791c 100644
--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -124,6 +124,13 @@ ccl_device float3 fresnel_conductor(float cosi, const float3 eta, const float3 k
 	return(Rparl2 + Rperp2) * 0.5f;
 }
 
+ccl_device float schlick_fresnel(float u)
+{
+	const float w = clamp(1.0f - u, 0.0f, 1.0f);
+	const float w2 = w * w;
+	return (w2 * w2) * w; /* fifth power of (1 - u) after clamping it to [0, 1] */
+}
+
 ccl_device float smooth_step(float edge0, float edge1, float x)
 {
 	float result;
@@ -136,6 +143,19 @@ ccl_device float smooth_step(float edge0, float edge1, float x)
 	return result;
 }
 
+/* Calculate the fresnel color which is a blend between white and the F0 color (cspec0) */
+ccl_device_forceinline float3 interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, float3 cspec0) {
+	/* Remap the value from fresnel_dielectric_cos(...) from [F0, 1] to [0, 1],
+	 * because cspec0 already carries the F0 color. With F0 >= 1 that range is
+	 * empty and 1 / (1 - F0) would be inf/NaN, so the factor is forced to 0,
+	 * which makes the result fall back to the plain cspec0 color. */
+	float F0_norm = (F0 < 1.0f)? 1.0f / (1.0f - F0): 0.0f;
+	float FH = (fresnel_dielectric_cos(dot(L, H), ior) - F0) * F0_norm;
+
+	/* Blend between white and a specular color with respect to the fresnel */
+	return cspec0 * (1.0f - FH) + make_float3(1.0f, 1.0f, 1.0f) * FH;
+}
+
 CCL_NAMESPACE_END
 
 #endif /* __BSDF_UTIL_H__ */
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index af0bbd861a9..8578767b07e 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -22,12 +22,12 @@ CCL_NAMESPACE_BEGIN
 typedef ccl_addr_space struct Bssrdf {
 	SHADER_CLOSURE_BASE;
 
-	float radius;
+	float3 radius;
+	float3 albedo;
 	float sharpness;
-	float d;
 	float texture_blur;
-	float albedo;
-	float3 N;
+	float roughness;
+	float channels;
 } Bssrdf;
 
 /* Planar Truncated Gaussian
@@ -39,12 +39,11 @@ typedef ccl_addr_space struct Bssrdf {
 /* paper suggests 1/12.46 which is much too small, suspect it's *12.46 */
 #define GAUSS_TRUNCATE 12.46f
 
-ccl_device float bssrdf_gaussian_eval(const ShaderClosure *sc, float r)
+ccl_device float bssrdf_gaussian_eval(const float radius, float r)
 {
 	/* integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) from 0 to Rm
 	 * = 1 - exp(-Rm*Rm/(2*v)) */
-	const Bssrdf *bssrdf = (const Bssrdf*)sc;
-	const float v = bssrdf->radius*bssrdf->radius*(0.25f*0.25f);
+	const float v = radius*radius*(0.25f*0.25f);
 	const float Rm = sqrtf(v*GAUSS_TRUNCATE);
 
 	if(r >= Rm)
@@ -53,20 +52,19 @@ ccl_device float bssrdf_gaussian_eval(const ShaderClosure *sc, float r)
 	return expf(-r*r/(2.0f*v))/(2.0f*M_PI_F*v);
 }
 
-ccl_device float bssrdf_gaussian_pdf(const ShaderClosure *sc, float r)
+ccl_device float bssrdf_gaussian_pdf(const float radius, float r)
 {
 	/* 1.0 - expf(-Rm*Rm/(2*v)) simplified */
 	const float area_truncated = 1.0f - expf(-0.5f*GAUSS_TRUNCATE);
 
-	return bssrdf_gaussian_eval(sc, r) * (1.0f/(area_truncated));
+	return bssrdf_gaussian_eval(radius, r) * (1.0f/(area_truncated));
 }
 
-ccl_device void bssrdf_gaussian_sample(const ShaderClosure *sc, float xi, float *r, float *h)
+ccl_device void bssrdf_gaussian_sample(const float radius, float xi, float *r, float *h)
 {
 	/* xi = integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) = -exp(-r^2/(2*v))
 	 * r = sqrt(-2*v*logf(xi)) */
-	const Bssrdf *bssrdf = (const Bssrdf*)sc;
-	const float v = bssrdf->radius*bssrdf->radius*(0.25f*0.25f);
+	const float v = radius*radius*(0.25f*0.25f);
 	const float Rm = sqrtf(v*GAUSS_TRUNCATE);
 
 	/* 1.0 - expf(-Rm*Rm/(2*v)) simplified */
@@ -87,13 +85,10 @@ ccl_device void bssrdf_gaussian_sample(const ShaderClosure *sc, float xi, float
  * far as I can tell has no closed form solution. So we get an iterative solution
  * instead with newton-raphson. */
 
-ccl_device float bssrdf_cubic_eval(const ShaderClosure *sc, float r)
+ccl_device float bssrdf_cubic_eval(const float radius, const float sharpness, float r)
 {
-	const Bssrdf *bssrdf = (const Bssrdf*)sc;
-	const float sharpness = bssrdf->sharpness;
-
 	if(sharpness == 0.0f) {
-		const float Rm = bssrdf->radius;
+		const float Rm = radius;
 
 		if(r >= Rm)
 			return 0.0f;
@@ -107,7 +102,7 @@ ccl_device float bssrdf_cubic_eval(const ShaderClosure *sc, float r)
 
 	}
 	else {
-		float Rm = bssrdf->radius*(1.0f + sharpness);
+		float Rm = radius*(1.0f + sharpness);
 
 		if(r >= Rm)
 			return 0.0f;
@@ -124,7 +119,7 @@ ccl_device float bssrdf_cubic_eval(const ShaderClosure *sc, float r)
 		else {
 			Rmy = powf(Rm, y);
 			ry = powf(r, y);
-			ryinv = (r > 0.0f)? powf(r, 2.0f*y - 2.0f): 0.0f;
+			ryinv = (r > 0.0f)? powf(r, y - 1.0f): 0.0f;
 		}
 
 		const float Rmy5 = (Rmy*Rmy) * (Rmy*Rmy) * Rmy;
@@ -135,9 +130,9 @@ ccl_device float bssrdf_cubic_eval(const ShaderClosure *sc, float r)
 	}
 }
 
-ccl_device float bssrdf_cubic_pdf(const ShaderClosure *sc, float r)
+ccl_device float bssrdf_cubic_pdf(const float radius, const float sharpness, float r)
 {
-	return bssrdf_cubic_eval(sc, r);
+	return bssrdf_cubic_eval(radius, sharpness, r);
 }
 
 /* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */
@@ -168,11 +163,9 @@ ccl_device_forceinline float bssrdf_cubic_quintic_root_find(float xi)
 	return x;
 }
 
-ccl_device void bssrdf_cubic_sample(const ShaderClosure *sc, float xi, float *r, float *h)
+ccl_device void bssrdf_cubic_sample(const float radius, const float sharpness, float xi, float *r, float *h)
 {
-	const Bssrdf *bssrdf = (const Bssrdf*)sc;
-	const float sharpness = bssrdf->sharpness;
-	float Rm = bssrdf->radius;
+	float Rm = radius;
 	float r_ = bssrdf_cubic_quintic_root_find(xi);
 
 	if(sharpness != 0.0f) {
@@ -207,7 +200,7 @@ ccl_device_inline float bssrdf_burley_fitting(float A)
 /* Scale mean free path length so it gives similar looking result
  * to Cubic and Gaussian models.
  */
-ccl_device_inline float bssrdf_burley_compatible_mfp(float r)
+ccl_device_inline float3 bssrdf_burley_compatible_mfp(float3 r)
 {
 	return 0.25f * M_1_PI_F * r;
 }
@@ -215,19 +208,18 @@ ccl_device_inline float bssrdf_burley_compatible_mfp(float r)
 ccl_device void bssrdf_burley_setup(Bssrdf *bssrdf)
 {
 	/* Mean free path length. */
-	const float l = bssrdf_burley_compatible_mfp(bssrdf->radius);
+	const float3 l = bssrdf_burley_compatible_mfp(bssrdf->radius);
 	/* Surface albedo. */
-	const float A = bssrdf->albedo;
-	const float s = bssrdf_burley_fitting(A);
-	const float d = l / s;
+	const float3 A = bssrdf->albedo;
+	const float3 s = make_float3(bssrdf_burley_fitting(A.x),
+                                 bssrdf_burley_fitting(A.y),
+                                 bssrdf_burley_fitting(A.z));
 
-	bssrdf->d = d;
+	bssrdf->radius = l / s;
 }
 
-ccl_device float bssrdf_burley_eval(const ShaderClosure *sc, float r)
+ccl_device float bssrdf_burley_eval(const float d, float r)
 {
-	const Bssrdf *bssrdf = (const Bssrdf*)sc;
-	const float d = bssrdf->d;
 	const float Rm = BURLEY_TRUNCATE * d;
 
 	if(r >= Rm)
@@ -246,9 +238,9 @@ ccl_device float bssrdf_burley_eval(const ShaderClosure *sc, float r)
 	return (exp_r_d + exp_r_3_d) / (4.0f*d);
 }
 
-ccl_device float bssrdf_burley_pdf(const ShaderClosure *sc, float r)
+ccl_device float bssrdf_burley_pdf(const float d, float r)
 {
-	return bssrdf_burley_eval(sc, r) * (1.0f/BURLEY_TRUNCATE_CDF);
+	return bssrdf_burley_eval(d, r) * (1.0f/BURLEY_TRUNCATE_CDF);
 }
 
 /* Find the radius for desired CDF value.
@@ -291,13 +283,11 @@ ccl_device_forceinline float bssrdf_burley_root_find(float xi)
 	return r;
 }
 
-ccl_device void bssrdf_burley_sample(const ShaderClosure *sc,
+ccl_device void bssrdf_burley_sample(const float d,
                                      float xi,
                                      float *r,
                                      float *h)
 {
-	const Bssrdf *bssrdf = (const Bssrdf*)sc;
-	const float d = bssrdf->d;
 	const float Rm = BURLEY_TRUNCATE * d;
 	const float r_ = bssrdf_burley_root_find(xi * BURLEY_TRUNCATE_CDF) * d;
 
@@ -311,29 +301,26 @@ ccl_device void bssrdf_burley_sample(const ShaderClosure *sc,
  *
  * Samples distributed over disk with no falloff, for reference. */
 
-ccl_device float bssrdf_none_eval(const ShaderClosure *sc, float r)
+ccl_device float bssrdf_none_eval(const float radius, float r)
 {
-	const Bssrdf *bssrdf = (const Bssrdf*)sc;
-	const float Rm = bssrdf->radius;
+	const float Rm = radius;
 	return (r < Rm)? 1.0f: 0.0f;
 }
 
-ccl_device float bssrdf_none_pdf(const ShaderClosure *sc, float r)
+ccl_device float bssrdf_none_pdf(const float radius, float r)
 {
 	/* integrate (2*pi*r)/(pi*Rm*Rm) from 0 to Rm = 1 */
-	const Bssrdf *bssrdf = (const Bssrdf*)sc;
-	const float Rm = bssrdf->radius;
+	const float Rm = radius;
 	const float area = (M_PI_F*Rm*Rm);
 
-	return bssrdf_none_eval(sc, r) / area;
+	return bssrdf_none_eval(radius, r) / area;
 }
 
-ccl_device void bssrdf_none_sample(const ShaderClosure *sc, float xi, float *r, float *h)
+ccl_device void bssrdf_none_sample(const float radius, float xi, float *r, float *h)
 {
 	/* xi = integrate (2*pi*r)/(pi*Rm*Rm) = r^2/Rm^2
 	 * r = sqrt(xi)*Rm */
-	const Bssrdf *bssrdf = (const Bssrdf*)sc;
-	const float Rm = bssrdf->radius;
+	const float Rm = radius;
 	const float r_ = sqrtf(xi)*Rm;
 
 	*r = r_;
@@ -348,55 +335,166 @@ ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight)
 {
 	Bssrdf *bssrdf = (Bssrdf*)closure_alloc(sd, sizeof(Bssrdf), CLOSURE_NONE_ID, weight);
 
-	if(!bssrdf)
+	if(bssrdf == NULL) {
 		return NULL;
+	}
 
 	float sample_weight = fabsf(average(weight));
 	bssrdf->sample_weight = sample_weight;
 	return (sample_weight >= CLOSURE_WEIGHT_CUTOFF) ? bssrdf : NULL;
 }
 
-ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type)
+ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type)
 {
-	if(bssrdf->radius < BSSRDF_MIN_RADIUS) {
-		/* revert to diffuse BSDF if radius too small */
-		DiffuseBsdf *bsdf = (DiffuseBsdf*)bssrdf;
-		bsdf->N = bssrdf->N;
-		int flag = bsdf_diffuse_setup(bsdf);
-		bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
-		return flag;
+	int flag = 0;
+	int bssrdf_channels = 3;
+	float3 diffuse_weight = make_float3(0.0f, 0.0f, 0.0f);
+
+	/* Verify if the radii are large enough to sample without precision issues. */
+	if(bssrdf->radius.x < BSSRDF_MIN_RADIUS) {
+		diffuse_weight.x = bssrdf->weight.x;
+		bssrdf->weight.x = 0.0f;
+		bssrdf->radius.x = 0.0f;
+		bssrdf_channels--;
 	}
-	else {
+	if(bssrdf->radius.y < BSSRDF_MIN_RADIUS) {
+		diffuse_weight.y = bssrdf->weight.y;
+		bssrdf->weight.y = 0.0f;
+		bssrdf->radius.y = 0.0f;
+		bssrdf_channels--;
+	}
+	if(bssrdf->radius.z < BSSRDF_MIN_RADIUS) {
+		diffuse_weight.z = bssrdf->weight.z;
+		bssrdf->weight.z = 0.0f;
+		bssrdf->radius.z = 0.0f;
+		bssrdf_channels--;
+	}
+
+	if(bssrdf_channels < 3) {
+		/* Add diffuse BSDF if any radius too small. */
+#ifdef __PRINCIPLED__
+		if(type == CLOSURE_BSSRDF_PRINCIPLED_ID ||
+		   type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID)
+		{
+			float roughness = bssrdf->roughness;
+			float3 N = bssrdf->N;
+
+			PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diffuse_weight);
+
+			if(bsdf) {
+				bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID;
+				bsdf->N = N;
+				bsdf->roughness = roughness;
+				flag |= bsdf_principled_diffuse_setup(bsdf);
+			}
+		}
+		else
+#endif  /* __PRINCIPLED__ */
+		{
+			DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), diffuse_weight);
+
+			if(bsdf) {
+				bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
+				bsdf->N = bssrdf->N;
+				flag |= bsdf_diffuse_setup(bsdf);
+			}
+		}
+	}
+
+	/* Setup BSSRDF if radius is large enough. */
+	if(bssrdf_channels > 0) {
+		bssrdf->type = type;
+		bssrdf->channels = bssrdf_channels;
+		bssrdf->sample_weight = fabsf(average(bssrdf->weight)) * bssrdf->channels;
 		bssrdf->texture_blur = saturate(bssrdf->texture_blur);
 		bssrdf->sharpness = saturate(bssrdf->sharpness);
-		bssrdf->type = type;
 
-		if(type == CLOSURE_BSSRDF_BURLEY_ID) {
+		if(type == CLOSURE_BSSRDF_BURLEY_ID ||
+		   type == CLOSURE_BSSRDF_PRINCIPLED_ID ||
+		   type == CLOSURE_BSSRDF_RANDOM_WALK_ID ||
+		   type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID)
+		{
 			bssrdf_burley_setup(bssrdf);
 		}
 
-		return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF;
+		flag |= SD_BSSRDF;
+	}
+	else {
+		bssrdf->type = type;
+		bssrdf->sample_weight = 0.0f;
 	}
+
+	return flag;
 }
 
 ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float *h)
 {
-	if(sc->type == CLOSURE_BSSRDF_CUBIC_ID)
-		bssrdf_cubic_sample(sc, xi, r, h);
-	else if(sc->type == CLOSURE_BSSRDF_GAUSSIAN_ID)
-		bssrdf_gaussian_sample(sc, xi, r, h);
-	else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID)*/
-		bssrdf_burley_sample(sc, xi, r, h);
+	const Bssrdf *bssrdf = (const Bssrdf*)sc;
+	float radius;
+
+	/* Pick one enabled color channel with xi, shift xi back to [0, 1) for reuse, then sample r and h
+	 * for that channel's radius. Channels disabled in bssrdf_setup() have radius 0 and are skipped. */
+	xi *= bssrdf->channels;
+
+	if(xi < 1.0f) { /* First enabled channel, in x, y, z order. */
+		radius = (bssrdf->radius.x > 0.0f)? bssrdf->radius.x:
+		         (bssrdf->radius.y > 0.0f)? bssrdf->radius.y:
+		                                    bssrdf->radius.z;
+	}
+	else if(xi < 2.0f) { /* Second enabled channel: y only if both x and y are enabled, otherwise z. */
+		xi -= 1.0f;
+		radius = (bssrdf->radius.x > 0.0f && bssrdf->radius.y > 0.0f)? bssrdf->radius.y:
+		                                                               bssrdf->radius.z;
+	}
+	else { /* Only reachable with all three channels enabled. */
+		xi -= 2.0f;
+		radius = bssrdf->radius.z;
+	}
+
+	/* Sample BSSRDF. */
+	if(bssrdf->type == CLOSURE_BSSRDF_CUBIC_ID) {
+		bssrdf_cubic_sample(radius, bssrdf->sharpness, xi, r, h);
+	}
+	else if(bssrdf->type == CLOSURE_BSSRDF_GAUSSIAN_ID){
+		bssrdf_gaussian_sample(radius, xi, r, h);
+	}
+	else { /*if(bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID || bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/
+		bssrdf_burley_sample(radius, xi, r, h);
+	}
+}
+
+ccl_device float bssrdf_channel_pdf(const Bssrdf *bssrdf, float radius, float r)
+{ /* PDF at distance r for a single color channel's radius, dispatched on bssrdf->type. */
+	if(radius == 0.0f) { /* bssrdf_setup() zeroes the radius of channels too small to sample. */
+		return 0.0f;
+	}
+	else if(bssrdf->type == CLOSURE_BSSRDF_CUBIC_ID) {
+		return bssrdf_cubic_pdf(radius, bssrdf->sharpness, r);
+	}
+	else if(bssrdf->type == CLOSURE_BSSRDF_GAUSSIAN_ID) {
+		return bssrdf_gaussian_pdf(radius, r);
+	}
+	else { /*if(bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID || bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/
+		return bssrdf_burley_pdf(radius, r);
+	}
+}
+
+ccl_device_forceinline float3 bssrdf_eval(const ShaderClosure *sc, float r)
+{
+	const Bssrdf *bssrdf = (const Bssrdf*)sc;
+
+	return make_float3(
+		bssrdf_channel_pdf(bssrdf, bssrdf->radius.x, r),
+		bssrdf_channel_pdf(bssrdf, bssrdf->radius.y, r),
+		bssrdf_channel_pdf(bssrdf, bssrdf->radius.z, r));
 }
 
 ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r)
 {
-	if(sc->type == CLOSURE_BSSRDF_CUBIC_ID)
-		return bssrdf_cubic_pdf(sc, r);
-	else if(sc->type == CLOSURE_BSSRDF_GAUSSIAN_ID)
-		return bssrdf_gaussian_pdf(sc, r);
-	else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID)*/
-		return bssrdf_burley_pdf(sc, r);
+	const Bssrdf *bssrdf = (const Bssrdf*)sc;
+	float3 pdf = bssrdf_eval(sc, r);
+
+	return (pdf.x + pdf.y + pdf.z) / bssrdf->channels;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/closure/emissive.h b/intern/cycles/kernel/closure/emissive.h
index c534df373bd..e709ca9a372 100644
--- a/intern/cycles/kernel/closure/emissive.h
+++ b/intern/cycles/kernel/closure/emissive.h
@@ -32,8 +32,32 @@
 
 CCL_NAMESPACE_BEGIN
 
+/* BACKGROUND CLOSURE */
+
+ccl_device void background_setup(ShaderData *sd, const float3 weight)
+{ /* Adds a background closure weight to sd; SD_EMISSION marks the field as initialized. */
+	if(sd->flag & SD_EMISSION) { /* Field already holds a value: accumulate. */
+		sd->closure_emission_background += weight;
+	}
+	else { /* First contribution: assign instead of add, so the field need not be zeroed first. */
+		sd->flag |= SD_EMISSION;
+		sd->closure_emission_background = weight; /* Same field and flag as emission_setup(). */
+	}
+}
+
 /* EMISSION CLOSURE */
 
+ccl_device void emission_setup(ShaderData *sd, const float3 weight)
+{ /* Adds an emission closure weight to sd; SD_EMISSION marks the field as initialized. */
+	if(sd->flag & SD_EMISSION) { /* Field already holds a value: accumulate. */
+		sd->closure_emission_background += weight;
+	}
+	else { /* First contribution: assign instead of add, so the field need not be zeroed first. */
+		sd->flag |= SD_EMISSION;
+		sd->closure_emission_background = weight; /* Same field and flag as background_setup(). */
+	}
+}
+
 /* return the probability distribution function in the direction I,
  * given the parameters and the light's surface normal.  This MUST match
  * the PDF computed by sample(). */
diff --git a/intern/cycles/kernel/closure/volume.h b/intern/cycles/kernel/closure/volume.h
index 01e67c7c2fd..da791e9aa73 100644
--- a/intern/cycles/kernel/closure/volume.h
+++ b/intern/cycles/kernel/closure/volume.h
@@ -19,14 +19,27 @@
 
 CCL_NAMESPACE_BEGIN
 
+/* VOLUME EXTINCTION */
+
+ccl_device void volume_extinction_setup(ShaderData *sd, float3 weight)
+{ /* Adds an extinction weight to sd; SD_EXTINCTION marks the field as initialized. */
+	if(sd->flag & SD_EXTINCTION) { /* Field already holds a value: accumulate. */
+		sd->closure_transparent_extinction += weight;
+	}
+	else { /* First contribution: assign instead of add, so the field need not be zeroed first. */
+		sd->flag |= SD_EXTINCTION;
+		sd->closure_transparent_extinction = weight;
+	}
+}
+
+/* HENYEY-GREENSTEIN CLOSURE */
+
 typedef ccl_addr_space struct HenyeyGreensteinVolume {
 	SHADER_CLOSURE_BASE;
 
 	float g;
 } HenyeyGreensteinVolume;
 
-/* HENYEY-GREENSTEIN CLOSURE */
-
 /* Given cosine between rays, return probability density that a photon bounces
  * to that direction. The g parameter controls how different it is from the
  * uniform sphere. g=0 uniform diffuse-like, g=1 close to sharp single ray. */
@@ -70,35 +83,45 @@ ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderClosure *sc, c
 	return make_float3(*pdf, *pdf, *pdf);
 }
 
-ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc, float3 I, float3 dIdx, float3 dIdy, float randu, float randv,
-	float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device float3 henyey_greenstrein_sample(float3 D, float g, float randu, float randv, float *pdf)
 {
-	const HenyeyGreensteinVolume *volume = (const HenyeyGreensteinVolume*)sc;
-	float g = volume->g;
-	float cos_phi, sin_phi, cos_theta;
-
 	/* match pdf for small g */
-	if(fabsf(g) < 1e-3f) {
+	float cos_theta;
+	bool isotropic = fabsf(g) < 1e-3f;
+
+	if(isotropic) {
 		cos_theta = (1.0f - 2.0f * randu);
-		*pdf = M_1_PI_F * 0.25f;
+		if(pdf) {
+			*pdf = M_1_PI_F * 0.25f;
+		}
 	}
 	else {
 		float k = (1.0f - g * g) / (1.0f - g + 2.0f * g * randu);
 		cos_theta = (1.0f + g * g - k * k) / (2.0f * g);
-		*pdf = single_peaked_henyey_greenstein(cos_theta, g);
+		if(pdf) {
+			*pdf = single_peaked_henyey_greenstein(cos_theta, g);
+		}
 	}
 
 	float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta);
-
 	float phi = M_2PI_F * randv;
-	cos_phi = cosf(phi);
-	sin_phi = sinf(phi);
+	float3 dir = make_float3(sin_theta * cosf(phi), sin_theta * sinf(phi), cos_theta);
 
-	/* note that I points towards the viewer and so is used negated */
 	float3 T, B;
-	make_orthonormals(-I, &T, &B);
-	*omega_in = sin_theta * cos_phi * T + sin_theta * sin_phi * B + cos_theta * (-I);
+	make_orthonormals(D, &T, &B);
+	dir = dir.x * T + dir.y * B + dir.z * D;
+
+	return dir;
+}
+
+ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc, float3 I, float3 dIdx, float3 dIdy, float randu, float randv,
+	float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+{
+	const HenyeyGreensteinVolume *volume = (const HenyeyGreensteinVolume*)sc;
+	float g = volume->g;
 
+	/* note that I points towards the viewer and so is used negated */
+	*omega_in = henyey_greenstrein_sample(-I, g, randu, randv, pdf);
 	*eval = make_float3(*pdf, *pdf, *pdf); /* perfect importance sampling */
 
 #ifdef __RAY_DIFFERENTIALS__
@@ -110,15 +133,6 @@ ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc, float3 I
 	return LABEL_VOLUME_SCATTER;
 }
 
-/* ABSORPTION VOLUME CLOSURE */
-
-ccl_device int volume_absorption_setup(ShaderClosure *sc)
-{
-	sc->type = CLOSURE_VOLUME_ABSORPTION_ID;
-
-	return SD_ABSORPTION;
-}
-
 /* VOLUME CLOSURE */
 
 ccl_device float3 volume_phase_eval(const ShaderData *sd, const ShaderClosure *sc, float3 omega_in, float *pdf)
diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h
new file mode 100644
index 00000000000..f6e474d6702
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FILTER_H__
+#define __FILTER_H__
+
+/* CPU Filter Kernel Interface */
+
+#include "util/util_types.h"
+
+#include "kernel/filter/filter_defines.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z
+#define KERNEL_NAME_EVAL(arch, name)  KERNEL_NAME_JOIN(kernel, arch, name)
+#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
+
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+CCL_NAMESPACE_END
+
+#endif /* __FILTER_H__ */
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
new file mode 100644
index 00000000000..ce96f733aff
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_defines.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FILTER_DEFINES_H__
+#define __FILTER_DEFINES_H__
+
+#define DENOISE_FEATURES 10
+#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
+#define XTWX_SIZE      (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
+#define XTWY_SIZE      (DENOISE_FEATURES+1)
+
+typedef struct TilesInfo {
+	int offsets[9];
+	int strides[9];
+	int x[4];
+	int y[4];
+	/* TODO(lukas): CUDA doesn't have uint64_t... */
+#ifdef __KERNEL_OPENCL__
+	ccl_global float *buffers[9];
+#else
+	long long int buffers[9];
+#endif
+} TilesInfo;
+
+#endif /* __FILTER_DEFINES_H__*/
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
new file mode 100644
index 00000000000..6226ed2c2ef
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_features.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ CCL_NAMESPACE_BEGIN
+
+#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride]
+
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
+ * pixel_buffer always points to the current pixel in the first pass. */
+#define FOR_PIXEL_WINDOW     pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
+                             for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+                                 for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
+
+#define END_FOR_PIXEL_WINDOW     } \
+                                 pixel_buffer += buffer_w - (high.x - low.x); \
+                             }
+
+ccl_device_inline void filter_get_features(int2 pixel,
+                                           const ccl_global float *ccl_restrict buffer,
+                                           float *features,
+                                           const float *ccl_restrict mean,
+                                           int pass_stride)
+{
+	features[0] = pixel.x;
+	features[1] = pixel.y;
+	features[2] = fabsf(ccl_get_feature(buffer, 0));
+	features[3] = ccl_get_feature(buffer, 1);
+	features[4] = ccl_get_feature(buffer, 2);
+	features[5] = ccl_get_feature(buffer, 3);
+	features[6] = ccl_get_feature(buffer, 4);
+	features[7] = ccl_get_feature(buffer, 5);
+	features[8] = ccl_get_feature(buffer, 6);
+	features[9] = ccl_get_feature(buffer, 7);
+	if(mean) {
+		for(int i = 0; i < DENOISE_FEATURES; i++)
+			features[i] -= mean[i];
+	}
+}
+
+ccl_device_inline void filter_get_feature_scales(int2 pixel,
+                                                 const ccl_global float *ccl_restrict buffer,
+                                                 float *scales,
+                                                 const float *ccl_restrict mean,
+                                                 int pass_stride)
+{ /* Writes scales[0..5] only; mean is dereferenced unconditionally and must hold DENOISE_FEATURES values. */
+	scales[0] = fabsf(pixel.x - mean[0]);
+	scales[1] = fabsf(pixel.y - mean[1]);
+	scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]);
+	scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
+	                                    ccl_get_feature(buffer, 2) - mean[4],
+	                                    ccl_get_feature(buffer, 3) - mean[5])); /* Squared; filter_calculate_scale() takes the sqrt. */
+	scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
+	scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
+	                                    ccl_get_feature(buffer, 6) - mean[8],
+	                                    ccl_get_feature(buffer, 7) - mean[9])); /* Squared; filter_calculate_scale() takes the sqrt. */
+}
+
+ccl_device_inline void filter_calculate_scale(float *scale)
+{ /* In-place: expands the 6 values in scale[0..5] into 10 reciprocal scales; statement order matters. */
+	scale[0] = 1.0f/max(scale[0], 0.01f);
+	scale[1] = 1.0f/max(scale[1], 0.01f);
+	scale[2] = 1.0f/max(scale[2], 0.01f);
+	scale[6] = 1.0f/max(scale[4], 0.01f); /* Reads input 4 before the last line overwrites it. */
+	scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f); /* Reads input 5 before the last line overwrites it. */
+	scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f); /* Must stay last: clobbers inputs 4 and 5. */
+}
+
+ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer,
+                                          int pass_stride)
+{
+	return make_float3(ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10));
+}
+
+ccl_device_inline void design_row_add(float *design_row,
+                                      int rank,
+                                      const ccl_global float *ccl_restrict transform,
+                                      int stride,
+                                      int row,
+                                      float feature)
+{
+	for(int i = 0; i < rank; i++) {
+		design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature;
+	}
+}
+
+/* Fill the design row. */
+ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
+                                                       const ccl_global float *ccl_restrict p_buffer,
+                                                       int2 q_pixel,
+                                                       const ccl_global float *ccl_restrict q_buffer,
+                                                       int pass_stride,
+                                                       int rank,
+                                                       float *design_row,
+                                                       const ccl_global float *ccl_restrict transform,
+                                                       int stride)
+{
+	design_row[0] = 1.0f;
+	math_vector_zero(design_row+1, rank);
+	design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
+	design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
+	design_row_add(design_row, rank, transform, stride, 2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
+	design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
+	design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
+	design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
+	design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
+	design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
+	design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
+	design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
new file mode 100644
index 00000000000..3ddd8712266
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride)
+
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
+ * pixel_buffer always points to the first of the 4 current pixels in the first pass.
+ * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */
+
+#define FOR_PIXEL_WINDOW_SSE     pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
+                                 for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+                                     float4 y4 = make_float4(pixel.y); \
+                                     for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
+                                         float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
+                                         int4 active_pixels = x4 < make_float4(high.x);
+
+#define END_FOR_PIXEL_WINDOW_SSE     } \
+                                     pixel_buffer += buffer_w - (pixel.x - low.x); \
+                                 }
+
+ccl_device_inline void filter_get_features_sse(float4 x, float4 y,
+                                               int4 active_pixels,
+                                               const float *ccl_restrict buffer,
+                                               float4 *features,
+                                               const float4 *ccl_restrict mean,
+                                               int pass_stride)
+{
+	features[0] = x;
+	features[1] = y;
+	features[2] = fabs(ccl_get_feature_sse(0));
+	features[3] = ccl_get_feature_sse(1);
+	features[4] = ccl_get_feature_sse(2);
+	features[5] = ccl_get_feature_sse(3);
+	features[6] = ccl_get_feature_sse(4);
+	features[7] = ccl_get_feature_sse(5);
+	features[8] = ccl_get_feature_sse(6);
+	features[9] = ccl_get_feature_sse(7);
+	if(mean) {
+		for(int i = 0; i < DENOISE_FEATURES; i++)
+			features[i] = features[i] - mean[i];
+	}
+	for(int i = 0; i < DENOISE_FEATURES; i++)
+		features[i] = mask(active_pixels, features[i]);
+}
+
+ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y,
+                                                     int4 active_pixels,
+                                                     const float *ccl_restrict buffer,
+                                                     float4 *scales,
+                                                     const float4 *ccl_restrict mean,
+                                                     int pass_stride)
+{
+	scales[0] = fabs(x - mean[0]);
+	scales[1] = fabs(y - mean[1]);
+	scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]);
+	scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) +
+	            sqr(ccl_get_feature_sse(2) - mean[4]) +
+	            sqr(ccl_get_feature_sse(3) - mean[5]);
+	scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]);
+	scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) +
+	            sqr(ccl_get_feature_sse(6) - mean[8]) +
+	            sqr(ccl_get_feature_sse(7) - mean[9]);
+	for(int i = 0; i < 6; i++)
+		scales[i] = mask(active_pixels, scales[i]);
+}
+
+ccl_device_inline void filter_calculate_scale_sse(float4 *scale)
+{ /* In-place: expands the 6 values in scale[0..5] into 10 reciprocal scales; statement order matters. */
+	scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f)));
+	scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
+	scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
+	scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); /* Reads input 4 before the last line overwrites it. */
+	scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); /* Reads input 5 before the last line overwrites it. */
+	scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); /* Must stay last: clobbers inputs 4 and 5. */
+}
+
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h
new file mode 100644
index 00000000000..2ef03dc0a02
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_kernel.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/util_color.h"
+#include "util/util_math.h"
+#include "util/util_math_fast.h"
+#include "util/util_texture.h"
+
+#include "util/util_atomic.h"
+#include "util/util_math_matrix.h"
+
+#include "kernel/filter/filter_defines.h"
+
+#include "kernel/filter/filter_features.h"
+#ifdef __KERNEL_SSE3__
+#  include "kernel/filter/filter_features_sse.h"
+#endif
+
+#include "kernel/filter/filter_prefilter.h"
+
+#ifdef __KERNEL_GPU__
+#  include "kernel/filter/filter_transform_gpu.h"
+#else
+#  ifdef __KERNEL_SSE3__
+#    include "kernel/filter/filter_transform_sse.h"
+#  else
+#    include "kernel/filter/filter_transform.h"
+#  endif
+#endif
+
+#include "kernel/filter/filter_reconstruction.h"
+
+#ifdef __KERNEL_CPU__
+#  include "kernel/filter/filter_nlm_cpu.h"
+#else
+#  include "kernel/filter/filter_nlm_gpu.h"
+#endif
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
new file mode 100644
index 00000000000..e2da0fd872b
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
+                                                         const float *ccl_restrict weight_image,
+                                                         const float *ccl_restrict variance_image,
+                                                         float *difference_image,
+                                                         int4 rect,
+                                                         int stride,
+                                                         int channel_offset,
+                                                         float a,
+                                                         float k_2)
+{
+	for(int y = rect.y; y < rect.w; y++) {
+		for(int x = rect.x; x < rect.z; x++) {
+			float diff = 0.0f;
+			int numChannels = channel_offset? 3 : 1;
+			for(int c = 0; c < numChannels; c++) {
+				float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)];
+				float pvar = variance_image[c*channel_offset + y*stride + x];
+				float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)];
+				diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
+			}
+			if(numChannels > 1) {
+				diff *= 1.0f/numChannels;
+			}
+			difference_image[y*stride + x] = diff;
+		}
+	}
+}
+
+ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image,
+                                              float *out_image,
+                                              int4 rect,
+                                              int stride,
+                                              int f)
+{ /* Vertical box average: out row y = mean of difference rows [y-f, y+f] clamped to [rect.y, rect.w). */
+	int aligned_lowx = rect.x / 4; /* Column range in float4 units, widened to whole groups of 4. */
+	int aligned_highx = (rect.z + 3) / 4;
+	for(int y = rect.y; y < rect.w; y++) {
+		const int low = max(rect.y, y-f);
+		const int high = min(rect.w, y+f+1);
+		for(int x = rect.x; x < rect.z; x++) {
+			out_image[y*stride + x] = 0.0f;
+		}
+		for(int y1 = low; y1 < high; y1++) {
+			float4* out_image4 = (float4*)(out_image + y*stride); /* NOTE(review): assumes rows are float4-aligned - confirm. */
+			float4* difference_image4 = (float4*)(difference_image + y1*stride);
+			for(int x = aligned_lowx; x < aligned_highx; x++) { /* May also touch columns outside [rect.x, rect.z). */
+				out_image4[x] += difference_image4[x];
+			}
+		}
+		for(int x = rect.x; x < rect.z; x++) {
+			out_image[y*stride + x] *= 1.0f/(high - low); /* Divide by the number of rows actually summed. */
+		}
+	}
+}
+
+ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
+                                                     float *out_image,
+                                                     int4 rect,
+                                                     int stride,
+                                                     int f)
+{
+	for(int y = rect.y; y < rect.w; y++) {
+		for(int x = rect.x; x < rect.z; x++) {
+			out_image[y*stride + x] = 0.0f;
+		}
+	}
+	for(int dx = -f; dx <= f; dx++) {
+		int pos_dx = max(0, dx);
+		int neg_dx = min(0, dx);
+		for(int y = rect.y; y < rect.w; y++) {
+			for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
+				out_image[y*stride + x] += difference_image[y*stride + x+dx];
+			}
+		}
+	}
+	for(int y = rect.y; y < rect.w; y++) {
+		for(int x = rect.x; x < rect.z; x++) {
+			const int low = max(rect.x, x-f);
+			const int high = min(rect.z, x+f+1);
+			out_image[y*stride + x] = fast_expf(-max(out_image[y*stride + x] * (1.0f/(high - low)), 0.0f));
+		}
+	}
+}
+
+ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
+                                                       const float *ccl_restrict difference_image,
+                                                       const float *ccl_restrict image,
+                                                       float *out_image,
+                                                       float *accum_image,
+                                                       int4 rect,
+                                                       int stride,
+                                                       int f)
+{
+	for(int y = rect.y; y < rect.w; y++) {
+		for(int x = rect.x; x < rect.z; x++) {
+			const int low = max(rect.x, x-f);
+			const int high = min(rect.z, x+f+1);
+			float sum = 0.0f;
+			for(int x1 = low; x1 < high; x1++) {
+				sum += difference_image[y*stride + x1];
+			}
+			float weight = sum * (1.0f/(high - low));
+			accum_image[y*stride + x] += weight;
+			out_image[y*stride + x] += weight*image[(y+dy)*stride + (x+dx)];
+		}
+	}
+}
+
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
+                                                           const float *ccl_restrict difference_image,
+                                                           const float *ccl_restrict buffer,
+                                                           float *transform,
+                                                           int *rank,
+                                                           float *XtWX,
+                                                           float3 *XtWY,
+                                                           int4 rect,
+                                                           int4 filter_window,
+                                                           int stride, int f,
+                                                           int pass_stride)
+{
+	int4 clip_area = rect_clip(rect, filter_window);
+	/* x and y cover rect clipped to filter_window; coord_to_local_index() gives the per-pixel storage slot (presumably filter_window-relative - confirm). */
+	for(int y = clip_area.y; y < clip_area.w; y++) {
+		for(int x = clip_area.x; x < clip_area.z; x++) {
+			const int low = max(rect.x, x-f);
+			const int high = min(rect.z, x+f+1);
+			float sum = 0.0f;
+			for(int x1 = low; x1 < high; x1++) {
+				sum += difference_image[y*stride + x1];
+			}
+			float weight = sum * (1.0f/(high - low)); /* Horizontal box average over [x-f, x+f] clamped to rect. */
+
+			int storage_ofs = coord_to_local_index(filter_window, x, y);
+			float  *l_transform = transform + storage_ofs*TRANSFORM_SIZE;
+			float  *l_XtWX = XtWX + storage_ofs*XTWX_SIZE;
+			float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
+			int    *l_rank = rank + storage_ofs;
+
+			kernel_filter_construct_gramian(x, y, 1,
+			                                dx, dy,
+			                                stride,
+			                                pass_stride,
+			                                buffer,
+			                                l_transform, l_rank,
+			                                weight, l_XtWX, l_XtWY, 0);
+		}
+	}
+}
+
+ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
+                                                   const float *ccl_restrict accum_image,
+                                                   int4 rect,
+                                                   int w)
+{
+	for(int y = rect.y; y < rect.w; y++) {
+		for(int x = rect.x; x < rect.z; x++) {
+			out_image[y*w+x] /= accum_image[y*w+x];
+		}
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
new file mode 100644
index 00000000000..4ca49ea6733
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Determines pixel coordinates and offset for the current thread.
+ * Returns whether the thread should do any work.
+ *
+ * All coordinates are relative to the denoising buffer!
+ *
+ * Window is the rect that should be processed.
+ * co is filled with (x, y, dx, dy).
+ */
+ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride,
+                                             int4 *rect, int4 *co, int *ofs,
+                                             int4 window)
+{
+	/* The global ID along dimension 1 enumerates the shifts of the (2r+1) x (2r+1) search window. */
+	const int side = 2*r + 1;
+	const int shift_index = ccl_global_id(1);
+	const int shift_col = shift_index % side;
+	const int shift_row = shift_index / side;
+	if(shift_row >= side) {
+		return false;
+	}
+	co->z = shift_col - r;
+	co->w = shift_row - r;
+
+	/* Shrink the buffer area so that a pixel and its shifted counterpart
+	 * both stay inside the w x h denoising buffer. */
+	*rect = make_int4(max(0, -co->z),     max(0, -co->w),
+	                  w - max(0,  co->z), h - max(0,  co->w));
+
+	/* Only pixels that are both requested (window) and usable (rect)
+	 * get processed for this shift. */
+	const int4 area = rect_clip(window, *rect);
+
+	/* A radius exceeding one of the sides of the window produces shifts
+	 * for which no pixel is left at all; those threads have nothing to do. */
+	if(!rect_is_valid(area)) {
+		return false;
+	}
+
+	/* The global ID along dimension 0 is a linear index into the remaining area. */
+	int px, py;
+	if(!local_index_to_coord(area, ccl_global_id(0), &px, &py)) {
+		return false;
+	}
+	co->x = px;
+	co->y = py;
+
+	*ofs = (shift_row*side + shift_col) * stride;
+
+	return true;
+}
+
+ccl_device_inline bool get_nlm_coords(int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs)
+{
+	const int4 whole_buffer = make_int4(0, 0, w, h); /* No sub-window: cover the entire w x h buffer. */
+	return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, whole_buffer);
+}
+
+/* Per-pixel distance between pixel p = (x, y) and its shifted counterpart q = (x+dx, y+dy).
+ * A non-zero channel_offset selects three channels spaced channel_offset floats apart,
+ * whose distances are averaged; otherwise a single channel is used. */
+ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, int dx, int dy,
+                                                         const ccl_global float *ccl_restrict weight_image,
+                                                         const ccl_global float *ccl_restrict variance_image,
+                                                         ccl_global float *difference_image,
+                                                         int4 rect, int stride, int channel_offset,
+                                                         float a, float k_2)
+{
+	const int p_idx = y*stride + x;
+	const int q_idx = (y+dy)*stride + (x+dx);
+	const int num_channels = (channel_offset != 0)? 3 : 1;
+	float distance = 0.0f;
+	for(int channel = 0; channel < num_channels; channel++) {
+		const int base = channel*channel_offset;
+		const float delta = weight_image[base + p_idx] - weight_image[base + q_idx];
+		const float p_var = variance_image[base + p_idx], q_var = variance_image[base + q_idx];
+		distance += (delta*delta - a*(p_var + min(p_var, q_var))) / (1e-8f + k_2*(p_var+q_var));
+	}
+	difference_image[p_idx] = (num_channels > 1)? distance * (1.0f/num_channels) : distance;
+}
+
+/* Vertical box filter: average difference_image over the rows y-f..y+f (clamped to rect)
+ * at column x and store the result in out_image. */
+ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
+                                              const ccl_global float *ccl_restrict difference_image,
+                                              ccl_global float *out_image, int4 rect, int stride, int f)
+{
+	const int first_row = max(rect.y, y - f);
+	const int end_row = min(rect.w, y + f + 1);
+	float total = 0.0f;
+	for(int row = first_row; row < end_row; row++) {
+		total += difference_image[row*stride + x];
+	}
+	out_image[y*stride + x] = total * (1.0f/(end_row - first_row));
+}
+
+/* Average difference_image over the columns x-f..x+f (clamped to rect) of row y and
+ * store fast_expf(-max(average, 0)) as the weight in out_image. */
+ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
+                                                     const ccl_global float *ccl_restrict difference_image,
+                                                     ccl_global float *out_image, int4 rect, int stride, int f)
+{
+	const int first_col = max(rect.x, x - f);
+	const int end_col = min(rect.z, x + f + 1);
+	float total = 0.0f;
+	for(int col = first_col; col < end_col; col++) {
+		total += difference_image[y*stride + col];
+	}
+	out_image[y*stride + x] = fast_expf(-max(total * (1.0f/(end_col - first_col)), 0.0f));
+}
+
+/* Average difference_image over the columns x-f..x+f (clamped to rect) of row y to get the weight.
+ * With out_image set, add the weight and the weighted shifted pixel atomically; otherwise store the weight. */
+ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, int dx, int dy,
+                                                       const ccl_global float *ccl_restrict difference_image,
+                                                       const ccl_global float *ccl_restrict image,
+                                                       ccl_global float *out_image, ccl_global float *accum_image,
+                                                       int4 rect, int stride, int f)
+{
+	const int first_col = max(rect.x, x - f);
+	const int end_col = min(rect.z, x + f + 1);
+	float weight = 0.0f;
+	for(int col = first_col; col < end_col; col++) {
+		weight += difference_image[y*stride + col];
+	}
+	weight *= 1.0f/(end_col - first_col);
+	const int p_idx = y*stride + x;
+	if(!out_image) {
+		accum_image[p_idx] = weight;
+		return;
+	}
+	atomic_add_and_fetch_float(accum_image + p_idx, weight);
+	atomic_add_and_fetch_float(out_image + p_idx, weight*image[(y+dy)*stride + (x+dx)]);
+}
+
+/* Average difference_image over the columns x-f..x+f (clamped to rect) of row y and use the
+ * result as the weight with which the pixel shifted by (dx, dy) is accumulated into the
+ * XtWX and XtWY entries of pixel (x, y).
+ * The actual accumulation is done by kernel_filter_construct_gramian(). */
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y, int dx, int dy,
+                                                           const ccl_global float *ccl_restrict difference_image,
+                                                           const ccl_global float *ccl_restrict buffer,
+                                                           const ccl_global float *ccl_restrict transform,
+                                                           ccl_global int *rank,
+                                                           ccl_global float *XtWX,
+                                                           ccl_global float3 *XtWY,
+                                                           int4 rect,
+                                                           int4 filter_window,
+                                                           int stride, int f, int pass_stride,
+                                                           int localIdx)
+{
+	const int first_col = max(rect.x, x - f);
+	const int end_col = min(rect.z, x + f + 1);
+	float total = 0.0f;
+	for(int col = first_col; col < end_col; col++) {
+		total += difference_image[y*stride + col];
+	}
+	const float weight = total * (1.0f/(end_col - first_col));
+
+	/* Reconstruction data is only stored for pixels inside the filter window,
+	 * so compute the pixel's index in there and address all per-pixel data with it. */
+	const int storage_ofs = coord_to_local_index(filter_window, x, y);
+
+	kernel_filter_construct_gramian(x, y,
+	                                rect_size(filter_window),
+	                                dx, dy,
+	                                stride,
+	                                pass_stride,
+	                                buffer,
+	                                transform + storage_ofs,
+	                                rank + storage_ofs,
+	                                weight,
+	                                XtWX + storage_ofs, XtWY + storage_ofs,
+	                                localIdx);
+}
+
+/* Divide pixel (x, y) of out_image by the matching value in accum_image. */
+ccl_device_inline void kernel_filter_nlm_normalize(int x, int y, ccl_global float *out_image,
+                                                   const ccl_global float *ccl_restrict accum_image, int stride)
+{
+	const int idx = y*stride + x;
+	out_image[idx] = out_image[idx] / accum_image[idx];
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
new file mode 100644
index 00000000000..4af209341f6
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -0,0 +1,212 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* First step of the shadow prefiltering, performs the shadow division and stores all data
+ * in a nice and easy rectangular array that can be passed to the NLM filter.
+ * All outputs are indexed relative to rect, with a row pitch of align_up(rect.z - rect.x, 4).
+ * Calculates:
+ * unfilteredA, unfilteredB: Contain the two half images of the shadow feature pass.
+ * sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general, and especially here since the variance of the ratio can only be approximated.
+ * sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves)
+ * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy.
+ */
+ccl_device void kernel_filter_divide_shadow(int sample,
+                                            ccl_global TilesInfo *tiles,
+                                            int x, int y,
+                                            ccl_global float *unfilteredA,
+                                            ccl_global float *unfilteredB,
+                                            ccl_global float *sampleVariance,
+                                            ccl_global float *sampleVarianceV,
+                                            ccl_global float *bufferVariance,
+                                            int4 rect,
+                                            int buffer_pass_stride,
+                                            int buffer_denoising_offset)
+{
+	int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); /* Tile column (0..2) containing x. */
+	int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2); /* Tile row (0..2) containing y. */
+	int tile = ytile*3+xtile; /* Row-major index into the per-tile arrays of the 3x3 neighborhood. */
+
+	int offset = tiles->offsets[tile];
+	int stride = tiles->strides[tile];
+	const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) tiles->buffers[tile];
+	center_buffer += (y*stride + x + offset)*buffer_pass_stride; /* Start of this pixel's data in the tile buffer. */
+	center_buffer += buffer_denoising_offset + 14; /* NOTE(review): 14 is presumably the offset of the shadow data within the denoising passes - confirm against the pass layout. */
+
+	int buffer_w = align_up(rect.z - rect.x, 4);
+	int idx = (y-rect.y)*buffer_w + (x - rect.x);
+	unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f); /* The 1e-7f floor keeps the divisor away from zero. */
+	unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
+
+	float varA = center_buffer[2];
+	float varB = center_buffer[5];
+	int odd_sample = (sample+1)/2; /* Sample count used for half A. */
+	int even_sample = sample/2;    /* Sample count used for half B. */
+
+	/* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
+	 * update does not work efficiently with atomics in the kernel. The result is clamped to be non-negative. */
+	varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
+	varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
+
+	varA /= max(odd_sample - 1, 1); /* Normalize by N-1, but never by less than 1. */
+	varB /= max(even_sample - 1, 1);
+
+	sampleVariance[idx]  = 0.5f*(varA + varB) / sample;
+	sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample);
+	bufferVariance[idx]  = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]);
+}
+
+/* Load a regular feature from the render buffers into the denoise buffer.
+ * Parameters:
+ * - sample: The sample amount in the buffer, used to normalize the buffer.
+ * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature.
+ * - x, y: Current pixel
+ * - mean, variance: Target denoise buffers.
+ * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
+ */
+ccl_device void kernel_filter_get_feature(int sample, ccl_global TilesInfo *tiles,
+                                          int m_offset, int v_offset, int x, int y,
+                                          ccl_global float *mean, ccl_global float *variance,
+                                          int4 rect, int buffer_pass_stride,
+                                          int buffer_denoising_offset)
+{
+	/* Pick the tile of the 3x3 neighborhood that contains the pixel. */
+	const int tile_col = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
+	const int tile_row = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
+	const int tile_idx = tile_row*3 + tile_col;
+
+	/* Find the pixel's denoising data inside that tile's render buffer. */
+	ccl_global float *pixel_data = (ccl_global float*) tiles->buffers[tile_idx];
+	pixel_data += (tiles->offsets[tile_idx] + y*tiles->strides[tile_idx] + x)*buffer_pass_stride;
+	pixel_data += buffer_denoising_offset;
+
+	/* Position of the pixel inside the rect-relative target buffers. */
+	const int row_pitch = align_up(rect.z - rect.x, 4);
+	const int out_idx = (y - rect.y)*row_pitch + (x - rect.x);
+
+	mean[out_idx] = pixel_data[m_offset] / sample;
+
+	/* With more than one sample the variance is approximated as E[x^2] - 1/N * (E[x])^2
+	 * (clamped to zero), since an online variance update does not work efficiently with
+	 * atomics in the kernel. A single sample allows no estimate, so it is set very high.
+	 * The mean is read back from the target buffer, exactly as it was just stored. */
+	variance[out_idx] = (sample > 1)? max(0.0f, (pixel_data[v_offset] - mean[out_idx]*mean[out_idx]*sample) / (sample * (sample-1))) : 1e10f;
+}
+
+ccl_device void kernel_filter_detect_outliers(int x, int y,
+                                              ccl_global float *image,
+                                              ccl_global float *variance,
+                                              ccl_global float *depth,
+                                              ccl_global float *out,
+                                              int4 rect,
+                                              int pass_stride)
+{
+	int buffer_w = align_up(rect.z - rect.x, 4); /* Row pitch of the rect-relative buffers; color channels are pass_stride floats apart. */
+	/* Gather the luminance (channel average, negative values clamped to zero) of the up to 5x5 neighbors inside rect, sorted ascending. */
+	int n = 0;
+	float values[25];
+	for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) {
+		for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) {
+			int idx = (y1-rect.y)*buffer_w + (x1-rect.x);
+			float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]);
+			color = max(color, make_float3(0.0f, 0.0f, 0.0f));
+			float L = average(color);
+
+			/* Find the position of L: the first stored value that is larger. */
+			int i;
+			for(i = 0; i < n; i++) {
+				if(values[i] > L) break;
+			}
+			/* Make space for L by shifting all following values to the right. */
+			for(int j = n; j > i; j--) {
+				values[j] = values[j-1];
+			}
+			/* Insert L, which keeps values[0..n] sorted. */
+			values[i] = L;
+			n++;
+		}
+	}
+	/* Luminance of the center pixel itself, computed the same way as for the neighbors. */
+	int idx = (y-rect.y)*buffer_w + (x-rect.x);
+	float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]);
+	color = max(color, make_float3(0.0f, 0.0f, 0.0f));
+	float L = average(color);
+	/* The outlier threshold is twice the value at the 75% position of the sorted neighborhood. */
+	float ref = 2.0f*values[(int)(n*0.75f)];
+	if(L > ref) {
+		/* The pixel appears to be an outlier.
+		 * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel
+		 * should actually be at the reference value:
+		 * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier.
+		 * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight.
+		 */
+		float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride])));
+		if(L - 3*stddev < ref) {
+			/* The pixel is an outlier, so negate the depth value to mark it as one.
+			 * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
+			depth[idx] = -depth[idx];
+			float fac = ref/L; /* L > ref >= 0 here, so fac lies in [0, 1). */
+			color *= fac;
+			variance[idx              ] *= fac*fac; /* Variance scales with the square of the factor. */
+			variance[idx + pass_stride] *= fac*fac;
+			variance[idx+2*pass_stride] *= fac*fac;
+		}
+	}
+	out[idx              ] = color.x; /* Written for every pixel: the clamped and possibly scaled-down color. */
+	out[idx + pass_stride] = color.y;
+	out[idx+2*pass_stride] = color.z;
+}
+
+/* Combine A/B buffers.
+ * Calculates the combined mean (if mean is given) and the buffer variance (if variance is given).
+ * For r == 0 the variance is 0.25*(a-b)^2 of the pixel itself; otherwise it is the 7/8 quantile of
+ * that value over the window of radius min(r, 2) around (x, y), clipped to rect. */
+ccl_device void kernel_filter_combine_halves(int x, int y,
+                                             ccl_global float *mean, ccl_global float *variance,
+                                             ccl_global float *a, ccl_global float *b,
+                                             int4 rect, int r)
+{
+	int buffer_w = align_up(rect.z - rect.x, 4);
+	int idx = (y-rect.y)*buffer_w + (x - rect.x);
+
+	if(mean)     mean[idx] = 0.5f * (a[idx]+b[idx]);
+	if(variance) {
+		if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]);
+		else {
+			variance[idx] = 0.0f;
+			float values[25]; /* Holds a 5x5 window at most, so the radius is capped at 2 to prevent an overflow. */
+			int numValues = 0, window_r = min(r, 2);
+			for(int py = max(y-window_r, rect.y); py < min(y+window_r+1, rect.w); py++) {
+				for(int px = max(x-window_r, rect.x); px < min(x+window_r+1, rect.z); px++) {
+					int pidx = (py-rect.y)*buffer_w + (px-rect.x);
+					values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]);
+				}
+			}
+			/* Insertion-sort the variances (fast enough for 25 elements). */
+			for(int i = 1; i < numValues; i++) {
+				float v = values[i];
+				int j;
+				for(j = i-1; j >= 0 && values[j] > v; j--)
+					values[j+1] = values[j];
+				values[j+1] = v;
+			}
+			variance[idx] = values[(7*numValues)/8];
+		}
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
new file mode 100644
index 00000000000..b7bf322f9ce
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
+                                                       int storage_stride,
+                                                       int dx, int dy,
+                                                       int buffer_stride,
+                                                       int pass_stride,
+                                                       const ccl_global float *ccl_restrict buffer,
+                                                       const ccl_global float *ccl_restrict transform,
+                                                       ccl_global int *rank,
+                                                       float weight,
+                                                       ccl_global float *XtWX,
+                                                       ccl_global float3 *XtWY,
+                                                       int localIdx)
+{ /* Adds the contribution of pixel (x+dx, y+dy), weighted by weight, to XtWX and XtWY of pixel (x, y). */
+	if(weight < 1e-3f) { /* Contributions with negligible weight are skipped entirely. */
+		return;
+	}
+	/* p is the pixel being reconstructed, q the shifted pixel that contributes to it. */
+	int p_offset =  y     * buffer_stride +  x;
+	int q_offset = (y+dy) * buffer_stride + (x+dx);
+	/* The GPU build passes storage_stride on to the strided helpers below; the CPU build always uses a stride of 1. */
+#ifdef __KERNEL_GPU__
+	const int stride = storage_stride;
+#else
+	const int stride = 1;
+	(void) storage_stride;
+#endif
+	/* On CUDA the design row lives in a ccl_local array with one slice per localIdx; elsewhere it is a private array. */
+#ifdef __KERNEL_CUDA__
+	ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
+	ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
+#else
+	float design_row[DENOISE_FEATURES+1];
+#endif
+
+	float3 q_color = filter_get_color(buffer + q_offset, pass_stride);
+
+	/* If the pixel was flagged as an outlier during prefiltering, skip it. */
+	if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) {
+		return;
+	}
+	/* NOTE(review): presumably fills design_row with the features of q relative to p, projected by transform - confirm in the helper. */
+	filter_get_design_row_transform(make_int2(x, y),       buffer + p_offset,
+	                                make_int2(x+dx, y+dy), buffer + q_offset,
+	                                pass_stride, *rank, design_row, transform, stride);
+	/* Accumulate over the first rank+1 design row entries: weight into XtWX, weight * q_color into XtWY. */
+	math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
+	math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
+}
+
+/* Solve the linear model accumulated in XtWX/XtWY for pixel (x, y) and store the result,
+ * scaled by the sample count, in the first three floats of the pixel's entry in buffer.
+ * buffer_params is (offset, stride, pass stride, offset of three floats to add on top or 0 for none). */
+ccl_device_inline void kernel_filter_finalize(int x, int y,
+                                              ccl_global float *buffer,
+                                              ccl_global int *rank,
+                                              int storage_stride,
+                                              ccl_global float *XtWX,
+                                              ccl_global float3 *XtWY,
+                                              int4 buffer_params, int sample)
+{
+#ifdef __KERNEL_GPU__
+	const int matrix_stride = storage_stride;
+#else
+	(void) storage_stride;
+	const int matrix_stride = 1;
+#endif
+
+	/* Without enough accumulated weight no denoised result can be determined,
+	 * so as a fallback the pixel keeps its original value. */
+	if(XtWX[0] < 1e-3f) {
+		return;
+	}
+
+	/* Weighted average of the contributing colors (essentially, the NLM-filtered image);
+	 * used as the fallback if solving the linear model fails due to numerical issues. */
+	const float3 fallback_color = XtWY[0]/XtWX[0];
+
+	math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, matrix_stride);
+
+	float3 color = XtWY[0];
+	if(!isfinite3_safe(color)) {
+		color = fallback_color;
+	}
+	/* Negative results are clamped to zero. */
+	color = max(color, make_float3(0.0f, 0.0f, 0.0f));
+
+	ccl_global float *pixel = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
+	color *= sample;
+	if(buffer_params.w != 0) {
+		color.x += pixel[buffer_params.w+0];
+		color.y += pixel[buffer_params.w+1];
+		color.z += pixel[buffer_params.w+2];
+	}
+	pixel[0] = color.x;
+	pixel[1] = color.y;
+	pixel[2] = color.z;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
new file mode 100644
index 00000000000..a5f87c05ec0
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
+                                                  int x, int y, int4 rect,
+                                                  int pass_stride,
+                                                  float *transform, int *rank,
+                                                  int radius, float pca_threshold)
+{ /* Builds the feature-space transform and its rank for pixel (x, y) from the window of the given radius, clipped to rect. */
+	int buffer_w = align_up(rect.z - rect.x, 4); /* NOTE(review): not referenced directly below, presumably used by FOR_PIXEL_WINDOW - confirm. */
+
+	float features[DENOISE_FEATURES];
+
+	/* Temporary storage, used in different steps of the algorithm. */
+	float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES];
+	float tempvector[2*DENOISE_FEATURES];
+	const float *ccl_restrict pixel_buffer;
+	int2 pixel;
+
+	/* === Calculate denoising window (low inclusive, high exclusive). === */
+	int2 low  = make_int2(max(rect.x, x - radius),
+	                      max(rect.y, y - radius));
+	int2 high = make_int2(min(rect.z, x + radius + 1),
+	                      min(rect.w, y + radius + 1));
+	int num_pixels = (high.y - low.y) * (high.x - low.x);
+
+	/* === Shift feature passes to have mean 0. === */
+	float feature_means[DENOISE_FEATURES];
+	math_vector_zero(feature_means, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW {
+		filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
+		math_vector_add(feature_means, features, DENOISE_FEATURES);
+	} END_FOR_PIXEL_WINDOW
+
+	math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); /* Sum over the window -> mean. */
+
+	/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+	float *feature_scale = tempvector;
+	math_vector_zero(feature_scale, DENOISE_FEATURES);
+
+	FOR_PIXEL_WINDOW {
+		filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
+		math_vector_max(feature_scale, features, DENOISE_FEATURES);
+	} END_FOR_PIXEL_WINDOW
+
+	filter_calculate_scale(feature_scale);
+
+	/* === Generate the feature transformation. ===
+	 * This transformation maps the DENOISE_FEATURES-dimensional feature space to a reduced feature (r-feature) space
+	 * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+	float* feature_matrix = tempmatrix;
+	math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW {
+		filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
+		math_vector_mul(features, feature_scale, DENOISE_FEATURES);
+		math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+	} END_FOR_PIXEL_WINDOW
+	/* NOTE(review): the diagonal of feature_matrix is read below as per-dimension energy, so the decomposition presumably leaves the eigenvalues there - confirm. */
+	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+	*rank = 0;
+	/* Prevent overfitting when a small window is used: allow at most one dimension per three pixels. */
+	int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+	if(pca_threshold < 0.0f) { /* Negative threshold: relative criterion on the accumulated energy. */
+		float threshold_energy = 0.0f;
+		for(int i = 0; i < DENOISE_FEATURES; i++) {
+			threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+		}
+		threshold_energy *= 1.0f - (-pca_threshold); /* I.e. total energy * (1 + pca_threshold), with pca_threshold < 0. */
+
+		float reduced_energy = 0.0f;
+		for(int i = 0; i < max_rank; i++, (*rank)++) {
+			if(i >= 2 && reduced_energy >= threshold_energy) /* The first two dimensions are kept regardless of the threshold. */
+				break;
+			float s = feature_matrix[i*DENOISE_FEATURES+i];
+			reduced_energy += s;
+		}
+	}
+	else { /* Non-negative threshold: absolute criterion on sqrt(s) of each dimension. */
+		for(int i = 0; i < max_rank; i++, (*rank)++) {
+			float s = feature_matrix[i*DENOISE_FEATURES+i];
+			if(i >= 2 && sqrtf(s) < pca_threshold)
+				break;
+		}
+	}
+
+	/* Bake the feature scaling into the first rank rows of the transformation matrix, then transpose it. */
+	for(int i = 0; i < (*rank); i++) {
+		math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
+	}
+	math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
new file mode 100644
index 00000000000..83a1222bbdb
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform_gpu.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
+                                                  int x, int y, int4 rect,
+                                                  int pass_stride,
+                                                  ccl_global float *transform,
+                                                  ccl_global int *rank,
+                                                  int radius, float pca_threshold,
+                                                  int transform_stride, int localIdx)
+{ /* Builds the feature-space transform and its rank for pixel (x, y) from the window of the given radius, clipped to rect. */
+	int buffer_w = align_up(rect.z - rect.x, 4); /* NOTE(review): not referenced directly below, presumably used by FOR_PIXEL_WINDOW - confirm. */
+	/* On CUDA the feature vector lives in a ccl_local array with one slice per localIdx; elsewhere it is a private array. */
+#ifdef __KERNEL_CUDA__
+	ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE];
+	ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES;
+#else
+	float features[DENOISE_FEATURES];
+#endif
+
+	/* === Calculate denoising window (low inclusive, high exclusive). === */
+	int2 low  = make_int2(max(rect.x, x - radius),
+	                      max(rect.y, y - radius));
+	int2 high = make_int2(min(rect.z, x + radius + 1),
+	                      min(rect.w, y + radius + 1));
+	int num_pixels = (high.y - low.y) * (high.x - low.x);
+	const ccl_global float *ccl_restrict pixel_buffer;
+	int2 pixel;
+
+
+
+
+	/* === Shift feature passes to have mean 0. === */
+	float feature_means[DENOISE_FEATURES];
+	math_vector_zero(feature_means, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW {
+		filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
+		math_vector_add(feature_means, features, DENOISE_FEATURES);
+	} END_FOR_PIXEL_WINDOW
+
+	math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); /* Sum over the window -> mean. */
+
+	/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+	float feature_scale[DENOISE_FEATURES];
+	math_vector_zero(feature_scale, DENOISE_FEATURES);
+
+	FOR_PIXEL_WINDOW {
+		filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
+		math_vector_max(feature_scale, features, DENOISE_FEATURES);
+	} END_FOR_PIXEL_WINDOW
+
+	filter_calculate_scale(feature_scale);
+
+
+
+	/* === Generate the feature transformation. ===
+	 * This transformation maps the DENOISE_FEATURES-dimensional feature space to a reduced feature (r-feature) space
+	 * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+	float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+	math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW {
+		filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
+		math_vector_mul(features, feature_scale, DENOISE_FEATURES);
+		math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+	} END_FOR_PIXEL_WINDOW
+	/* NOTE(review): the diagonal of feature_matrix is read below as per-dimension energy, so the decomposition presumably leaves the eigenvalues there - confirm. */
+	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
+	*rank = 0;
+	/* Prevent overfitting when a small window is used: allow at most one dimension per three pixels. */
+	int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+	if(pca_threshold < 0.0f) { /* Negative threshold: relative criterion on the accumulated energy. */
+		float threshold_energy = 0.0f;
+		for(int i = 0; i < DENOISE_FEATURES; i++) {
+			threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+		}
+		threshold_energy *= 1.0f - (-pca_threshold); /* I.e. total energy * (1 + pca_threshold), with pca_threshold < 0. */
+
+		float reduced_energy = 0.0f;
+		for(int i = 0; i < max_rank; i++, (*rank)++) {
+			if(i >= 2 && reduced_energy >= threshold_energy) /* The first two dimensions are kept regardless of the threshold. */
+				break;
+			float s = feature_matrix[i*DENOISE_FEATURES+i];
+			reduced_energy += s;
+		}
+	}
+	else { /* Non-negative threshold: absolute criterion on sqrt(s) of each dimension. */
+		for(int i = 0; i < max_rank; i++, (*rank)++) {
+			float s = feature_matrix[i*DENOISE_FEATURES+i];
+			if(i >= 2 && sqrtf(s) < pca_threshold)
+				break;
+		}
+	}
+
+	math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride);
+
+	/* Bake the feature scaling into the transformation matrix (done after the transpose here). */
+	for(int i = 0; i < DENOISE_FEATURES; i++) {
+		for(int j = 0; j < (*rank); j++) { /* Only the first rank entries of each row are scaled. */
+			transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i];
+		}
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
new file mode 100644
index 00000000000..9e65f61664b
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+/* SSE variant: computes, for the pixel (x, y), the transform matrix and the number of components to keep (*rank) from the features of the surrounding window. */
+ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
+                                                  int x, int y, int4 rect,
+                                                  int pass_stride,
+                                                  float *transform, int *rank,
+                                                  int radius, float pca_threshold)
+{
+	int buffer_w = align_up(rect.z - rect.x, 4);
+	/* NOTE(review): buffer, buffer_w, pixel_buffer and pixel are not referenced directly below, while x4, y4 and active_pixels are never declared here; presumably the FOR_PIXEL_WINDOW_SSE macros use/define them - confirm. */
+	float4 features[DENOISE_FEATURES];
+	const float *ccl_restrict pixel_buffer;
+	int2 pixel;
+	/* Window of +-radius around (x, y), clamped to rect: rect.x/rect.y bound it from below, rect.z/rect.w are exclusive upper bounds (high is exclusive, see num_pixels). */
+	int2 low  = make_int2(max(rect.x, x - radius),
+	                      max(rect.y, y - radius));
+	int2 high = make_int2(min(rect.z, x + radius + 1),
+	                      min(rect.w, y + radius + 1));
+	int num_pixels = (high.y - low.y) * (high.x - low.x);
+	/* Pass 1: accumulate the features (fetched without means, NULL) and divide by the pixel count to get the per-feature mean. */
+	float4 feature_means[DENOISE_FEATURES];
+	math_vector_zero_sse(feature_means, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW_SSE {
+		filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
+		math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
+	} END_FOR_PIXEL_WINDOW_SSE
+
+	float4 pixel_scale = make_float4(1.0f / num_pixels);
+	for(int i = 0; i < DENOISE_FEATURES; i++) {
+		feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
+	}
+	/* Pass 2: combine the values from filter_get_feature_scales_sse with math_vector_max_sse (presumably a running per-feature maximum), then let filter_calculate_scale_sse post-process them in place (defined elsewhere). */
+	float4 feature_scale[DENOISE_FEATURES];
+	math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW_SSE {
+		filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
+		math_vector_max_sse(feature_scale, features, DENOISE_FEATURES);
+	} END_FOR_PIXEL_WINDOW_SSE
+
+	filter_calculate_scale_sse(feature_scale);
+	/* Pass 3: fetch the features again, this time with feature_means passed in (presumably to center them - confirm), multiply by feature_scale and accumulate their Gramian with weight 1. */
+	float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
+	math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW_SSE {
+		filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
+		math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
+		math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f));
+	} END_FOR_PIXEL_WINDOW_SSE
+	/* Convert the float4 accumulator into a plain float matrix; presumably math_matrix_hsum sums the four lanes of every entry - confirm. */
+	float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+	math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
+	/* The diagonal of feature_matrix is read below as the energy of each component; NOTE(review): the early breaks assume components are ordered by decreasing energy - confirm in math_matrix_jacobi_eigendecomposition. */
+	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+	/* Rank selection: at most max_rank components; the first two are always kept when max_rank allows, because both loops only break for i >= 2. */
+	*rank = 0;
+	/* Prevent overfitting when a small window is used. */
+	int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+	if(pca_threshold < 0.0f) {
+		float threshold_energy = 0.0f;
+		for(int i = 0; i < DENOISE_FEATURES; i++) {
+			threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+		}
+		threshold_energy *= 1.0f - (-pca_threshold);
+
+		float reduced_energy = 0.0f;
+		for(int i = 0; i < max_rank; i++, (*rank)++) {
+			if(i >= 2 && reduced_energy >= threshold_energy)
+				break;
+			float s = feature_matrix[i*DENOISE_FEATURES+i];
+			reduced_energy += s;
+		}
+	}
+	else {
+		for(int i = 0; i < max_rank; i++, (*rank)++) {
+			float s = feature_matrix[i*DENOISE_FEATURES+i];
+			if(i >= 2 && sqrtf(s) < pca_threshold)
+				break;
+		}
+	}
+
+	math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+
+	/* Bake the feature scaling into the transformation matrix. */
+	for(int i = 0; i < DENOISE_FEATURES; i++) {
+		math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank); /* Row i, first *rank entries, using lane 0 of feature_scale[i]. */
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index 6838e26c242..f34b77ebc07 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -14,19 +14,20 @@
  * limitations under the License.
  */
 
-#include "geom_attribute.h"
-#include "geom_object.h"
+#include "kernel/geom/geom_attribute.h"
+#include "kernel/geom/geom_object.h"
 #ifdef __PATCH_EVAL__
-#  include "geom_patch.h"
+#  include "kernel/geom/geom_patch.h"
 #endif
-#include "geom_triangle.h"
-#include "geom_subd_triangle.h"
-#include "geom_triangle_intersect.h"
-#include "geom_motion_triangle.h"
-#include "geom_motion_triangle_intersect.h"
-#include "geom_motion_triangle_shader.h"
-#include "geom_motion_curve.h"
-#include "geom_curve.h"
-#include "geom_volume.h"
-#include "geom_primitive.h"
+#include "kernel/geom/geom_triangle.h"
+#include "kernel/geom/geom_subd_triangle.h"
+#include "kernel/geom/geom_triangle_intersect.h"
+#include "kernel/geom/geom_motion_triangle.h"
+#include "kernel/geom/geom_motion_triangle_intersect.h"
+#include "kernel/geom/geom_motion_triangle_shader.h"
+#include "kernel/geom/geom_motion_curve.h"
+#include "kernel/geom/geom_curve.h"
+#include "kernel/geom/geom_curve_intersect.h"
+#include "kernel/geom/geom_volume.h"
+#include "kernel/geom/geom_primitive.h"
 
diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h
index 08ccee56335..42c053704d5 100644
--- a/intern/cycles/kernel/geom/geom_attribute.h
+++ b/intern/cycles/kernel/geom/geom_attribute.h
@@ -30,7 +30,7 @@ ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *
 ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd)
 {
 #ifdef __HAIR__
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
 		return ATTR_PRIM_CURVE;
 	}
 	else
@@ -51,14 +51,19 @@ ccl_device_inline AttributeDescriptor attribute_not_found()
 
 /* Find attribute based on ID */
 
+ccl_device_inline uint object_attribute_map_offset(KernelGlobals *kg, int object)
+{
+	return kernel_tex_fetch(__objects, object).attribute_map_offset; /* Per-object value; find_attribute() uses it as the base index into __attributes_map. */
+}
+
 ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id)
 {
-	if(ccl_fetch(sd, object) == PRIM_NONE) {
+	if(sd->object == OBJECT_NONE) {
 		return attribute_not_found();
 	}
 
 	/* for SVM, find attribute by unique id */
-	uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride;
+	uint attr_offset = object_attribute_map_offset(kg, sd->object);
 	attr_offset += attribute_primitive_type(kg, sd);
 	uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 	
@@ -73,7 +78,7 @@ ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const Sh
 	AttributeDescriptor desc;
 	desc.element = (AttributeElement)attr_map.y;
 	
-	if(ccl_fetch(sd, prim) == PRIM_NONE &&
+	if(sd->prim == PRIM_NONE &&
 	   desc.element != ATTR_ELEMENT_MESH &&
 	   desc.element != ATTR_ELEMENT_VOXEL &&
 	   desc.element != ATTR_ELEMENT_OBJECT)
@@ -98,7 +103,6 @@ ccl_device Transform primitive_attribute_matrix(KernelGlobals *kg, const ShaderD
 	tfm.x = kernel_tex_fetch(__attributes_float3, desc.offset + 0);
 	tfm.y = kernel_tex_fetch(__attributes_float3, desc.offset + 1);
 	tfm.z = kernel_tex_fetch(__attributes_float3, desc.offset + 2);
-	tfm.w = kernel_tex_fetch(__attributes_float3, desc.offset + 3);
 
 	return tfm;
 }
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index 712b67a1b55..e35267f02bf 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -16,9 +16,10 @@ CCL_NAMESPACE_BEGIN
 
 /* Curve Primitive
  *
- * Curve primitive for rendering hair and fur. These can be render as flat ribbons
- * or curves with actual thickness. The curve can also be rendered as line segments
- * rather than curves for better performance */
+ * Curve primitive for rendering hair and fur. These can be rendered as flat
+ * ribbons or curves with actual thickness. The curve can also be rendered as
+ * line segments rather than curves for better performance.
+ */
 
 #ifdef __HAIR__
 
@@ -32,22 +33,22 @@ ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 		if(dy) *dy = 0.0f;
 #endif
 
-		return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim));
+		return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim);
 	}
 	else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) {
-		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
 		int k1 = k0 + 1;
 
 		float f0 = kernel_tex_fetch(__attributes_float, desc.offset + k0);
 		float f1 = kernel_tex_fetch(__attributes_float, desc.offset + k1);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
+		if(dx) *dx = sd->du.dx*(f1 - f0);
 		if(dy) *dy = 0.0f;
 #endif
 
-		return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1;
+		return (1.0f - sd->u)*f0 + sd->u*f1;
 	}
 	else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -71,22 +72,22 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim)));
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim));
 	}
 	else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) {
-		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
 		int k1 = k0 + 1;
 
 		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k0));
 		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k1));
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
+		if(dx) *dx = sd->du.dx*(f1 - f0);
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
-		return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1;
+		return (1.0f - sd->u)*f0 + sd->u*f1;
 	}
 	else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -104,22 +105,22 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
 {
 	float r = 0.0f;
 
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
-		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
+		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
 		int k1 = k0 + 1;
 
 		float4 P_curve[2];
 
-		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
+		if(sd->type & PRIMITIVE_CURVE) {
 			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 		}
 		else {
-			motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve);
+			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
 		}
 
-		r = (P_curve[1].w - P_curve[0].w) * ccl_fetch(sd, u) + P_curve[0].w;
+		r = (P_curve[1].w - P_curve[0].w) * sd->u + P_curve[0].w;
 	}
 
 	return r*2.0f;
@@ -130,8 +131,8 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
 
 ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd)
 {
-	float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
-	int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+	float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+	int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
 	int k1 = k0 + 1;
 
 	float4 P_curve[2];
@@ -139,23 +140,23 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd
 	P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 	P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 
-	return float4_to_float3(P_curve[1]) * ccl_fetch(sd, u) + float4_to_float3(P_curve[0]) * (1.0f - ccl_fetch(sd, u));
+	return float4_to_float3(P_curve[1]) * sd->u + float4_to_float3(P_curve[0]) * (1.0f - sd->u);
 }
 
 /* Curve tangent normal */
 
 ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
-{	
+{
 	float3 tgN = make_float3(0.0f,0.0f,0.0f);
 
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
 
-		tgN = -(-ccl_fetch(sd, I) - ccl_fetch(sd, dPdu) * (dot(ccl_fetch(sd, dPdu),-ccl_fetch(sd, I)) / len_squared(ccl_fetch(sd, dPdu))));
+		tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu)));
 		tgN = normalize(tgN);
 
 		/* need to find suitable scaled gd for corrected normal */
 #if 0
-		tgN = normalize(tgN - gd * ccl_fetch(sd, dPdu));
+		tgN = normalize(tgN - gd * sd->dPdu);
 #endif
 	}
 
@@ -213,893 +214,6 @@ ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta,
 	}
 }
 
-#ifdef __KERNEL_SSE2__
-ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)
-{
-	return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2]));
-}
-#endif
-
-#ifdef __KERNEL_SSE2__
-/* Pass P and dir by reference to aligned vector */
-ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
-#else
-ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax)
-#endif
-{
-	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
-
-	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
-		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
-		if(time < prim_time.x || time > prim_time.y) {
-			return false;
-		}
-	}
-
-	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
-	float epsilon = 0.0f;
-	float r_st, r_en;
-
-	int depth = kernel_data.curve.subdivisions;
-	int flags = kernel_data.curve.curveflags;
-	int prim = kernel_tex_fetch(__prim_index, curveAddr);
-
-#ifdef __KERNEL_SSE2__
-	ssef vdir = load4f(dir);
-	ssef vcurve_coef[4];
-	const float3 *curve_coef = (float3 *)vcurve_coef;
-	
-	{
-		ssef dtmp = vdir * vdir;
-		ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp));
-		ssef rd_ss = load1f_first(1.0f) / d_ss;
-
-		ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]);
-		int2 &v00 = (int2 &)v00vec;
-
-		int k0 = v00.x + segment;
-		int k1 = k0 + 1;
-		int ka = max(k0 - 1, v00.x);
-		int kb = min(k1 + 1, v00.x + v00.y - 1);
-
-#ifdef __KERNEL_AVX2__
-		avxf P_curve_0_1, P_curve_2_3;
-		if(is_curve_primitive) {
-			P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x);
-			P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x);
-		}
-		else {
-			int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
-			motion_cardinal_curve_keys_avx(kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1,&P_curve_2_3);
-		}
-#else  /* __KERNEL_AVX2__ */
-		ssef P_curve[4];
-
-		if(is_curve_primitive) {
-			P_curve[0] = load4f(&kg->__curve_keys.data[ka].x);
-			P_curve[1] = load4f(&kg->__curve_keys.data[k0].x);
-			P_curve[2] = load4f(&kg->__curve_keys.data[k1].x);
-			P_curve[3] = load4f(&kg->__curve_keys.data[kb].x);
-		}
-		else {
-			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
-			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve);
-		}
-#endif  /* __KERNEL_AVX2__ */
-
-		ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss));
-		ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn;
-		ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy;
-		ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
-		ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
-
-		ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
-		ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0);
-		ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
-
-#ifdef __KERNEL_AVX2__
-		const avxf vPP = _mm256_broadcast_ps(&P.m128);
-		const avxf htfm00 = avxf(htfm0.m128, htfm0.m128);
-		const avxf htfm11 = avxf(htfm1.m128, htfm1.m128);
-		const avxf htfm22 = avxf(htfm2.m128, htfm2.m128);
-
-		const avxf p01 = madd(shuffle<0>(P_curve_0_1 - vPP),
-		                      htfm00,
-		                      madd(shuffle<1>(P_curve_0_1 - vPP),
-		                           htfm11,
-		                           shuffle<2>(P_curve_0_1 - vPP) * htfm22));
-		const avxf p23 = madd(shuffle<0>(P_curve_2_3 - vPP),
-		                      htfm00,
-		                      madd(shuffle<1>(P_curve_2_3 - vPP),
-		                           htfm11,
-		                           shuffle<2>(P_curve_2_3 - vPP)*htfm22));
-
-		const ssef p0 = _mm256_castps256_ps128(p01);
-		const ssef p1 = _mm256_extractf128_ps(p01, 1);
-		const ssef p2 = _mm256_castps256_ps128(p23);
-		const ssef p3 = _mm256_extractf128_ps(p23, 1);
-
-		const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1);
-		r_st = ((float4 &)P_curve_1).w;
-		const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3);
-		r_en = ((float4 &)P_curve_2).w;
-#else  /* __KERNEL_AVX2__ */
-		ssef htfm[] = { htfm0, htfm1, htfm2 };
-		ssef vP = load4f(P);
-		ssef p0 = transform_point_T3(htfm, P_curve[0] - vP);
-		ssef p1 = transform_point_T3(htfm, P_curve[1] - vP);
-		ssef p2 = transform_point_T3(htfm, P_curve[2] - vP);
-		ssef p3 = transform_point_T3(htfm, P_curve[3] - vP);
-
-		r_st = ((float4 &)P_curve[1]).w;
-		r_en = ((float4 &)P_curve[2]).w;
-#endif  /* __KERNEL_AVX2__ */
-
-		float fc = 0.71f;
-		ssef vfc = ssef(fc);
-		ssef vfcxp3 = vfc * p3;
-
-		vcurve_coef[0] = p1;
-		vcurve_coef[1] = vfc * (p2 - p0);
-		vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3)));
-		vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3));
-
-	}
-#else
-	float3 curve_coef[4];
-
-	/* curve Intersection check */
-	/* obtain curve parameters */
-	{
-		/* ray transform created - this should be created at beginning of intersection loop */
-		Transform htfm;
-		float d = sqrtf(dir.x * dir.x + dir.z * dir.z);
-		htfm = make_transform(
-			dir.z / d, 0, -dir.x /d, 0,
-			-dir.x * dir.y /d, d, -dir.y * dir.z /d, 0,
-			dir.x, dir.y, dir.z, 0,
-			0, 0, 0, 1);
-
-		float4 v00 = kernel_tex_fetch(__curves, prim);
-
-		int k0 = __float_as_int(v00.x) + segment;
-		int k1 = k0 + 1;
-
-		int ka = max(k0 - 1,__float_as_int(v00.x));
-		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
-
-		float4 P_curve[4];
-
-		if(is_curve_primitive) {
-			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
-			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
-			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
-			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
-		}
-		else {
-			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
-			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve);
-		}
-
-		float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P);
-		float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P);
-		float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P);
-		float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P);
-
-		float fc = 0.71f;
-		curve_coef[0] = p1;
-		curve_coef[1] = -fc*p0 + fc*p2;
-		curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3;
-		curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3;
-		r_st = P_curve[1].w;
-		r_en = P_curve[2].w;
-	}
-#endif
-
-	float r_curr = max(r_st, r_en);
-
-	if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING))
-		epsilon = 2 * r_curr;
-
-	/* find bounds - this is slow for cubic curves */
-	float upper, lower;
-
-	float zextrem[4];
-	curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z);
-	if(lower - r_curr > isect->t || upper + r_curr < epsilon)
-		return false;
-
-	/* minimum width extension */
-	float mw_extension = min(difl * fabsf(upper), extmax);
-	float r_ext = mw_extension + r_curr;
-
-	float xextrem[4];
-	curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x);
-	if(lower > r_ext || upper < -r_ext)
-		return false;
-
-	float yextrem[4];
-	curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y);
-	if(lower > r_ext || upper < -r_ext)
-		return false;
-
-	/* setup recurrent loop */
-	int level = 1 << depth;
-	int tree = 0;
-	float resol = 1.0f / (float)level;
-	bool hit = false;
-
-	/* begin loop */
-	while(!(tree >> (depth))) {
-		const float i_st = tree * resol;
-		const float i_en = i_st + (level * resol);
-
-#ifdef __KERNEL_SSE2__
-		ssef vi_st = ssef(i_st), vi_en = ssef(i_en);
-		ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
-		ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]);
-
-		ssef vbmin = min(vp_st, vp_en);
-		ssef vbmax = max(vp_st, vp_en);
-
-		float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax;
-		float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z;
-		float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z;
-		float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en;
-#else
-		float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0];
-		float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0];
-		
-		float bminx = min(p_st.x, p_en.x);
-		float bmaxx = max(p_st.x, p_en.x);
-		float bminy = min(p_st.y, p_en.y);
-		float bmaxy = max(p_st.y, p_en.y);
-		float bminz = min(p_st.z, p_en.z);
-		float bmaxz = max(p_st.z, p_en.z);
-#endif
-
-		if(xextrem[0] >= i_st && xextrem[0] <= i_en) {
-			bminx = min(bminx,xextrem[1]);
-			bmaxx = max(bmaxx,xextrem[1]);
-		}
-		if(xextrem[2] >= i_st && xextrem[2] <= i_en) {
-			bminx = min(bminx,xextrem[3]);
-			bmaxx = max(bmaxx,xextrem[3]);
-		}
-		if(yextrem[0] >= i_st && yextrem[0] <= i_en) {
-			bminy = min(bminy,yextrem[1]);
-			bmaxy = max(bmaxy,yextrem[1]);
-		}
-		if(yextrem[2] >= i_st && yextrem[2] <= i_en) {
-			bminy = min(bminy,yextrem[3]);
-			bmaxy = max(bmaxy,yextrem[3]);
-		}
-		if(zextrem[0] >= i_st && zextrem[0] <= i_en) {
-			bminz = min(bminz,zextrem[1]);
-			bmaxz = max(bmaxz,zextrem[1]);
-		}
-		if(zextrem[2] >= i_st && zextrem[2] <= i_en) {
-			bminz = min(bminz,zextrem[3]);
-			bmaxz = max(bmaxz,zextrem[3]);
-		}
-
-		float r1 = r_st + (r_en - r_st) * i_st;
-		float r2 = r_st + (r_en - r_st) * i_en;
-		r_curr = max(r1, r2);
-
-		mw_extension = min(difl * fabsf(bmaxz), extmax);
-		float r_ext = mw_extension + r_curr;
-		float coverage = 1.0f;
-
-		if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
-			/* the bounding box does not overlap the square centered at O */
-			tree += level;
-			level = tree & -tree;
-		}
-		else if(level == 1) {
-
-			/* the maximum recursion depth is reached.
-			 * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0.
-			 * dP* is reversed if necessary.*/
-			float t = isect->t;
-			float u = 0.0f;
-			float gd = 0.0f;
-
-			if(flags & CURVE_KN_RIBBONS) {
-				float3 tg = (p_en - p_st);
-#ifdef __KERNEL_SSE__
-				const float3 tg_sq = tg * tg;
-				float w = tg_sq.x + tg_sq.y;
-#else
-				float w = tg.x * tg.x + tg.y * tg.y;
-#endif
-				if(w == 0) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-#ifdef __KERNEL_SSE__
-				const float3 p_sttg = p_st * tg;
-				w = -(p_sttg.x + p_sttg.y) / w;
-#else
-				w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
-#endif
-				w = saturate(w);
-
-				/* compute u on the curve segment */
-				u = i_st * (1 - w) + i_en * w;
-				r_curr = r_st + (r_en - r_st) * u;
-				/* compare x-y distances */
-				float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0];
-
-				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-				if(dot(tg, dp_st)< 0)
-					dp_st *= -1;
-				if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-				if(dot(tg, dp_en) < 0)
-					dp_en *= -1;
-				if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-
-				/* compute coverage */
-				float r_ext = r_curr;
-				coverage = 1.0f;
-				if(difl != 0.0f) {
-					mw_extension = min(difl * fabsf(bmaxz), extmax);
-					r_ext = mw_extension + r_curr;
-#ifdef __KERNEL_SSE__
-					const float3 p_curr_sq = p_curr * p_curr;
-					const float3 dxxx = _mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128));
-					float d = dxxx.x;
-#else
-					float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y);
-#endif
-					float d0 = d - r_curr;
-					float d1 = d + r_curr;
-					float inv_mw_extension = 1.0f/mw_extension;
-					if(d0 >= 0)
-						coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f;
-					else // inside
-						coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f;
-				}
-				
-				if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-
-				t = p_curr.z;
-
-				/* stochastic fade from minimum width */
-				if(difl != 0.0f && lcg_state) {
-					if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
-						return hit;
-				}
-			}
-			else {
-				float l = len(p_en - p_st);
-				/* minimum width extension */
-				float or1 = r1;
-				float or2 = r2;
-
-				if(difl != 0.0f) {
-					mw_extension = min(len(p_st - P) * difl, extmax);
-					or1 = r1 < mw_extension ? mw_extension : r1;
-					mw_extension = min(len(p_en - P) * difl, extmax);
-					or2 = r2 < mw_extension ? mw_extension : r2;
-				}
-				/* --- */
-				float invl = 1.0f/l;
-				float3 tg = (p_en - p_st) * invl;
-				gd = (or2 - or1) * invl;
-				float difz = -dot(p_st,tg);
-				float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd));
-				float invcyla = 1.0f/cyla;
-				float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1)));
-				float tcentre = -halfb*invcyla;
-				float zcentre = difz + (tg.z * tcentre);
-				float3 tdif = - p_st;
-				tdif.z += tcentre;
-				float tdifz = dot(tdif,tg);
-				float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1)));
-				float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd;
-				float td = tb*tb - 4*cyla*tc;
-				if(td < 0.0f) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-				
-				float rootd = sqrtf(td);
-				float correction = (-tb - rootd) * 0.5f * invcyla;
-				t = tcentre + correction;
-
-				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-				if(dot(tg, dp_st)< 0)
-					dp_st *= -1;
-				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-				if(dot(tg, dp_en) < 0)
-					dp_en *= -1;
-
-				if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) {
-					correction = (-tb + rootd) * 0.5f * invcyla;
-					t = tcentre + correction;
-				}			
-
-				if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-
-				float w = (zcentre + (tg.z * correction)) * invl;
-				w = saturate(w);
-				/* compute u on the curve segment */
-				u = i_st * (1 - w) + i_en * w;
-
-				/* stochastic fade from minimum width */
-				if(difl != 0.0f && lcg_state) {
-					r_curr = r1 + (r2 - r1) * w;
-					r_ext = or1 + (or2 - or1) * w;
-					coverage = r_curr/r_ext;
-
-					if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
-						return hit;
-				}
-			}
-			/* we found a new intersection */
-
-#ifdef __VISIBILITY_FLAG__
-			/* visibility flag test. we do it here under the assumption
-			 * that most triangles are culled by node flags */
-			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
-#endif
-			{
-				/* record intersection */
-				isect->t = t;
-				isect->u = u;
-				isect->v = gd;
-				isect->prim = curveAddr;
-				isect->object = object;
-				isect->type = type;
-				hit = true;
-			}
-			
-			tree++;
-			level = tree & -tree;
-		}
-		else {
-			/* split the curve into two curves and process */
-			level = level >> 1;
-		}
-	}
-
-	return hit;
-}
-
-ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
-{
-	/* define few macros to minimize code duplication for SSE */
-#ifndef __KERNEL_SSE2__
-#  define len3_squared(x) len_squared(x)
-#  define len3(x) len(x)
-#  define dot3(x, y) dot(x, y)
-#endif
-
-	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
-
-	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
-		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
-		if(time < prim_time.x || time > prim_time.y) {
-			return false;
-		}
-	}
-
-	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
-	/* curve Intersection check */
-	int flags = kernel_data.curve.curveflags;
-
-	int prim = kernel_tex_fetch(__prim_index, curveAddr);
-	float4 v00 = kernel_tex_fetch(__curves, prim);
-
-	int cnum = __float_as_int(v00.x);
-	int k0 = cnum + segment;
-	int k1 = k0 + 1;
-
-#ifndef __KERNEL_SSE2__
-	float4 P_curve[2];
-
-	if(is_curve_primitive) {
-		P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
-		P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
-	}
-	else {
-		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
-		motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve);
-	}
-
-	float or1 = P_curve[0].w;
-	float or2 = P_curve[1].w;
-	float3 p1 = float4_to_float3(P_curve[0]);
-	float3 p2 = float4_to_float3(P_curve[1]);
-
-	/* minimum width extension */
-	float r1 = or1;
-	float r2 = or2;
-	float3 dif = P - p1;
-	float3 dif_second = P - p2;
-	if(difl != 0.0f) {
-		float pixelsize = min(len3(dif) * difl, extmax);
-		r1 = or1 < pixelsize ? pixelsize : or1;
-		pixelsize = min(len3(dif_second) * difl, extmax);
-		r2 = or2 < pixelsize ? pixelsize : or2;
-	}
-	/* --- */
-
-	float3 p21_diff = p2 - p1;
-	float3 sphere_dif1 = (dif + dif_second) * 0.5f;
-	float3 dir = direction;
-	float sphere_b_tmp = dot3(dir, sphere_dif1);
-	float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
-#else
-	ssef P_curve[2];
-	
-	if(is_curve_primitive) {
-		P_curve[0] = load4f(&kg->__curve_keys.data[k0].x);
-		P_curve[1] = load4f(&kg->__curve_keys.data[k1].x);
-	}
-	else {
-		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
-		motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve);
-	}
-
-	const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]);
-
-	ssef r12 = or12;
-	const ssef vP = load4f(P);
-	const ssef dif = vP - P_curve[0];
-	const ssef dif_second = vP - P_curve[1];
-	if(difl != 0.0f) {
-		const ssef len1_sq = len3_squared_splat(dif);
-		const ssef len2_sq = len3_squared_splat(dif_second);
-		const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq));
-		const ssef pixelsize12 = min(len12 * difl, ssef(extmax));
-		r12 = max(or12, pixelsize12);
-	}
-	float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12));
-	float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12));
-
-	const ssef p21_diff = P_curve[1] - P_curve[0];
-	const ssef sphere_dif1 = (dif + dif_second) * 0.5f;
-	const ssef dir = load4f(direction);
-	const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1);
-	const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1);
-#endif
-
-	float mr = max(r1, r2);
-	float l = len3(p21_diff);
-	float invl = 1.0f / l;
-	float sp_r = mr + 0.5f * l;
-
-	float sphere_b = dot3(dir, sphere_dif2);
-	float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r;
-
-	if(sdisc < 0.0f)
-		return false;
-
-	/* obtain parameters and test midpoint distance for suitable modes */
-#ifndef __KERNEL_SSE2__
-	float3 tg = p21_diff * invl;
-#else
-	const ssef tg = p21_diff * invl;
-#endif
-	float gd = (r2 - r1) * invl;
-
-	float dirz = dot3(dir, tg);
-	float difz = dot3(dif, tg);
-
-	float a = 1.0f - (dirz*dirz*(1 + gd*gd));
-
-	float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1));
-
-	float tcentre = -halfb/a;
-	float zcentre = difz + (dirz * tcentre);
-
-	if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
-		return false;
-	if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION))
-		return false;
-
-	/* test minimum separation */
-#ifndef __KERNEL_SSE2__
-	float3 cprod = cross(tg, dir);
-	float cprod2sq = len3_squared(cross(tg, dif));
-#else
-	const ssef cprod = cross(tg, dir);
-	float cprod2sq = len3_squared(cross_zxy(tg, dif));
-#endif
-	float cprodsq = len3_squared(cprod);
-	float distscaled = dot3(cprod, dif);
-
-	if(cprodsq == 0)
-		distscaled = cprod2sq;
-	else
-		distscaled = (distscaled*distscaled)/cprodsq;
-
-	if(distscaled > mr*mr)
-		return false;
-
-	/* calculate true intersection */
-#ifndef __KERNEL_SSE2__
-	float3 tdif = dif + tcentre * dir;
-#else
-	const ssef tdif = madd(ssef(tcentre), dir, dif);
-#endif
-	float tdifz = dot3(tdif, tg);
-	float tdifma = tdifz*gd + r1;
-	float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma));
-	float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma;
-	float td = tb*tb - 4*a*tc;
-
-	if(td < 0.0f)
-		return false;
-
-	float rootd = 0.0f;
-	float correction = 0.0f;
-	if(flags & CURVE_KN_ACCURATE) {
-		rootd = sqrtf(td);
-		correction = ((-tb - rootd)/(2*a));
-	}
-
-	float t = tcentre + correction;
-
-	if(t < isect->t) {
-
-		if(flags & CURVE_KN_INTERSECTCORRECTION) {
-			rootd = sqrtf(td);
-			correction = ((-tb - rootd)/(2*a));
-			t = tcentre + correction;
-		}
-
-		float z = zcentre + (dirz * correction);
-		// bool backface = false;
-
-		if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) {
-			// backface = true;
-			correction = ((-tb + rootd)/(2*a));
-			t = tcentre + correction;
-			z = zcentre + (dirz * correction);
-		}
-
-		/* stochastic fade from minimum width */
-		float adjradius = or1 + z * (or2 - or1) * invl;
-		adjradius = adjradius / (r1 + z * gd);
-		if(lcg_state && adjradius != 1.0f) {
-			if(lcg_step_float(lcg_state) > adjradius)
-				return false;
-		}
-		/* --- */
-
-		if(t > 0.0f && t < isect->t && z >= 0 && z <= l) {
-
-			if(flags & CURVE_KN_ENCLOSEFILTER) {
-				float enc_ratio = 1.01f;
-				if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) {
-					float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
-					float c2 = dot3(dif, dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
-					if(a2*c2 < 0.0f)
-						return false;
-				}
-			}
-
-#ifdef __VISIBILITY_FLAG__
-			/* visibility flag test. we do it here under the assumption
-			 * that most triangles are culled by node flags */
-			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
-#endif
-			{
-				/* record intersection */
-				isect->t = t;
-				isect->u = z*invl;
-				isect->v = gd;
-				isect->prim = curveAddr;
-				isect->object = object;
-				isect->type = type;
-
-				return true;
-			}
-		}
-	}
-
-	return false;
-
-#ifndef __KERNEL_SSE2__
-#  undef len3_squared
-#  undef len3
-#  undef dot3
-#endif
-}
-
-ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)
-{
-	float fc = 0.71f;
-	float data[4];
-	float t2 = t * t;
-	data[0] = -3.0f * fc          * t2  + 4.0f * fc * t                  - fc;
-	data[1] =  3.0f * (2.0f - fc) * t2  + 2.0f * (fc - 3.0f) * t;
-	data[2] =  3.0f * (fc - 2.0f) * t2  + 2.0f * (3.0f - 2.0f * fc) * t  + fc;
-	data[3] =  3.0f * fc          * t2  - 2.0f * fc * t;
-	return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
-}
-
-ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3)
-{
-	float data[4];
-	float fc = 0.71f;
-	float t2 = t * t;
-	float t3 = t2 * t;
-	data[0] = -fc          * t3  + 2.0f * fc          * t2 - fc * t;
-	data[1] =  (2.0f - fc) * t3  + (fc - 3.0f)        * t2 + 1.0f;
-	data[2] =  (fc - 2.0f) * t3  + (3.0f - 2.0f * fc) * t2 + fc * t;
-	data[3] =  fc          * t3  - fc * t2;
-	return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
-}
-
-ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
-{
-	int flag = kernel_data.curve.curveflags;
-	float t = isect->t;
-	float3 P = ray->P;
-	float3 D = ray->D;
-
-	if(isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D*t);
-		D = normalize_len(D, &t);
-	}
-
-	int prim = kernel_tex_fetch(__prim_index, isect->prim);
-	float4 v00 = kernel_tex_fetch(__curves, prim);
-
-	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
-	int k1 = k0 + 1;
-
-	float3 tg;
-
-	if(flag & CURVE_KN_INTERPOLATE) {
-		int ka = max(k0 - 1,__float_as_int(v00.x));
-		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
-
-		float4 P_curve[4];
-
-		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
-			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
-			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
-			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
-			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
-		}
-		else {
-			motion_cardinal_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), ka, k0, k1, kb, P_curve);
-		}
-
-		float3 p[4];
-		p[0] = float4_to_float3(P_curve[0]);
-		p[1] = float4_to_float3(P_curve[1]);
-		p[2] = float4_to_float3(P_curve[2]);
-		p[3] = float4_to_float3(P_curve[3]);
-
-		P = P + D*t;
-
-#ifdef __UV__
-		ccl_fetch(sd, u) = isect->u;
-		ccl_fetch(sd, v) = 0.0f;
-#endif
-
-		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
-
-		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
-			ccl_fetch(sd, Ng) = normalize(-(D - tg * (dot(tg, D))));
-		}
-		else {
-			/* direction from inside to surface of curve */
-			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);	
-			ccl_fetch(sd, Ng) = normalize(P - p_curr);
-
-			/* adjustment for changing radius */
-			float gd = isect->v;
-
-			if(gd != 0.0f) {
-				ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg;
-				ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
-			}
-		}
-
-		/* todo: sometimes the normal is still so that this is detected as
-		 * backfacing even if cull backfaces is enabled */
-
-		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
-	}
-	else {
-		float4 P_curve[2];
-
-		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
-			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
-			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
-		}
-		else {
-			motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve);
-		}
-
-		float l = 1.0f;
-		tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l);
-		
-		P = P + D*t;
-
-		float3 dif = P - float4_to_float3(P_curve[0]);
-
-#ifdef __UV__
-		ccl_fetch(sd, u) = dot(dif,tg)/l;
-		ccl_fetch(sd, v) = 0.0f;
-#endif
-
-		if(flag & CURVE_KN_TRUETANGENTGNORMAL) {
-			ccl_fetch(sd, Ng) = -(D - tg * dot(tg, D));
-			ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
-		}
-		else {
-			float gd = isect->v;
-
-			/* direction from inside to surface of curve */
-			ccl_fetch(sd, Ng) = (dif - tg * ccl_fetch(sd, u) * l) / (P_curve[0].w + ccl_fetch(sd, u) * l * gd);
-
-			/* adjustment for changing radius */
-			if(gd != 0.0f) {
-				ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg;
-				ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
-			}
-		}
-
-		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
-	}
-
-#ifdef __DPDU__
-	/* dPdu/dPdv */
-	ccl_fetch(sd, dPdu) = tg;
-	ccl_fetch(sd, dPdv) = cross(tg, ccl_fetch(sd, Ng));
-#endif
-
-	if(isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-}
-
-#endif
+#endif  /* __HAIR__ */
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h
new file mode 100644
index 00000000000..46c3f408f0b
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_curve_intersect.h
@@ -0,0 +1,927 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Curve primitive intersection functions. */
+
+#ifdef __HAIR__
+
+#ifdef __KERNEL_SSE2__
+ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)
+{  /* NOTE(review): presumably returns a.x*t[0] + a.y*t[1] + a.z*t[2], assuming shuffle<i>(a) splats lane i and madd(x, y, z) is x*y + z -- confirm. */
+	return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2]));
+}
+#endif
+
+/* Intersects the ray (origin P, direction dir) with one cardinal curve segment; returns true and fills *isect if a hit was recorded. On CPU, P and dir are passed by reference to aligned vectors. */
+ccl_device_forceinline bool cardinal_curve_intersect(
+        KernelGlobals *kg,
+        Intersection *isect,
+        const float3 ccl_ref P,
+        const float3 ccl_ref dir,
+        uint visibility,
+        int object,
+        int curveAddr,
+        float time,
+        int type,
+        uint *lcg_state,
+        float difl,
+        float extmax)
+{
+	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
+
+	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
+		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
+		if(time < prim_time.x || time > prim_time.y) {
+			return false;  /* ray time lies outside [prim_time.x, prim_time.y] */
+		}
+	}
+
+	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
+	float epsilon = 0.0f;
+	float r_st, r_en;
+
+	int depth = kernel_data.curve.subdivisions;
+	int flags = kernel_data.curve.curveflags;
+	int prim = kernel_tex_fetch(__prim_index, curveAddr);
+
+#ifdef __KERNEL_SSE2__
+	ssef vdir = load4f(dir);
+	ssef vcurve_coef[4];
+	const float3 *curve_coef = (float3 *)vcurve_coef;
+
+	{
+		ssef dtmp = vdir * vdir;
+		ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp));
+		ssef rd_ss = load1f_first(1.0f) / d_ss;
+
+		ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]);
+		int2 &v00 = (int2 &)v00vec;
+
+		int k0 = v00.x + segment;
+		int k1 = k0 + 1;
+		int ka = max(k0 - 1, v00.x);
+		int kb = min(k1 + 1, v00.x + v00.y - 1);
+
+#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
+		avxf P_curve_0_1, P_curve_2_3;
+		if(is_curve_primitive) {
+			P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x);
+			P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x);
+		}
+		else {
+			int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
+			motion_cardinal_curve_keys_avx(kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1,&P_curve_2_3);
+		}
+#else  /* __KERNEL_AVX2__ */
+		ssef P_curve[4];
+
+		if(is_curve_primitive) {
+			P_curve[0] = load4f(&kg->__curve_keys.data[ka].x);
+			P_curve[1] = load4f(&kg->__curve_keys.data[k0].x);
+			P_curve[2] = load4f(&kg->__curve_keys.data[k1].x);
+			P_curve[3] = load4f(&kg->__curve_keys.data[kb].x);
+		}
+		else {
+			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve);
+		}
+#endif  /* __KERNEL_AVX2__ */
+
+		ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss));
+		ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn;
+		ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy;
+		ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
+		ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
+
+		ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
+		ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0);
+		ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
+
+#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
+		const avxf vPP = _mm256_broadcast_ps(&P.m128);
+		const avxf htfm00 = avxf(htfm0.m128, htfm0.m128);
+		const avxf htfm11 = avxf(htfm1.m128, htfm1.m128);
+		const avxf htfm22 = avxf(htfm2.m128, htfm2.m128);
+
+		const avxf p01 = madd(shuffle<0>(P_curve_0_1 - vPP),
+		                      htfm00,
+		                      madd(shuffle<1>(P_curve_0_1 - vPP),
+		                           htfm11,
+		                           shuffle<2>(P_curve_0_1 - vPP) * htfm22));
+		const avxf p23 = madd(shuffle<0>(P_curve_2_3 - vPP),
+		                      htfm00,
+		                      madd(shuffle<1>(P_curve_2_3 - vPP),
+		                           htfm11,
+		                           shuffle<2>(P_curve_2_3 - vPP)*htfm22));
+
+		const ssef p0 = _mm256_castps256_ps128(p01);
+		const ssef p1 = _mm256_extractf128_ps(p01, 1);
+		const ssef p2 = _mm256_castps256_ps128(p23);
+		const ssef p3 = _mm256_extractf128_ps(p23, 1);
+
+		const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1);
+		r_st = ((float4 &)P_curve_1).w;
+		const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3);
+		r_en = ((float4 &)P_curve_2).w;
+#else  /* __KERNEL_AVX2__ */
+		ssef htfm[] = { htfm0, htfm1, htfm2 };
+		ssef vP = load4f(P);
+		ssef p0 = transform_point_T3(htfm, P_curve[0] - vP);
+		ssef p1 = transform_point_T3(htfm, P_curve[1] - vP);
+		ssef p2 = transform_point_T3(htfm, P_curve[2] - vP);
+		ssef p3 = transform_point_T3(htfm, P_curve[3] - vP);
+
+		r_st = ((float4 &)P_curve[1]).w;
+		r_en = ((float4 &)P_curve[2]).w;
+#endif  /* __KERNEL_AVX2__ */
+
+		float fc = 0.71f;  /* scales the start tangent: vcurve_coef[1] = fc * (p2 - p0) */
+		ssef vfc = ssef(fc);
+		ssef vfcxp3 = vfc * p3;
+
+		vcurve_coef[0] = p1;
+		vcurve_coef[1] = vfc * (p2 - p0);
+		vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3)));
+		vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3));
+
+	}
+#else  /* __KERNEL_SSE2__ */
+	float3 curve_coef[4];
+
+	/* Scalar path of the curve intersection check: */
+	/* obtain the cubic coefficients and the radii at both segment ends. */
+	{
+		/* Ray transform, built from dir only. TODO: this should be created once at the beginning of the intersection loop. */
+		Transform htfm;
+		float d = sqrtf(dir.x * dir.x + dir.z * dir.z);
+		htfm = make_transform(
+			dir.z / d, 0, -dir.x /d, 0,
+			-dir.x * dir.y /d, d, -dir.y * dir.z /d, 0,
+			dir.x, dir.y, dir.z, 0);
+
+		float4 v00 = kernel_tex_fetch(__curves, prim);
+
+		int k0 = __float_as_int(v00.x) + segment;
+		int k1 = k0 + 1;
+
+		int ka = max(k0 - 1,__float_as_int(v00.x));
+		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
+
+		float4 P_curve[4];
+
+		if(is_curve_primitive) {
+			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
+			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
+			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
+			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
+		}
+		else {
+			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve);
+		}
+
+		float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P);
+		float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P);
+		float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P);
+		float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P);
+
+		float fc = 0.71f;  /* scales the start tangent: curve_coef[1] = fc * (p2 - p0) */
+		curve_coef[0] = p1;
+		curve_coef[1] = -fc*p0 + fc*p2;
+		curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3;
+		curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3;
+		r_st = P_curve[1].w;
+		r_en = P_curve[2].w;
+	}
+#endif  /* __KERNEL_SSE2__ */
+
+	float r_curr = max(r_st, r_en);
+
+	if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING))
+		epsilon = 2 * r_curr;  /* near cutoff used by the depth tests below; stays 0.0f otherwise */
+
+	/* Find the bounds of the whole segment per axis - this is slow for cubic curves. */
+	float upper, lower;
+
+	float zextrem[4];
+	curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z);
+	if(lower - r_curr > isect->t || upper + r_curr < epsilon)
+		return false;
+
+	/* minimum width extension */
+	float mw_extension = min(difl * fabsf(upper), extmax);
+	float r_ext = mw_extension + r_curr;
+
+	float xextrem[4];
+	curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x);
+	if(lower > r_ext || upper < -r_ext)
+		return false;
+
+	float yextrem[4];
+	curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y);
+	if(lower > r_ext || upper < -r_ext)
+		return false;
+
+	/* Set up the iterative subdivision: each step covers the parameter interval [tree, tree + level] * resol. */
+	int level = 1 << depth;
+	int tree = 0;
+	float resol = 1.0f / (float)level;
+	bool hit = false;
+
+	/* Loop until tree reaches 1 << depth. */
+	while(!(tree >> (depth))) {
+		const float i_st = tree * resol;
+		const float i_en = i_st + (level * resol);
+
+#ifdef __KERNEL_SSE2__
+		ssef vi_st = ssef(i_st), vi_en = ssef(i_en);
+		ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
+		ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]);
+
+		ssef vbmin = min(vp_st, vp_en);
+		ssef vbmax = max(vp_st, vp_en);
+
+		float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax;
+		float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z;
+		float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z;
+		float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en;
+#else  /* __KERNEL_SSE2__ */
+		float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0];
+		float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0];
+
+		float bminx = min(p_st.x, p_en.x);
+		float bmaxx = max(p_st.x, p_en.x);
+		float bminy = min(p_st.y, p_en.y);
+		float bmaxy = max(p_st.y, p_en.y);
+		float bminz = min(p_st.z, p_en.z);
+		float bmaxz = max(p_st.z, p_en.z);
+#endif  /* __KERNEL_SSE2__ */
+
+		if(xextrem[0] >= i_st && xextrem[0] <= i_en) {
+			bminx = min(bminx,xextrem[1]);
+			bmaxx = max(bmaxx,xextrem[1]);
+		}
+		if(xextrem[2] >= i_st && xextrem[2] <= i_en) {
+			bminx = min(bminx,xextrem[3]);
+			bmaxx = max(bmaxx,xextrem[3]);
+		}
+		if(yextrem[0] >= i_st && yextrem[0] <= i_en) {
+			bminy = min(bminy,yextrem[1]);
+			bmaxy = max(bmaxy,yextrem[1]);
+		}
+		if(yextrem[2] >= i_st && yextrem[2] <= i_en) {
+			bminy = min(bminy,yextrem[3]);
+			bmaxy = max(bmaxy,yextrem[3]);
+		}
+		if(zextrem[0] >= i_st && zextrem[0] <= i_en) {
+			bminz = min(bminz,zextrem[1]);
+			bmaxz = max(bmaxz,zextrem[1]);
+		}
+		if(zextrem[2] >= i_st && zextrem[2] <= i_en) {
+			bminz = min(bminz,zextrem[3]);
+			bmaxz = max(bmaxz,zextrem[3]);
+		}
+
+		float r1 = r_st + (r_en - r_st) * i_st;
+		float r2 = r_st + (r_en - r_st) * i_en;
+		r_curr = max(r1, r2);
+
+		mw_extension = min(difl * fabsf(bmaxz), extmax);
+		float r_ext = mw_extension + r_curr;
+		float coverage = 1.0f;
+
+		if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
+			/* The bounding box does not overlap the square centered at O: skip this interval. */
+			tree += level;
+			level = tree & -tree;  /* lowest set bit of tree becomes the next interval width */
+		}
+		else if(level == 1) {
+
+			/* the maximum recursion depth is reached.
+			 * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0.
+			 * dP* is reversed if necessary.*/
+			float t = isect->t;
+			float u = 0.0f;
+			float gd = 0.0f;
+
+			if(flags & CURVE_KN_RIBBONS) {
+				float3 tg = (p_en - p_st);
+#ifdef __KERNEL_SSE__
+				const float3 tg_sq = tg * tg;
+				float w = tg_sq.x + tg_sq.y;
+#else
+				float w = tg.x * tg.x + tg.y * tg.y;
+#endif
+				if(w == 0) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+#ifdef __KERNEL_SSE__
+				const float3 p_sttg = p_st * tg;
+				w = -(p_sttg.x + p_sttg.y) / w;
+#else
+				w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
+#endif
+				w = saturate(w);
+
+				/* compute u on the curve segment */
+				u = i_st * (1 - w) + i_en * w;
+				r_curr = r_st + (r_en - r_st) * u;
+				/* compare x-y distances */
+				float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0];
+
+				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
+				if(dot(tg, dp_st)< 0)
+					dp_st *= -1;
+				if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
+				if(dot(tg, dp_en) < 0)
+					dp_en *= -1;
+				if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				/* compute coverage */
+				float r_ext = r_curr;
+				coverage = 1.0f;
+				if(difl != 0.0f) {
+					mw_extension = min(difl * fabsf(bmaxz), extmax);
+					r_ext = mw_extension + r_curr;
+#ifdef __KERNEL_SSE__
+					const float3 p_curr_sq = p_curr * p_curr;
+					const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128)));
+					float d = dxxx.x;
+#else
+					float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y);
+#endif
+					float d0 = d - r_curr;
+					float d1 = d + r_curr;
+					float inv_mw_extension = 1.0f/mw_extension;
+					if(d0 >= 0)
+						coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f;
+					else // inside
+						coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f;
+				}
+
+				if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				t = p_curr.z;
+
+				/* stochastic fade from minimum width */
+				if(difl != 0.0f && lcg_state) {
+					if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
+						return hit;  /* stops the loop; reports whatever hit was recorded so far */
+				}
+			}
+			else {
+				float l = len(p_en - p_st);
+				/* minimum width extension */
+				float or1 = r1;
+				float or2 = r2;
+
+				if(difl != 0.0f) {
+					mw_extension = min(len(p_st - P) * difl, extmax);
+					or1 = r1 < mw_extension ? mw_extension : r1;
+					mw_extension = min(len(p_en - P) * difl, extmax);
+					or2 = r2 < mw_extension ? mw_extension : r2;
+				}
+				/* --- */
+				float invl = 1.0f/l;
+				float3 tg = (p_en - p_st) * invl;
+				gd = (or2 - or1) * invl;
+				float difz = -dot(p_st,tg);
+				float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd));
+				float invcyla = 1.0f/cyla;
+				float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1)));
+				float tcentre = -halfb*invcyla;
+				float zcentre = difz + (tg.z * tcentre);
+				float3 tdif = - p_st;
+				tdif.z += tcentre;
+				float tdifz = dot(tdif,tg);
+				float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1)));
+				float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd;
+				float td = tb*tb - 4*cyla*tc;
+				if(td < 0.0f) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				float rootd = sqrtf(td);
+				float correction = (-tb - rootd) * 0.5f * invcyla;
+				t = tcentre + correction;
+
+				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
+				if(dot(tg, dp_st)< 0)
+					dp_st *= -1;
+				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
+				if(dot(tg, dp_en) < 0)
+					dp_en *= -1;
+
+				if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) {
+					correction = (-tb + rootd) * 0.5f * invcyla;  /* retry with the other quadratic root */
+					t = tcentre + correction;
+				}
+
+				if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				float w = (zcentre + (tg.z * correction)) * invl;
+				w = saturate(w);
+				/* compute u on the curve segment */
+				u = i_st * (1 - w) + i_en * w;
+
+				/* stochastic fade from minimum width */
+				if(difl != 0.0f && lcg_state) {
+					r_curr = r1 + (r2 - r1) * w;
+					r_ext = or1 + (or2 - or1) * w;
+					coverage = r_curr/r_ext;
+
+					if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
+						return hit;  /* stops the loop; reports whatever hit was recorded so far */
+				}
+			}
+			/* we found a new intersection */
+
+#ifdef __VISIBILITY_FLAG__
+			/* Visibility flag test. We do it here under the assumption
+			 * that most primitives are culled by node flags. */
+			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
+#endif
+			{
+				/* record intersection */
+				isect->t = t;
+				isect->u = u;
+				isect->v = gd;
+				isect->prim = curveAddr;
+				isect->object = object;
+				isect->type = type;
+				hit = true;
+			}
+
+			tree++;
+			level = tree & -tree;
+		}
+		else {
+			/* Halve the interval; its first half is processed on the next iteration. */
+			level = level >> 1;
+		}
+	}
+
+	return hit;
+}
+
+ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
+                                            Intersection *isect,
+                                            float3 P,
+                                            float3 direction,
+                                            uint visibility,
+                                            int object,
+                                            int curveAddr,
+                                            float time,
+                                            int type,
+                                            uint *lcg_state,
+                                            float difl,
+                                            float extmax)
+{
+	/* define few macros to minimize code duplication for SSE */
+#ifndef __KERNEL_SSE2__
+#  define len3_squared(x) len_squared(x)
+#  define len3(x) len(x)
+#  define dot3(x, y) dot(x, y)
+#endif
+
+	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
+
+	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
+		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
+		if(time < prim_time.x || time > prim_time.y) {
+			return false;
+		}
+	}
+
+	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
+	/* curve Intersection check */
+	int flags = kernel_data.curve.curveflags;
+
+	int prim = kernel_tex_fetch(__prim_index, curveAddr);
+	float4 v00 = kernel_tex_fetch(__curves, prim);
+
+	int cnum = __float_as_int(v00.x);
+	int k0 = cnum + segment;
+	int k1 = k0 + 1;
+
+#ifndef __KERNEL_SSE2__
+	float4 P_curve[2];
+
+	if(is_curve_primitive) {
+		P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
+		P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
+	}
+	else {
+		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+		motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve);
+	}
+
+	float or1 = P_curve[0].w;
+	float or2 = P_curve[1].w;
+	float3 p1 = float4_to_float3(P_curve[0]);
+	float3 p2 = float4_to_float3(P_curve[1]);
+
+	/* minimum width extension */
+	float r1 = or1;
+	float r2 = or2;
+	float3 dif = P - p1;
+	float3 dif_second = P - p2;
+	if(difl != 0.0f) {
+		float pixelsize = min(len3(dif) * difl, extmax);
+		r1 = or1 < pixelsize ? pixelsize : or1;
+		pixelsize = min(len3(dif_second) * difl, extmax);
+		r2 = or2 < pixelsize ? pixelsize : or2;
+	}
+	/* --- */
+
+	float3 p21_diff = p2 - p1;
+	float3 sphere_dif1 = (dif + dif_second) * 0.5f;
+	float3 dir = direction;
+	float sphere_b_tmp = dot3(dir, sphere_dif1);
+	float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
+#else
+	ssef P_curve[2];
+
+	if(is_curve_primitive) {
+		P_curve[0] = load4f(&kg->__curve_keys.data[k0].x);
+		P_curve[1] = load4f(&kg->__curve_keys.data[k1].x);
+	}
+	else {
+		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+		motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve);
+	}
+
+	const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]);
+
+	ssef r12 = or12;
+	const ssef vP = load4f(P);
+	const ssef dif = vP - P_curve[0];
+	const ssef dif_second = vP - P_curve[1];
+	if(difl != 0.0f) {
+		const ssef len1_sq = len3_squared_splat(dif);
+		const ssef len2_sq = len3_squared_splat(dif_second);
+		const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq));
+		const ssef pixelsize12 = min(len12 * difl, ssef(extmax));
+		r12 = max(or12, pixelsize12);
+	}
+	float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12));
+	float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12));
+
+	const ssef p21_diff = P_curve[1] - P_curve[0];
+	const ssef sphere_dif1 = (dif + dif_second) * 0.5f;
+	const ssef dir = load4f(direction);
+	const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1);
+	const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1);
+#endif
+
+	float mr = max(r1, r2);
+	float l = len3(p21_diff);
+	float invl = 1.0f / l;
+	float sp_r = mr + 0.5f * l;
+
+	float sphere_b = dot3(dir, sphere_dif2);
+	float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r;
+
+	if(sdisc < 0.0f)
+		return false;
+
+	/* obtain parameters and test midpoint distance for suitable modes */
+#ifndef __KERNEL_SSE2__
+	float3 tg = p21_diff * invl;
+#else
+	const ssef tg = p21_diff * invl;
+#endif
+	float gd = (r2 - r1) * invl;
+
+	float dirz = dot3(dir, tg);
+	float difz = dot3(dif, tg);
+
+	float a = 1.0f - (dirz*dirz*(1 + gd*gd));
+
+	float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1));
+
+	float tcentre = -halfb/a;
+	float zcentre = difz + (dirz * tcentre);
+
+	if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
+		return false;
+	if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION))
+		return false;
+
+	/* test minimum separation */
+#ifndef __KERNEL_SSE2__
+	float3 cprod = cross(tg, dir);
+	float cprod2sq = len3_squared(cross(tg, dif));
+#else
+	const ssef cprod = cross(tg, dir);
+	float cprod2sq = len3_squared(cross_zxy(tg, dif));
+#endif
+	float cprodsq = len3_squared(cprod);
+	float distscaled = dot3(cprod, dif);
+
+	if(cprodsq == 0)
+		distscaled = cprod2sq;
+	else
+		distscaled = (distscaled*distscaled)/cprodsq;
+
+	if(distscaled > mr*mr)
+		return false;
+
+	/* calculate true intersection */
+#ifndef __KERNEL_SSE2__
+	float3 tdif = dif + tcentre * dir;
+#else
+	const ssef tdif = madd(ssef(tcentre), dir, dif);
+#endif
+	float tdifz = dot3(tdif, tg);
+	float tdifma = tdifz*gd + r1;
+	float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma));
+	float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma;
+	float td = tb*tb - 4*a*tc;
+
+	if(td < 0.0f)
+		return false;
+
+	float rootd = 0.0f;
+	float correction = 0.0f;
+	if(flags & CURVE_KN_ACCURATE) {
+		rootd = sqrtf(td);
+		correction = ((-tb - rootd)/(2*a));
+	}
+
+	float t = tcentre + correction;
+
+	if(t < isect->t) {
+
+		if(flags & CURVE_KN_INTERSECTCORRECTION) {
+			rootd = sqrtf(td);
+			correction = ((-tb - rootd)/(2*a));
+			t = tcentre + correction;
+		}
+
+		float z = zcentre + (dirz * correction);
+		// bool backface = false;
+
+		if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) {
+			// backface = true;
+			correction = ((-tb + rootd)/(2*a));
+			t = tcentre + correction;
+			z = zcentre + (dirz * correction);
+		}
+
+		/* stochastic fade from minimum width */
+		float adjradius = or1 + z * (or2 - or1) * invl;
+		adjradius = adjradius / (r1 + z * gd);
+		if(lcg_state && adjradius != 1.0f) {
+			if(lcg_step_float(lcg_state) > adjradius)
+				return false;
+		}
+		/* --- */
+
+		if(t > 0.0f && t < isect->t && z >= 0 && z <= l) {
+
+			if(flags & CURVE_KN_ENCLOSEFILTER) {
+				float enc_ratio = 1.01f;
+				if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) {
+					float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
+					float c2 = dot3(dif, dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
+					if(a2*c2 < 0.0f)
+						return false;
+				}
+			}
+
+#ifdef __VISIBILITY_FLAG__
+			/* visibility flag test. we do it here under the assumption
+			 * that most triangles are culled by node flags */
+			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
+#endif
+			{
+				/* record intersection */
+				isect->t = t;
+				isect->u = z*invl;
+				isect->v = gd;
+				isect->prim = curveAddr;
+				isect->object = object;
+				isect->type = type;
+
+				return true;
+			}
+		}
+	}
+
+	return false;
+
+#ifndef __KERNEL_SSE2__
+#  undef len3_squared
+#  undef len3
+#  undef dot3
+#endif
+}
+
+ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)
+{
+	/* Blend weights are the t-derivatives of the cubic weights in curvepoint(). */
+	const float fc = 0.71f;
+	const float t2 = t * t;
+	const float w0 = -3.0f * fc          * t2  + 4.0f * fc * t                  - fc;
+	const float w1 =  3.0f * (2.0f - fc) * t2  + 2.0f * (fc - 3.0f) * t;
+	const float w2 =  3.0f * (fc - 2.0f) * t2  + 2.0f * (3.0f - 2.0f * fc) * t  + fc;
+	const float w3 =  3.0f * fc          * t2  - 2.0f * fc * t;
+	return w0 * p0 + w1 * p1 + w2 * p2 + w3 * p3;
+}
+
+ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3)
+{
+	/* Cubic blend of four control points; yields p1 at t = 0 and p2 at t = 1. */
+	const float fc = 0.71f;
+	const float t2 = t * t;
+	const float t3 = t2 * t;
+	const float w0 = -fc          * t3  + 2.0f * fc          * t2 - fc * t;
+	const float w1 =  (2.0f - fc) * t3  + (fc - 3.0f)        * t2 + 1.0f;
+	const float w2 =  (fc - 2.0f) * t3  + (3.0f - 2.0f * fc) * t2 + fc * t;
+	const float w3 =  fc          * t3  - fc * t2;
+	return w0 * p0 + w1 * p1 + w2 * p2 + w3 * p3;
+}
+/* Refine a curve hit: returns P + D*t (mapped back through the object transform when isect->object is set) and writes sd->Ng, sd->N, sd->u/v (__UV__), sd->dPdu/dPdv (__DPDU__). */
+ccl_device_inline float3 curve_refine(KernelGlobals *kg,
+                                      ShaderData *sd,
+                                      const Intersection *isect,
+                                      const Ray *ray)
+{
+	int flag = kernel_data.curve.curveflags;
+	float t = isect->t;
+	float3 P = ray->P;
+	float3 D = ray->D;
+	/* With an object set, take the ray through the inverse object transform first. */
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_itfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D*t);  /* transform the whole hit vector D*t, not only the direction */
+		D = normalize_len(D, &t);  /* presumably returns unit D and stores the vector's length in t -- confirm normalize_len */
+	}
+
+	int prim = kernel_tex_fetch(__prim_index, isect->prim);
+	float4 v00 = kernel_tex_fetch(__curves, prim);
+	/* v00.x is reinterpreted as an int key offset; with v00.y it bounds the clamps below (presumably first key and key count). */
+	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+	int k1 = k0 + 1;
+
+	float3 tg;  /* tangent, assigned by whichever branch below runs */
+
+	if(flag & CURVE_KN_INTERPOLATE) {
+		int ka = max(k0 - 1,__float_as_int(v00.x));  /* key before k0, clamped at the lower bound */
+		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);  /* key after k1, clamped at the upper bound */
+
+		float4 P_curve[4];
+		/* PRIMITIVE_CURVE reads the four keys directly; otherwise they come from motion_cardinal_curve_keys() at sd->time. */
+		if(sd->type & PRIMITIVE_CURVE) {
+			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
+			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
+			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
+			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
+		}
+		else {
+			motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
+		}
+
+		float3 p[4];
+		p[0] = float4_to_float3(P_curve[0]);
+		p[1] = float4_to_float3(P_curve[1]);
+		p[2] = float4_to_float3(P_curve[2]);
+		p[3] = float4_to_float3(P_curve[3]);
+
+		P = P + D*t;  /* from here on P is the hit position */
+
+#ifdef __UV__
+		sd->u = isect->u;
+		sd->v = 0.0f;
+#endif
+
+		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
+
+		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
+			sd->Ng = normalize(-(D - tg * (dot(tg, D))));  /* -D with its component along tg removed */
+		}
+		else {
+			/* direction from inside to surface of curve */
+			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);
+			sd->Ng = normalize(P - p_curr);
+
+			/* adjustment for changing radius */
+			float gd = isect->v;  /* isect->v carries the value the intersection routine stored as gd */
+
+			if(gd != 0.0f) {
+				sd->Ng = sd->Ng - gd * tg;
+				sd->Ng = normalize(sd->Ng);
+			}
+		}
+
+		/* TODO: the resulting normal can still end up classified as
+		 * backfacing even when backface culling is enabled. */
+
+		sd->N = sd->Ng;
+	}
+	else {
+		float4 P_curve[2];
+		/* Same key lookup as above, but only the two keys of the segment are needed. */
+		if(sd->type & PRIMITIVE_CURVE) {
+			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
+			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
+		}
+		else {
+			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
+		}
+
+		float l = 1.0f;
+		tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l);  /* presumably l receives the k0->k1 distance -- confirm */
+
+		P = P + D*t;
+
+		float3 dif = P - float4_to_float3(P_curve[0]);
+
+#ifdef __UV__
+		sd->u = dot(dif,tg)/l;  /* projection of the hit onto the segment, divided by l */
+		sd->v = 0.0f;
+#endif
+
+		if(flag & CURVE_KN_TRUETANGENTGNORMAL) {
+			sd->Ng = -(D - tg * dot(tg, D));
+			sd->Ng = normalize(sd->Ng);
+		}
+		else {
+			float gd = isect->v;
+			/* NOTE(review): sd->u is read below but only assigned inside the __UV__ block above. */
+			/* direction from inside to surface of curve */
+			sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd);  /* P_curve[0].w is presumably the radius at k0 -- confirm */
+
+			/* adjustment for changing radius */
+			if(gd != 0.0f) {
+				sd->Ng = sd->Ng - gd * tg;
+				sd->Ng = normalize(sd->Ng);
+			}
+		}
+
+		sd->N = sd->Ng;
+	}
+
+#ifdef __DPDU__
+	/* dPdu follows the curve tangent, dPdv is perpendicular to tangent and normal. */
+	sd->dPdu = tg;
+	sd->dPdv = cross(tg, sd->Ng);
+#endif
+	/* Map the hit position back with the forward object transform; D and the sd vectors are not transformed here. */
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_tfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+}
+
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_motion_curve.h b/intern/cycles/kernel/geom/geom_motion_curve.h
index dc1388b6643..fad29e431ec 100644
--- a/intern/cycles/kernel/geom/geom_motion_curve.h
+++ b/intern/cycles/kernel/geom/geom_motion_curve.h
@@ -33,7 +33,7 @@ ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg, int object,
 	 * zero iterations and rendering is really slow with motion curves. For until other
 	 * areas are speed up it's probably not so crucial to optimize this out.
 	 */
-	uint attr_offset = object*kernel_data.bvh.attributes_map_stride + ATTR_PRIM_CURVE;
+	uint attr_offset = object_attribute_map_offset(kg, object) + ATTR_PRIM_CURVE;
 	uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 
 	while(attr_map.x != id) {
@@ -152,7 +152,7 @@ ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg,
 	keys[3] = (1.0f - t)*keys[3] + t*next_keys[3];
 }
 
-#ifdef __KERNEL_AVX2__
+#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
 /* Similar to above, but returns keys as pair of two AVX registers with each
  * holding two float4.
  */
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h
index 4e84aa97776..7ac6807e749 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -32,7 +32,7 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline int find_attribute_motion(KernelGlobals *kg, int object, uint id, AttributeElement *elem)
 {
 	/* todo: find a better (faster) solution for this, maybe store offset per object */
-	uint attr_offset = object*kernel_data.bvh.attributes_map_stride;
+	uint attr_offset = object_attribute_map_offset(kg, object);
 	uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 	
 	while(attr_map.x != id) {
@@ -117,4 +117,39 @@ ccl_device_inline void motion_triangle_vertices(KernelGlobals *kg, int object, i
 	verts[2] = (1.0f - t)*verts[2] + t*next_verts[2];
 }
 
+ccl_device_inline float3 motion_triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int object, int prim, float u, float v, float time)
+{
+	/* Interpolated vertex normal of a motion triangle at barycentric (u, v)
+	 * and the given time. Ng is returned when the result normalizes to zero. */
+	int numsteps, numverts;
+	object_motion_info(kg, object, &numsteps, &numverts, NULL);
+
+	/* Split time into a step index and the blend factor towards step + 1. */
+	const int step_count = numsteps*2;
+	const int step_lo = min((int)(time*step_count), step_count-1);
+	const float blend = time*step_count - step_lo;
+
+	/* Offset of the motion vertex normal attribute; it is asserted to exist. */
+	AttributeElement elem;
+	const int attr_offset = find_attribute_motion(kg, object, ATTR_STD_MOTION_VERTEX_NORMAL, &elem);
+	kernel_assert(attr_offset != ATTR_STD_NOT_FOUND);
+
+	/* Normals of the three vertices at both steps. */
+	const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
+	float3 n_lo[3], n_hi[3];
+	motion_triangle_normals_for_step(kg, tri_vindex, attr_offset, numverts, numsteps, step_lo, n_lo);
+	motion_triangle_normals_for_step(kg, tri_vindex, attr_offset, numverts, numsteps, step_lo+1, n_hi);
+
+	/* Blend each vertex normal between the two steps. */
+	const float3 n0 = (1.0f - blend)*n_lo[0] + blend*n_hi[0];
+	const float3 n1 = (1.0f - blend)*n_lo[1] + blend*n_hi[1];
+	const float3 n2 = (1.0f - blend)*n_lo[2] + blend*n_hi[2];
+
+	/* Weight the vertices by (u, v, 1 - u - v) and normalize safely. */
+	const float w = 1.0f - u - v;
+	const float3 N = safe_normalize(u*n0 + v*n1 + w*n2);
+
+	return is_zero(N)? Ng: N;
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
index d57d74ea882..542843edc84 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
@@ -48,7 +48,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg,
 			return P;
 		}
 #  ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
+		Transform tfm = sd->ob_itfm;
 #  else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -77,7 +77,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #  ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
+		Transform tfm = sd->ob_tfm;
 #  else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -97,17 +97,17 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg,
  * for instancing.
  */
 
-#ifdef __SUBSURFACE__
+#ifdef __BVH_LOCAL__
 #  if defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86))
 ccl_device_noinline
 #  else
 ccl_device_inline
 #  endif
-float3 motion_triangle_refine_subsurface(KernelGlobals *kg,
-                                         ShaderData *sd,
-                                         const Intersection *isect,
-                                         const Ray *ray,
-                                         float3 verts[3])
+float3 motion_triangle_refine_local(KernelGlobals *kg,
+                                    ShaderData *sd,
+                                    const Intersection *isect,
+                                    const Ray *ray,
+                                    float3 verts[3])
 {
 	float3 P = ray->P;
 	float3 D = ray->D;
@@ -116,7 +116,7 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg,
 #  ifdef __INTERSECTION_REFINE__
 	if(isect->object != OBJECT_NONE) {
 #    ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
+		Transform tfm = sd->ob_itfm;
 #    else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -144,7 +144,7 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #    ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
+		Transform tfm = sd->ob_tfm;
 #    else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -159,21 +159,22 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg,
 	return P + D*t;
 #  endif  /* __INTERSECTION_REFINE__ */
 }
-#endif  /* __SUBSURFACE__ */
+#endif  /* __BVH_LOCAL__ */
 
 
 /* Ray intersection. We simply compute the vertex positions at the given ray
  * time and do a ray intersection with the resulting triangle.
  */
 
-ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
-                                                 Intersection *isect,
-                                                 float3 P,
-                                                 float3 dir,
-                                                 float time,
-                                                 uint visibility,
-                                                 int object,
-                                                 int prim_addr)
+ccl_device_inline bool motion_triangle_intersect(
+        KernelGlobals *kg,
+        Intersection *isect,
+        float3 P,
+        float3 dir,
+        float time,
+        uint visibility,
+        int object,
+        int prim_addr)
 {
 	/* Primitive index for vertex location lookup. */
 	int prim = kernel_tex_fetch(__prim_index, prim_addr);
@@ -185,11 +186,15 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
 	motion_triangle_vertices(kg, fobject, prim, time, verts);
 	/* Ray-triangle intersection, unoptimized. */
 	float t, u, v;
-	if(ray_triangle_intersect_uv(P,
-	                             dir,
-	                             isect->t,
-	                             verts[2], verts[0], verts[1],
-	                             &u, &v, &t))
+	if(ray_triangle_intersect(P,
+	                          dir,
+	                          isect->t,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	                          (ssef*)verts,
+#else
+	                          verts[0], verts[1], verts[2],
+#endif
+	                          &u, &v, &t))
 	{
 #ifdef __VISIBILITY_FLAG__
 		/* Visibility flag test. we do it here under the assumption
@@ -210,71 +215,99 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
 	return false;
 }
 
-/* Special ray intersection routines for subsurface scattering. In that case we
+/* Special ray intersection routines for local intersections. In that case we
  * only want to intersect with primitives in the same object, and if case of
  * multiple hits we pick a single random primitive as the intersection point.
  */
-#ifdef __SUBSURFACE__
-ccl_device_inline void motion_triangle_intersect_subsurface(
+#ifdef __BVH_LOCAL__
+ccl_device_inline void motion_triangle_intersect_local(
         KernelGlobals *kg,
-        SubsurfaceIntersection *ss_isect,
+        LocalIntersection *local_isect,
         float3 P,
         float3 dir,
         float time,
         int object,
+        int local_object,
         int prim_addr,
         float tmax,
         uint *lcg_state,
         int max_hits)
 {
+	/* Only intersect with matching object, for instanced objects we
+	 * already know we are only intersecting the right object. */
+	if(object == OBJECT_NONE) {
+		if(kernel_tex_fetch(__prim_object, prim_addr) != local_object) {
+			return;
+		}
+	}
+
 	/* Primitive index for vertex location lookup. */
 	int prim = kernel_tex_fetch(__prim_index, prim_addr);
-	int fobject = (object == OBJECT_NONE)
-	                  ? kernel_tex_fetch(__prim_object, prim_addr)
-	                  : object;
 	/* Get vertex locations for intersection. */
 	float3 verts[3];
-	motion_triangle_vertices(kg, fobject, prim, time, verts);
+	motion_triangle_vertices(kg, local_object, prim, time, verts);
 	/* Ray-triangle intersection, unoptimized. */
 	float t, u, v;
-	if(ray_triangle_intersect_uv(P,
-	                             dir,
-	                             tmax,
-	                             verts[2], verts[0], verts[1],
-	                             &u, &v, &t))
+	if(!ray_triangle_intersect(P,
+	                           dir,
+	                           tmax,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	                           (ssef*)verts,
+#else
+	                           verts[0], verts[1], verts[2],
+#endif
+	                           &u, &v, &t))
 	{
-		for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) {
-			if(ss_isect->hits[i].t == t) {
+		return;
+	}
+
+	int hit;
+	if(lcg_state) {
+		/* Record up to max_hits intersections. */
+		for(int i = min(max_hits, local_isect->num_hits) - 1; i >= 0; --i) {
+			if(local_isect->hits[i].t == t) {
 				return;
 			}
 		}
-		ss_isect->num_hits++;
-		int hit;
-		if(ss_isect->num_hits <= max_hits) {
-			hit = ss_isect->num_hits - 1;
+
+		local_isect->num_hits++;
+
+		if(local_isect->num_hits <= max_hits) {
+			hit = local_isect->num_hits - 1;
 		}
 		else {
 			/* Reservoir sampling: if we are at the maximum number of
 			 * hits, randomly replace element or skip it.
 			 */
-			hit = lcg_step_uint(lcg_state) % ss_isect->num_hits;
+			hit = lcg_step_uint(lcg_state) % local_isect->num_hits;
 
 			if(hit >= max_hits)
 				return;
 		}
-		/* Record intersection. */
-		Intersection *isect = &ss_isect->hits[hit];
-		isect->t = t;
-		isect->u = u;
-		isect->v = v;
-		isect->prim = prim_addr;
-		isect->object = object;
-		isect->type = PRIMITIVE_MOTION_TRIANGLE;
-		/* Record geometric normal. */
-		ss_isect->Ng[hit] = normalize(cross(verts[1] - verts[0],
-		                                    verts[2] - verts[0]));
 	}
+	else {
+		/* Record closest intersection only. */
+		if(local_isect->num_hits && t > local_isect->hits[0].t) {
+			return;
+		}
+
+		hit = 0;
+		local_isect->num_hits = 1;
+	}
+
+	/* Record intersection. */
+	Intersection *isect = &local_isect->hits[hit];
+	isect->t = t;
+	isect->u = u;
+	isect->v = v;
+	isect->prim = prim_addr;
+	isect->object = object;
+	isect->type = PRIMITIVE_MOTION_TRIANGLE;
+
+	/* Record geometric normal. */
+	local_isect->Ng[hit] = normalize(cross(verts[1] - verts[0],
+	                                       verts[2] - verts[0]));
 }
-#endif  /* __SUBSURFACE__ */
+#endif  /* __BVH_LOCAL__ */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
index 0e024a05db6..4789137d5b0 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
@@ -36,29 +36,29 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg,
                                                       ShaderData *sd, const
                                                       Intersection *isect,
                                                       const Ray *ray,
-                                                      bool subsurface)
+                                                      bool is_local)
 {
 	/* Get shader. */
-	ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
+	sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
 	/* Get motion info. */
 	/* TODO(sergey): This logic is really similar to motion_triangle_vertices(),
 	 * can we de-duplicate something here?
 	 */
 	int numsteps, numverts;
-	object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL);
+	object_motion_info(kg, sd->object, &numsteps, &numverts, NULL);
 	/* Figure out which steps we need to fetch and their interpolation factor. */
 	int maxstep = numsteps*2;
-	int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1);
-	float t = ccl_fetch(sd, time)*maxstep - step;
+	int step = min((int)(sd->time*maxstep), maxstep-1);
+	float t = sd->time*maxstep - step;
 	/* Find attribute. */
 	AttributeElement elem;
-	int offset = find_attribute_motion(kg, ccl_fetch(sd, object),
+	int offset = find_attribute_motion(kg, sd->object,
 	                                   ATTR_STD_MOTION_VERTEX_POSITION,
 	                                   &elem);
 	kernel_assert(offset != ATTR_STD_NOT_FOUND);
 	/* Fetch vertex coordinates. */
 	float3 verts[3], next_verts[3];
-	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
 	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts);
 	/* Interpolate between steps. */
@@ -66,40 +66,40 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg,
 	verts[1] = (1.0f - t)*verts[1] + t*next_verts[1];
 	verts[2] = (1.0f - t)*verts[2] + t*next_verts[2];
 	/* Compute refined position. */
-#ifdef __SUBSURFACE__
-	if(subsurface) {
-		ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg,
-		                                                     sd,
-		                                                     isect,
-		                                                     ray,
-		                                                     verts);
+#ifdef __BVH_LOCAL__
+	if(is_local) {
+		sd->P = motion_triangle_refine_local(kg,
+		                                     sd,
+		                                     isect,
+		                                     ray,
+		                                     verts);
 	}
 	else
-#endif  /*  __SUBSURFACE__*/
+#endif  /*  __BVH_LOCAL__*/
 	{
-		ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts);
+		sd->P = motion_triangle_refine(kg, sd, isect, ray, verts);
 	}
 	/* Compute face normal. */
 	float3 Ng;
-	if(ccl_fetch(sd, object_flag) & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
+	if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
 		Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0]));
 	}
 	else {
 		Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
 	}
-	ccl_fetch(sd, Ng) = Ng;
-	ccl_fetch(sd, N) = Ng;
+	sd->Ng = Ng;
+	sd->N = Ng;
 	/* Compute derivatives of P w.r.t. uv. */
 #ifdef __DPDU__
-	ccl_fetch(sd, dPdu) = (verts[0] - verts[2]);
-	ccl_fetch(sd, dPdv) = (verts[1] - verts[2]);
+	sd->dPdu = (verts[0] - verts[2]);
+	sd->dPdv = (verts[1] - verts[2]);
 #endif
 	/* Compute smooth normal. */
-	if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
+	if(sd->shader & SHADER_SMOOTH_NORMAL) {
 		/* Find attribute. */
 		AttributeElement elem;
 		int offset = find_attribute_motion(kg,
-		                                   ccl_fetch(sd, object),
+		                                   sd->object,
 		                                   ATTR_STD_MOTION_VERTEX_NORMAL,
 		                                   &elem);
 		kernel_assert(offset != ATTR_STD_NOT_FOUND);
@@ -112,10 +112,10 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg,
 		normals[1] = (1.0f - t)*normals[1] + t*next_normals[1];
 		normals[2] = (1.0f - t)*normals[2] + t*next_normals[2];
 		/* Interpolate between vertices. */
-		float u = ccl_fetch(sd, u);
-		float v = ccl_fetch(sd, v);
+		float u = sd->u;
+		float v = sd->v;
 		float w = 1.0f - u - v;
-		ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]);
+		sd->N = (u*normals[0] + v*normals[1] + w*normals[2]);
 	}
 }
 
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index f51b2d18657..800649abf38 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -28,61 +28,44 @@ CCL_NAMESPACE_BEGIN
 
 enum ObjectTransform {
 	OBJECT_TRANSFORM = 0,
-	OBJECT_TRANSFORM_MOTION_PRE = 0,
-	OBJECT_INVERSE_TRANSFORM = 4,
-	OBJECT_TRANSFORM_MOTION_POST = 4,
-	OBJECT_PROPERTIES = 8,
-	OBJECT_DUPLI = 9
+	OBJECT_INVERSE_TRANSFORM = 1,
 };
 
 enum ObjectVectorTransform {
-	OBJECT_VECTOR_MOTION_PRE = 0,
-	OBJECT_VECTOR_MOTION_POST = 3
+	OBJECT_PASS_MOTION_PRE = 0,
+	OBJECT_PASS_MOTION_POST = 1
 };
 
 /* Object to world space transformation */
 
 ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg, int object, enum ObjectTransform type)
 {
-	int offset = object*OBJECT_SIZE + (int)type;
-
-	Transform tfm;
-	tfm.x = kernel_tex_fetch(__objects, offset + 0);
-	tfm.y = kernel_tex_fetch(__objects, offset + 1);
-	tfm.z = kernel_tex_fetch(__objects, offset + 2);
-	tfm.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
-
-	return tfm;
+	if(type == OBJECT_INVERSE_TRANSFORM) {
+		return kernel_tex_fetch(__objects, object).itfm;
+	}
+	else {
+		return kernel_tex_fetch(__objects, object).tfm;
+	}
 }
 
 /* Lamp to world space transformation */
 
 ccl_device_inline Transform lamp_fetch_transform(KernelGlobals *kg, int lamp, bool inverse)
 {
-	int offset = lamp*LIGHT_SIZE + (inverse? 8 : 5);
-
-	Transform tfm;
-	tfm.x = kernel_tex_fetch(__light_data, offset + 0);
-	tfm.y = kernel_tex_fetch(__light_data, offset + 1);
-	tfm.z = kernel_tex_fetch(__light_data, offset + 2);
-	tfm.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
-
-	return tfm;
+	if(inverse) {
+		return kernel_tex_fetch(__lights, lamp).itfm;
+	}
+	else {
+		return kernel_tex_fetch(__lights, lamp).tfm;
+	}
 }
 
 /* Object to world space transformation for motion vectors */
 
-ccl_device_inline Transform object_fetch_vector_transform(KernelGlobals *kg, int object, enum ObjectVectorTransform type)
+ccl_device_inline Transform object_fetch_motion_pass_transform(KernelGlobals *kg, int object, enum ObjectVectorTransform type)
 {
-	int offset = object*OBJECT_VECTOR_SIZE + (int)type;
-
-	Transform tfm;
-	tfm.x = kernel_tex_fetch(__objects_vector, offset + 0);
-	tfm.y = kernel_tex_fetch(__objects_vector, offset + 1);
-	tfm.z = kernel_tex_fetch(__objects_vector, offset + 2);
-	tfm.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
-
-	return tfm;
+	int offset = object*OBJECT_MOTION_PASS_SIZE + (int)type;
+	return kernel_tex_fetch(__object_motion_pass, offset);
 }
 
 /* Motion blurred object transformations */
@@ -90,22 +73,12 @@ ccl_device_inline Transform object_fetch_vector_transform(KernelGlobals *kg, int
 #ifdef __OBJECT_MOTION__
 ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg, int object, float time)
 {
-	DecompMotionTransform motion;
-
-	int offset = object*OBJECT_SIZE + (int)OBJECT_TRANSFORM_MOTION_PRE;
-
-	motion.mid.x = kernel_tex_fetch(__objects, offset + 0);
-	motion.mid.y = kernel_tex_fetch(__objects, offset + 1);
-	motion.mid.z = kernel_tex_fetch(__objects, offset + 2);
-	motion.mid.w = kernel_tex_fetch(__objects, offset + 3);
-
-	motion.pre_x = kernel_tex_fetch(__objects, offset + 4);
-	motion.pre_y = kernel_tex_fetch(__objects, offset + 5);
-	motion.post_x = kernel_tex_fetch(__objects, offset + 6);
-	motion.post_y = kernel_tex_fetch(__objects, offset + 7);
+	const uint motion_offset = kernel_tex_fetch(__objects, object).motion_offset;
+	const ccl_global DecomposedTransform *motion = &kernel_tex_fetch(__object_motion, motion_offset);
+	const uint num_steps = kernel_tex_fetch(__objects, object).numsteps * 2 + 1;
 
 	Transform tfm;
-	transform_motion_interpolate(&tfm, &motion, time);
+	transform_motion_array_interpolate(&tfm, motion, num_steps, time);
 
 	return tfm;
 }
@@ -137,9 +110,9 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg
 ccl_device_inline void object_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
-	*P = transform_point_auto(&ccl_fetch(sd, ob_tfm), *P);
+	*P = transform_point_auto(&sd->ob_tfm, *P);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
 	*P = transform_point(&tfm, *P);
 #endif
 }
@@ -149,9 +122,9 @@ ccl_device_inline void object_position_transform(KernelGlobals *kg, const Shader
 ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
-	*P = transform_point_auto(&ccl_fetch(sd, ob_itfm), *P);
+	*P = transform_point_auto(&sd->ob_itfm, *P);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
 	*P = transform_point(&tfm, *P);
 #endif
 }
@@ -161,12 +134,16 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, cons
 ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
-	if((ccl_fetch(sd, object) != OBJECT_NONE) || (ccl_fetch(sd, type) == PRIMITIVE_LAMP)) {
-		*N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_tfm), *N));
+	if((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) {
+		*N = normalize(transform_direction_transposed_auto(&sd->ob_tfm, *N));
 	}
 #else
-	if(ccl_fetch(sd, object) != OBJECT_NONE) {
-		Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+	if(sd->object != OBJECT_NONE) {
+		Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+		*N = normalize(transform_direction_transposed(&tfm, *N));
+	}
+	else if(sd->type == PRIMITIVE_LAMP) {
+		Transform tfm = lamp_fetch_transform(kg, sd->lamp, false);
 		*N = normalize(transform_direction_transposed(&tfm, *N));
 	}
 #endif
@@ -177,9 +154,9 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const
 ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
-	*N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_itfm), *N));
+	*N = normalize(transform_direction_transposed_auto(&sd->ob_itfm, *N));
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
 	*N = normalize(transform_direction_transposed(&tfm, *N));
 #endif
 }
@@ -189,9 +166,9 @@ ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderDa
 ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
-	*D = transform_direction_auto(&ccl_fetch(sd, ob_tfm), *D);
+	*D = transform_direction_auto(&sd->ob_tfm, *D);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
 	*D = transform_direction(&tfm, *D);
 #endif
 }
@@ -201,9 +178,9 @@ ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData
 ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
-	*D = transform_direction_auto(&ccl_fetch(sd, ob_itfm), *D);
+	*D = transform_direction_auto(&sd->ob_itfm, *D);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
 	*D = transform_direction(&tfm, *D);
 #endif
 }
@@ -212,13 +189,13 @@ ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const Sha
 
 ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd)
 {
-	if(ccl_fetch(sd, object) == OBJECT_NONE)
+	if(sd->object == OBJECT_NONE)
 		return make_float3(0.0f, 0.0f, 0.0f);
 
 #ifdef __OBJECT_MOTION__
-	return make_float3(ccl_fetch(sd, ob_tfm).x.w, ccl_fetch(sd, ob_tfm).y.w, ccl_fetch(sd, ob_tfm).z.w);
+	return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
 	return make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
 #endif
 }
@@ -227,9 +204,7 @@ ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd
 
 ccl_device_inline float object_surface_area(KernelGlobals *kg, int object)
 {
-	int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES;
-	float4 f = kernel_tex_fetch(__objects, offset);
-	return f.x;
+	return kernel_tex_fetch(__objects, object).surface_area;
 }
 
 /* Pass ID number of object */
@@ -239,9 +214,17 @@ ccl_device_inline float object_pass_id(KernelGlobals *kg, int object)
 	if(object == OBJECT_NONE)
 		return 0.0f;
 
-	int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES;
-	float4 f = kernel_tex_fetch(__objects, offset);
-	return f.y;
+	return kernel_tex_fetch(__objects, object).pass_id;
+}
+
+/* Per lamp random number for shader variation (0.0f for LAMP_NONE) */
+
+ccl_device_inline float lamp_random_number(KernelGlobals *kg, int lamp)
+{
+	if(lamp == LAMP_NONE)
+		return 0.0f;
+
+	return kernel_tex_fetch(__lights, lamp).random;
 }
 
 /* Per object random number for shader variation */
@@ -251,9 +234,7 @@ ccl_device_inline float object_random_number(KernelGlobals *kg, int object)
 	if(object == OBJECT_NONE)
 		return 0.0f;
 
-	int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES;
-	float4 f = kernel_tex_fetch(__objects, offset);
-	return f.z;
+	return kernel_tex_fetch(__objects, object).random_number;
 }
 
 /* Particle ID from which this object was generated */
@@ -263,9 +244,7 @@ ccl_device_inline int object_particle_id(KernelGlobals *kg, int object)
 	if(object == OBJECT_NONE)
 		return 0;
 
-	int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES;
-	float4 f = kernel_tex_fetch(__objects, offset);
-	return __float_as_uint(f.w);
+	return kernel_tex_fetch(__objects, object).particle_index;
 }
 
 /* Generated texture coordinate on surface from where object was instanced */
@@ -275,9 +254,10 @@ ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object)
 	if(object == OBJECT_NONE)
 		return make_float3(0.0f, 0.0f, 0.0f);
 
-	int offset = object*OBJECT_SIZE + OBJECT_DUPLI;
-	float4 f = kernel_tex_fetch(__objects, offset);
-	return make_float3(f.x, f.y, f.z);
+	const ccl_global KernelObject *kobject = &kernel_tex_fetch(__objects, object);
+	return make_float3(kobject->dupli_generated[0],
+	                   kobject->dupli_generated[1],
+	                   kobject->dupli_generated[2]);
 }
 
 /* UV texture coordinate on surface from where object was instanced */
@@ -287,27 +267,24 @@ ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object)
 	if(object == OBJECT_NONE)
 		return make_float3(0.0f, 0.0f, 0.0f);
 
-	int offset = object*OBJECT_SIZE + OBJECT_DUPLI;
-	float4 f = kernel_tex_fetch(__objects, offset + 1);
-	return make_float3(f.x, f.y, 0.0f);
+	const ccl_global KernelObject *kobject = &kernel_tex_fetch(__objects, object);
+	return make_float3(kobject->dupli_uv[0],
+	                   kobject->dupli_uv[1],
+	                   0.0f);
 }
 
 /* Information about mesh for motion blurred triangles and curves */
 
 ccl_device_inline void object_motion_info(KernelGlobals *kg, int object, int *numsteps, int *numverts, int *numkeys)
 {
-	int offset = object*OBJECT_SIZE + OBJECT_DUPLI;
-
 	if(numkeys) {
-		float4 f = kernel_tex_fetch(__objects, offset);
-		*numkeys = __float_as_int(f.w);
+		*numkeys = kernel_tex_fetch(__objects, object).numkeys;
 	}
 
-	float4 f = kernel_tex_fetch(__objects, offset + 1);
 	if(numsteps)
-		*numsteps = __float_as_int(f.z);
+		*numsteps = kernel_tex_fetch(__objects, object).numsteps;
 	if(numverts)
-		*numverts = __float_as_int(f.w);
+		*numverts = kernel_tex_fetch(__objects, object).numverts;
 }
 
 /* Offset to an objects patch map */
@@ -317,76 +294,56 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object)
 	if(object == OBJECT_NONE)
 		return 0;
 
-	int offset = object*OBJECT_SIZE + 11;
-	float4 f = kernel_tex_fetch(__objects, offset);
-	return __float_as_uint(f.x);
+	return kernel_tex_fetch(__objects, object).patch_map_offset;
 }
 
 /* Pass ID for shader */
 
 ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
 {
-	return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE + 1);
+	return kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).pass_id;
 }
 
 /* Particle data from which object was instanced */
 
-ccl_device_inline float particle_index(KernelGlobals *kg, int particle)
+ccl_device_inline uint particle_index(KernelGlobals *kg, int particle)
 {
-	int offset = particle*PARTICLE_SIZE;
-	float4 f = kernel_tex_fetch(__particles, offset + 0);
-	return f.x;
+	return kernel_tex_fetch(__particles, particle).index;
 }
 
 ccl_device float particle_age(KernelGlobals *kg, int particle)
 {
-	int offset = particle*PARTICLE_SIZE;
-	float4 f = kernel_tex_fetch(__particles, offset + 0);
-	return f.y;
+	return kernel_tex_fetch(__particles, particle).age;
 }
 
 ccl_device float particle_lifetime(KernelGlobals *kg, int particle)
 {
-	int offset = particle*PARTICLE_SIZE;
-	float4 f = kernel_tex_fetch(__particles, offset + 0);
-	return f.z;
+	return kernel_tex_fetch(__particles, particle).lifetime;
 }
 
 ccl_device float particle_size(KernelGlobals *kg, int particle)
 {
-	int offset = particle*PARTICLE_SIZE;
-	float4 f = kernel_tex_fetch(__particles, offset + 0);
-	return f.w;
+	return kernel_tex_fetch(__particles, particle).size;
 }
 
 ccl_device float4 particle_rotation(KernelGlobals *kg, int particle)
 {
-	int offset = particle*PARTICLE_SIZE;
-	float4 f = kernel_tex_fetch(__particles, offset + 1);
-	return f;
+	return kernel_tex_fetch(__particles, particle).rotation;
 }
 
 ccl_device float3 particle_location(KernelGlobals *kg, int particle)
 {
-	int offset = particle*PARTICLE_SIZE;
-	float4 f = kernel_tex_fetch(__particles, offset + 2);
-	return make_float3(f.x, f.y, f.z);
+	return float4_to_float3(kernel_tex_fetch(__particles, particle).location);
 }
 
 ccl_device float3 particle_velocity(KernelGlobals *kg, int particle)
 {
-	int offset = particle*PARTICLE_SIZE;
-	float4 f2 = kernel_tex_fetch(__particles, offset + 2);
-	float4 f3 = kernel_tex_fetch(__particles, offset + 3);
-	return make_float3(f2.w, f3.x, f3.y);
+	return float4_to_float3(kernel_tex_fetch(__particles, particle).velocity);
 }
 
 ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle)
 {
-	int offset = particle*PARTICLE_SIZE;
-	float4 f3 = kernel_tex_fetch(__particles, offset + 3);
-	float4 f4 = kernel_tex_fetch(__particles, offset + 4);
-	return make_float3(f3.z, f3.w, f4.x);
+	return float4_to_float3(kernel_tex_fetch(__particles, particle).angular_velocity);
 }
 
 /* Object intersection in BVH */
@@ -415,17 +372,18 @@ ccl_device_inline float3 bvh_clamp_direction(float3 dir)
 
 ccl_device_inline float3 bvh_inverse_direction(float3 dir)
 {
-	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
 	return rcp(dir);
-#else
-	return 1.0f / dir;
-#endif
 }
 
 /* Transform ray into object space to enter static object in BVH */
 
-ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
+ccl_device_inline float bvh_instance_push(KernelGlobals *kg,
+                                          int object,
+                                          const Ray *ray,
+                                          float3 *P,
+                                          float3 *dir,
+                                          float3 *idir,
+                                          float t)
 {
 	Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
 
@@ -435,8 +393,11 @@ ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ra
 	*dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len));
 	*idir = bvh_inverse_direction(*dir);
 
-	if(*t != FLT_MAX)
-		*t *= len;
+	if(t != FLT_MAX) {
+		t *= len;
+	}
+	/* Hand the distance back to the caller; it was rescaled by len unless it was FLT_MAX. */
+	return t;
 }
 
 #ifdef __QBVH__
@@ -473,16 +434,24 @@ ccl_device_inline void qbvh_instance_push(KernelGlobals *kg,
 
 /* Transorm ray to exit static object in BVH */
 
-ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
+ccl_device_inline float bvh_instance_pop(KernelGlobals *kg,
+                                         int object,
+                                         const Ray *ray,
+                                         float3 *P,
+                                         float3 *dir,
+                                         float3 *idir,
+                                         float t)
 {
-	if(*t != FLT_MAX) {
+	if(t != FLT_MAX) {
 		Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-		*t /= len(transform_direction(&tfm, ray->D));
+		t /= len(transform_direction(&tfm, ray->D));
 	}
 
 	*P = ray->P;
 	*dir = bvh_clamp_direction(ray->D);
 	*idir = bvh_inverse_direction(*dir);
+	/* Hand the distance back to the caller; it was rescaled above unless it was FLT_MAX. */
+	return t;
 }
 
 /* Same as above, but returns scale factor to apply to multiple intersection distances */
@@ -501,13 +470,13 @@ ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg, int object, co
 #ifdef __OBJECT_MOTION__
 /* Transform ray into object space to enter motion blurred object in BVH */
 
-ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg,
+ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg,
                                                 int object,
                                                 const Ray *ray,
                                                 float3 *P,
                                                 float3 *dir,
                                                 float3 *idir,
-                                                ccl_addr_space float *t,
+                                                float t,
                                                 Transform *itfm)
 {
 	object_fetch_transform_motion_test(kg, object, ray->time, itfm);
@@ -518,8 +487,11 @@ ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg,
 	*dir = bvh_clamp_direction(normalize_len(transform_direction(itfm, ray->D), &len));
 	*idir = bvh_inverse_direction(*dir);
 
-	if(*t != FLT_MAX)
-		*t *= len;
+	if(t != FLT_MAX) {
+		t *= len;
+	}
+	/* Hand the distance back to the caller; it was rescaled by len unless it was FLT_MAX. */
+	return t;
 }
 
 #ifdef __QBVH__
@@ -557,22 +529,24 @@ ccl_device_inline void qbvh_instance_motion_push(KernelGlobals *kg,
 
 /* Transorm ray to exit motion blurred object in BVH */
 
-ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg,
-                                               int object,
-                                               const Ray *ray,
-                                               float3 *P,
-                                               float3 *dir,
-                                               float3 *idir,
-                                               ccl_addr_space float *t,
-                                               Transform *itfm)
-{
-	if(*t != FLT_MAX) {
-		*t /= len(transform_direction(itfm, ray->D));
+ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg,
+                                                int object,
+                                                const Ray *ray,
+                                                float3 *P,
+                                                float3 *dir,
+                                                float3 *idir,
+                                                float t,
+                                                Transform *itfm)
+{
+	if(t != FLT_MAX) {
+		t /= len(transform_direction(itfm, ray->D));
 	}
 
 	*P = ray->P;
 	*dir = bvh_clamp_direction(ray->D);
 	*idir = bvh_inverse_direction(*dir);
+	/* Hand the distance back to the caller; it was rescaled above unless it was FLT_MAX. */
+	return t;
 }
 
 /* Same as above, but returns scale factor to apply to multiple intersection distances */
diff --git a/intern/cycles/kernel/geom/geom_patch.h b/intern/cycles/kernel/geom/geom_patch.h
index 6a0ff5a4a04..5663b598508 100644
--- a/intern/cycles/kernel/geom/geom_patch.h
+++ b/intern/cycles/kernel/geom/geom_patch.h
@@ -267,7 +267,7 @@ ccl_device float patch_eval_float(KernelGlobals *kg, const ShaderData *sd, int o
 	float weights_du[PATCH_MAX_CONTROL_VERTS];
 	float weights_dv[PATCH_MAX_CONTROL_VERTS];
 
-	int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel,
+	int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel,
 	                                           indices, weights, weights_du, weights_dv);
 
 	float val = 0.0f;
@@ -294,7 +294,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals *kg, const ShaderData *sd, int
 	float weights_du[PATCH_MAX_CONTROL_VERTS];
 	float weights_dv[PATCH_MAX_CONTROL_VERTS];
 
-	int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel,
+	int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel,
 	                                           indices, weights, weights_du, weights_dv);
 
 	float3 val = make_float3(0.0f, 0.0f, 0.0f);
@@ -321,7 +321,7 @@ ccl_device float3 patch_eval_uchar4(KernelGlobals *kg, const ShaderData *sd, int
 	float weights_du[PATCH_MAX_CONTROL_VERTS];
 	float weights_dv[PATCH_MAX_CONTROL_VERTS];
 
-	int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel,
+	int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel,
 	                                           indices, weights, weights_du, weights_dv);
 
 	float3 val = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index 8a73bb2f78b..c159be92885 100644
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -28,19 +28,19 @@ ccl_device_inline float primitive_attribute_float(KernelGlobals *kg,
                                                   const AttributeDescriptor desc,
                                                   float *dx, float *dy)
 {
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
+	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
 		if(subd_triangle_patch(kg, sd) == ~0)
 			return triangle_attribute_float(kg, sd, desc, dx, dy);
 		else
 			return subd_triangle_attribute_float(kg, sd, desc, dx, dy);
 	}
 #ifdef __HAIR__
-	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	else if(sd->type & PRIMITIVE_ALL_CURVE) {
 		return curve_attribute_float(kg, sd, desc, dx, dy);
 	}
 #endif
 #ifdef __VOLUME__
-	else if(ccl_fetch(sd, object) != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) {
+	else if(sd->object != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) {
 		return volume_attribute_float(kg, sd, desc, dx, dy);
 	}
 #endif
@@ -56,19 +56,19 @@ ccl_device_inline float3 primitive_attribute_float3(KernelGlobals *kg,
                                                     const AttributeDescriptor desc,
                                                     float3 *dx, float3 *dy)
 {
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
+	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
 		if(subd_triangle_patch(kg, sd) == ~0)
 			return triangle_attribute_float3(kg, sd, desc, dx, dy);
 		else
 			return subd_triangle_attribute_float3(kg, sd, desc, dx, dy);
 	}
 #ifdef __HAIR__
-	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	else if(sd->type & PRIMITIVE_ALL_CURVE) {
 		return curve_attribute_float3(kg, sd, desc, dx, dy);
 	}
 #endif
 #ifdef __VOLUME__
-	else if(ccl_fetch(sd, object) != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) {
+	else if(sd->object != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) {
 		return volume_attribute_float3(kg, sd, desc, dx, dy);
 	}
 #endif
@@ -118,9 +118,9 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in
 ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __HAIR__
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)
+	if(sd->type & PRIMITIVE_ALL_CURVE)
 #  ifdef __DPDU__
-		return normalize(ccl_fetch(sd, dPdu));
+		return normalize(sd->dPdu);
 #  else
 		return make_float3(0.0f, 0.0f, 0.0f);
 #  endif
@@ -133,12 +133,12 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 		float3 data = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 		data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f);
 		object_normal_transform(kg, sd, &data);
-		return cross(ccl_fetch(sd, N), normalize(cross(data, ccl_fetch(sd, N))));
+		return cross(sd->N, normalize(cross(data, sd->N)));
 	}
 	else {
 		/* otherwise use surface derivatives */
 #ifdef __DPDU__
-		return normalize(ccl_fetch(sd, dPdu));
+		return normalize(sd->dPdu);
 #else
 		return make_float3(0.0f, 0.0f, 0.0f);
 #endif
@@ -153,17 +153,17 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *
 	float3 center;
 
 #ifdef __HAIR__
-	bool is_curve_primitive = ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE;
+	bool is_curve_primitive = sd->type & PRIMITIVE_ALL_CURVE;
 	if(is_curve_primitive) {
 		center = curve_motion_center_location(kg, sd);
 
-		if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) {
+		if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 			object_position_transform(kg, sd, &center);
 		}
 	}
 	else
 #endif
-		center = ccl_fetch(sd, P);
+		center = sd->P;
 
 	float3 motion_pre = center, motion_post = center;
 
@@ -173,16 +173,16 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *
 	if(desc.offset != ATTR_STD_NOT_FOUND) {
 		/* get motion info */
 		int numverts, numkeys;
-		object_motion_info(kg, ccl_fetch(sd, object), NULL, &numverts, &numkeys);
+		object_motion_info(kg, sd->object, NULL, &numverts, &numkeys);
 
 		/* lookup attributes */
 		motion_pre = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 
-		desc.offset += (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys;
+		desc.offset += (sd->type & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys;
 		motion_post = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 
 #ifdef __HAIR__
-		if(is_curve_primitive && (ccl_fetch(sd, object_flag) & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
+		if(is_curve_primitive && (sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
 			object_position_transform(kg, sd, &motion_pre);
 			object_position_transform(kg, sd, &motion_post);
 		}
@@ -193,10 +193,10 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *
 	 * transformation was set match the world/object space of motion_pre/post */
 	Transform tfm;
 	
-	tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_PRE);
+	tfm = object_fetch_motion_pass_transform(kg, sd->object, OBJECT_PASS_MOTION_PRE);
 	motion_pre = transform_point(&tfm, motion_pre);
 
-	tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_POST);
+	tfm = object_fetch_motion_pass_transform(kg, sd->object, OBJECT_PASS_MOTION_POST);
 	motion_post = transform_point(&tfm, motion_post);
 
 	float3 motion_center;
@@ -204,31 +204,31 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *
 	/* camera motion, for perspective/orthographic motion.pre/post will be a
 	 * world-to-raster matrix, for panorama it's world-to-camera */
 	if(kernel_data.cam.type != CAMERA_PANORAMA) {
-		tfm = kernel_data.cam.worldtoraster;
-		motion_center = transform_perspective(&tfm, center);
+		ProjectionTransform projection = kernel_data.cam.worldtoraster;
+		motion_center = transform_perspective(&projection, center);
 
-		tfm = kernel_data.cam.motion.pre;
-		motion_pre = transform_perspective(&tfm, motion_pre);
+		projection = kernel_data.cam.perspective_pre;
+		motion_pre = transform_perspective(&projection, motion_pre);
 
-		tfm = kernel_data.cam.motion.post;
-		motion_post = transform_perspective(&tfm, motion_post);
+		projection = kernel_data.cam.perspective_post;
+		motion_post = transform_perspective(&projection, motion_post);
 	}
 	else {
 		tfm = kernel_data.cam.worldtocamera;
 		motion_center = normalize(transform_point(&tfm, center));
-		motion_center = float2_to_float3(direction_to_panorama(kg, motion_center));
+		motion_center = float2_to_float3(direction_to_panorama(&kernel_data.cam, motion_center));
 		motion_center.x *= kernel_data.cam.width;
 		motion_center.y *= kernel_data.cam.height;
 
-		tfm = kernel_data.cam.motion.pre;
+		tfm = kernel_data.cam.motion_pass_pre;
 		motion_pre = normalize(transform_point(&tfm, motion_pre));
-		motion_pre = float2_to_float3(direction_to_panorama(kg, motion_pre));
+		motion_pre = float2_to_float3(direction_to_panorama(&kernel_data.cam, motion_pre));
 		motion_pre.x *= kernel_data.cam.width;
 		motion_pre.y *= kernel_data.cam.height;
 
-		tfm = kernel_data.cam.motion.post;
+		tfm = kernel_data.cam.motion_pass_post;
 		motion_post = normalize(transform_point(&tfm, motion_post));
-		motion_post = float2_to_float3(direction_to_panorama(kg, motion_post));
+		motion_post = float2_to_float3(direction_to_panorama(&kernel_data.cam, motion_post));
 		motion_post.x *= kernel_data.cam.width;
 		motion_post.y *= kernel_data.cam.height;
 	}
diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h
index 647840dc696..044e82f03d4 100644
--- a/intern/cycles/kernel/geom/geom_subd_triangle.h
+++ b/intern/cycles/kernel/geom/geom_subd_triangle.h
@@ -22,14 +22,14 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd)
 {
-	return (ccl_fetch(sd, prim) != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, ccl_fetch(sd, prim)) : ~0;
+	return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
 }
 
 /* UV coords of triangle within patch */
 
 ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg, const ShaderData *sd, float2 uv[3])
 {
-	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 
 	uv[0] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.x);
 	uv[1] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.y);
@@ -110,7 +110,7 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 		float2 dpdv = uv[1] - uv[2];
 
 		/* p is [s, t] */
-		float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2];
+		float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
 
 		float a, dads, dadt;
 		a = patch_eval_float(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt);
@@ -123,8 +123,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 			float dtdv = dpdv.y;
 
 			if(dx) {
-				float dudx = ccl_fetch(sd, du).dx;
-				float dvdx = ccl_fetch(sd, dv).dx;
+				float dudx = sd->du.dx;
+				float dvdx = sd->dv.dx;
 
 				float dsdx = dsdu*dudx + dsdv*dvdx;
 				float dtdx = dtdu*dudx + dtdv*dvdx;
@@ -132,8 +132,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 				*dx = dads*dsdx + dadt*dtdx;
 			}
 			if(dy) {
-				float dudy = ccl_fetch(sd, du).dy;
-				float dvdy = ccl_fetch(sd, dv).dy;
+				float dudy = sd->du.dy;
+				float dvdy = sd->dv.dy;
 
 				float dsdy = dsdu*dudy + dsdv*dvdy;
 				float dtdy = dtdu*dudy + dtdv*dvdy;
@@ -174,11 +174,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 		float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
-		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+		if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+		if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
 #endif
 
-		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+		return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
 	}
 	else if(desc.element == ATTR_ELEMENT_CORNER) {
 		float2 uv[3];
@@ -202,11 +202,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 		float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
-		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+		if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+		if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
 #endif
 
-		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+		return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
 	}
 	else {
 		if(dx) *dx = 0.0f;
@@ -229,7 +229,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 		float2 dpdv = uv[1] - uv[2];
 
 		/* p is [s, t] */
-		float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2];
+		float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
 
 		float3 a, dads, dadt;
 
@@ -248,8 +248,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 			float dtdv = dpdv.y;
 
 			if(dx) {
-				float dudx = ccl_fetch(sd, du).dx;
-				float dvdx = ccl_fetch(sd, dv).dx;
+				float dudx = sd->du.dx;
+				float dvdx = sd->dv.dx;
 
 				float dsdx = dsdu*dudx + dsdv*dvdx;
 				float dtdx = dtdu*dudx + dtdv*dvdx;
@@ -257,8 +257,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 				*dx = dads*dsdx + dadt*dtdx;
 			}
 			if(dy) {
-				float dudy = ccl_fetch(sd, du).dy;
-				float dvdy = ccl_fetch(sd, dv).dy;
+				float dudy = sd->du.dy;
+				float dvdy = sd->dv.dy;
 
 				float dsdy = dsdu*dudy + dsdv*dvdy;
 				float dtdy = dtdu*dudy + dtdv*dvdy;
@@ -299,11 +299,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 		float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
-		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+		if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+		if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
 #endif
 
-		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+		return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
 	}
 	else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) {
 		float2 uv[3];
@@ -337,11 +337,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 		float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
-		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+		if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+		if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
 #endif
 
-		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+		return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
 	}
 	else {
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index 3229091bbb0..105aee8da15 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -26,13 +26,13 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
 {
 	/* load triangle vertices */
-	const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+	const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 	const float3 v0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0));
 	const float3 v1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1));
 	const float3 v2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2));
 
 	/* return normal */
-	if(ccl_fetch(sd, object_flag) & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
+	if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
 		return normalize(cross(v2 - v0, v1 - v0));
 	}
 	else {
@@ -76,7 +76,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
 
 /* Interpolate smooth vertex normal from vertices */
 
-ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v)
+ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
 {
 	/* load triangle vertices */
 	const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -84,7 +84,9 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo
 	float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
 	float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
 
-	return normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
+	float3 N = safe_normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
+	/* If the interpolated normal comes out zero, use the caller-supplied Ng instead. */
+	return is_zero(N)? Ng: N;
 }
 
 /* Ray differentials on triangle */
@@ -110,34 +112,34 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s
 		if(dx) *dx = 0.0f;
 		if(dy) *dy = 0.0f;
 
-		return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim));
+		return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim);
 	}
 	else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
-		uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+		uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 
 		float f0 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.x);
 		float f1 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.y);
 		float f2 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.z);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
-		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
 #endif
 
-		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
 	}
 	else if(desc.element == ATTR_ELEMENT_CORNER) {
-		int tri = desc.offset + ccl_fetch(sd, prim)*3;
+		int tri = desc.offset + sd->prim*3;
 		float f0 = kernel_tex_fetch(__attributes_float, tri + 0);
 		float f1 = kernel_tex_fetch(__attributes_float, tri + 1);
 		float f2 = kernel_tex_fetch(__attributes_float, tri + 2);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
-		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
 #endif
 
-		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
 	}
 	else {
 		if(dx) *dx = 0.0f;
@@ -153,24 +155,24 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim)));
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim));
 	}
 	else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
-		uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+		uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 
 		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x));
 		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y));
 		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z));
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
-		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
 #endif
 
-		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
 	}
 	else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) {
-		int tri = desc.offset + ccl_fetch(sd, prim)*3;
+		int tri = desc.offset + sd->prim*3;
 		float3 f0, f1, f2;
 
 		if(desc.element == ATTR_ELEMENT_CORNER) {
@@ -185,11 +187,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 		}
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
-		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
 #endif
 
-		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
 	}
 	else {
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index 4db121d94f4..a3b23115ae4 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -22,414 +22,153 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Workaround stupidness of CUDA/OpenCL which doesn't allow to access indexed
- * component of float3 value.
- */
-#ifndef __KERNEL_CPU__
-#  define IDX(vec, idx) \
-    ((idx == 0) ? ((vec).x) : ( (idx == 1) ? ((vec).y) : ((vec).z) ))
-#else
-#  define IDX(vec, idx) ((vec)[idx])
-#endif
-
-/* Ray-Triangle intersection for BVH traversal
- *
- * Sven Woop
- * Watertight Ray/Triangle Intersection
- *
- * http://jcgt.org/published/0002/01/05/paper.pdf
- */
-
-/* Precalculated data for the ray->tri intersection. */
-typedef struct IsectPrecalc {
-	/* Maximal dimension kz, and orthogonal dimensions. */
-	int kx, ky, kz;
-
-	/* Shear constants. */
-	float Sx, Sy, Sz;
-} IsectPrecalc;
-
-#if (defined(__KERNEL_OPENCL_APPLE__)) || \
-    (defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86)))
-ccl_device_noinline
-#else
-ccl_device_inline
-#endif
-void triangle_intersect_precalc(float3 dir,
-                                IsectPrecalc *isect_precalc)
-{
-	/* Calculate dimension where the ray direction is maximal. */
-#ifndef __KERNEL_SSE__
-	int kz = util_max_axis(make_float3(fabsf(dir.x),
-	                                   fabsf(dir.y),
-	                                   fabsf(dir.z)));
-	int kx = kz + 1; if(kx == 3) kx = 0;
-	int ky = kx + 1; if(ky == 3) ky = 0;
-#else
-	int kx, ky, kz;
-	/* Avoiding mispredicted branch on direction. */
-	kz = util_max_axis(fabs(dir));
-	static const char inc_xaxis[] = {1, 2, 0, 55};
-	static const char inc_yaxis[] = {2, 0, 1, 55};
-	kx = inc_xaxis[kz];
-	ky = inc_yaxis[kz];
-#endif
-
-	float dir_kz = IDX(dir, kz);
-
-	/* Swap kx and ky dimensions to preserve winding direction of triangles. */
-	if(dir_kz < 0.0f) {
-		int tmp = kx;
-		kx = ky;
-		ky = tmp;
-	}
-
-	/* Calculate the shear constants. */
-	float inv_dir_z = 1.0f / dir_kz;
-	isect_precalc->Sx = IDX(dir, kx) * inv_dir_z;
-	isect_precalc->Sy = IDX(dir, ky) * inv_dir_z;
-	isect_precalc->Sz = inv_dir_z;
-
-	/* Store the dimensions. */
-	isect_precalc->kx = kx;
-	isect_precalc->ky = ky;
-	isect_precalc->kz = kz;
-}
-
-/* TODO(sergey): Make it general utility function. */
-ccl_device_inline float xor_signmask(float x, int y)
-{
-	return __int_as_float(__float_as_int(x) ^ y);
-}
-
 ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
-                                          const IsectPrecalc *isect_precalc,
                                           Intersection *isect,
                                           float3 P,
+                                          float3 dir,
                                           uint visibility,
                                           int object,
                                           int prim_addr)
 {
-	const int kx = isect_precalc->kx;
-	const int ky = isect_precalc->ky;
-	const int kz = isect_precalc->kz;
-	const float Sx = isect_precalc->Sx;
-	const float Sy = isect_precalc->Sy;
-	const float Sz = isect_precalc->Sz;
-
-	/* Calculate vertices relative to ray origin. */
 	const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr);
-
-#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
-	const avxf avxf_P(P.m128, P.m128);
-
-	const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0);
-	const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1);
-
-	const avxf AB = tri_ab - avxf_P;
-	const avxf BC = tri_bc - avxf_P;
-
-	const __m256i permute_mask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx);
-
-	const avxf AB_k = shuffle(AB, permute_mask);
-	const avxf BC_k = shuffle(BC, permute_mask);
-
-	/* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */
-	const avxf ABBC_kz = shuffle<2>(AB_k, BC_k);
-
-	/* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */
-	const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k);
-
-	const avxf Sxy(Sy, Sx, Sy, Sx);
-
-	/* Ax, Ay, Bx, By, Bx, By, Cx, Cy */
-	const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy);
-
-	float ABBC_kz_array[8];
-	_mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz);
-
-	const float A_kz = ABBC_kz_array[0];
-	const float B_kz = ABBC_kz_array[2];
-	const float C_kz = ABBC_kz_array[6];
-
-	/* By, Bx, Cy, Cx, By, Bx, Ay, Ax */
-	const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy);
-
-	const avxf neg_mask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000);
-
-	/* W           U                             V
-	 * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX
-	 */
-	const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, neg_mask /* Dont care */);
-
-	const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ neg_mask;
-
-	/* Calculate scaled barycentric coordinates. */
-	float WUVW_array[4];
-	_mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW));
-
-	const float W = WUVW_array[0];
-	const float U = WUVW_array[1];
-	const float V = WUVW_array[2];
-
-	const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW);
-	const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW,
-	                                               _mm256_setzero_ps(), 0));
-
-	if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) {
-		return false;
-	}
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	const ssef *ssef_verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex];
 #else
 	const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0),
 	             tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1),
 	             tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2);
-	const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z);
-	const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z);
-	const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z);
-
-	const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz);
-	const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz);
-	const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz);
-
-	/* Perform shear and scale of vertices. */
-	const float Ax = A_kx - Sx * A_kz;
-	const float Ay = A_ky - Sy * A_kz;
-	const float Bx = B_kx - Sx * B_kz;
-	const float By = B_ky - Sy * B_kz;
-	const float Cx = C_kx - Sx * C_kz;
-	const float Cy = C_ky - Sy * C_kz;
-
-	/* Calculate scaled barycentric coordinates. */
-	float U = Cx * By - Cy * Bx;
-	float V = Ax * Cy - Ay * Cx;
-	float W = Bx * Ay - By * Ax;
-	if((U < 0.0f || V < 0.0f || W < 0.0f) &&
-	   (U > 0.0f || V > 0.0f || W > 0.0f))
-	{
-		return false;
-	}
 #endif
-
-	/* Calculate determinant. */
-	float det = U + V + W;
-	if(UNLIKELY(det == 0.0f)) {
-		return false;
-	}
-
-	/* Calculate scaled z-coordinates of vertices and use them to calculate
-	 * the hit distance.
-	 */
-	const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz;
-	const int sign_det = (__float_as_int(det) & 0x80000000);
-	const float sign_T = xor_signmask(T, sign_det);
-	if((sign_T < 0.0f) ||
-	   (sign_T > isect->t * xor_signmask(det, sign_det)))
+	float t, u, v;
+	if(ray_triangle_intersect(P,
+	                          dir,
+	                          isect->t,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	                          ssef_verts,
+#else
+	                          float4_to_float3(tri_a),
+	                          float4_to_float3(tri_b),
+	                          float4_to_float3(tri_c),
+#endif
+	                          &u, &v, &t))
 	{
-		return false;
-	}
-
 #ifdef __VISIBILITY_FLAG__
-	/* visibility flag test. we do it here under the assumption
-	 * that most triangles are culled by node flags */
-	if(kernel_tex_fetch(__prim_visibility, prim_addr) & visibility)
+		/* Visibility flag test. We do it here under the assumption
+		 * that most triangles are culled by node flags.
+		 */
+		if(kernel_tex_fetch(__prim_visibility, prim_addr) & visibility)
 #endif
-	{
-#ifdef __KERNEL_CUDA__
-		if(A == B && B == C) {
-			return false;
+		{
+			isect->prim = prim_addr;
+			isect->object = object;
+			isect->type = PRIMITIVE_TRIANGLE;
+			isect->u = u;
+			isect->v = v;
+			isect->t = t;
+			return true;
 		}
-#endif
-		/* Normalize U, V, W, and T. */
-		const float inv_det = 1.0f / det;
-		isect->prim = prim_addr;
-		isect->object = object;
-		isect->type = PRIMITIVE_TRIANGLE;
-		isect->u = U * inv_det;
-		isect->v = V * inv_det;
-		isect->t = T * inv_det;
-		return true;
 	}
 	return false;
 }
 
-/* Special ray intersection routines for subsurface scattering. In that case we
+/* Special ray intersection routines for local intersection. In that case we
  * only want to intersect with primitives in the same object, and if case of
  * multiple hits we pick a single random primitive as the intersection point.
  */
 
-#ifdef __SUBSURFACE__
-ccl_device_inline void triangle_intersect_subsurface(
+#ifdef __BVH_LOCAL__
+ccl_device_inline void triangle_intersect_local(
         KernelGlobals *kg,
-        const IsectPrecalc *isect_precalc,
-        SubsurfaceIntersection *ss_isect,
+        LocalIntersection *local_isect,
         float3 P,
+        float3 dir,
         int object,
+		int local_object,
         int prim_addr,
         float tmax,
         uint *lcg_state,
         int max_hits)
 {
-	const int kx = isect_precalc->kx;
-	const int ky = isect_precalc->ky;
-	const int kz = isect_precalc->kz;
-	const float Sx = isect_precalc->Sx;
-	const float Sy = isect_precalc->Sy;
-	const float Sz = isect_precalc->Sz;
-
-	/* Calculate vertices relative to ray origin. */
-	const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr);
-	const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0),
-	             tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1),
-	             tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2);
-
-#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
-	const avxf avxf_P(P.m128, P.m128);
-
-	const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0);
-	const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1);
-
-	const avxf AB = tri_ab - avxf_P;
-	const avxf BC = tri_bc - avxf_P;
-
-	const __m256i permuteMask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx);
-
-	const avxf AB_k = shuffle(AB, permuteMask);
-	const avxf BC_k = shuffle(BC, permuteMask);
-
-	/* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */
-	const avxf ABBC_kz = shuffle<2>(AB_k, BC_k);
-
-	/* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */
-	const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k);
-
-	const avxf Sxy(Sy, Sx, Sy, Sx);
-
-	/* Ax, Ay, Bx, By, Bx, By, Cx, Cy */
-	const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy);
-
-	float ABBC_kz_array[8];
-	_mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz);
-
-	const float A_kz = ABBC_kz_array[0];
-	const float B_kz = ABBC_kz_array[2];
-	const float C_kz = ABBC_kz_array[6];
-
-	/* By, Bx, Cy, Cx, By, Bx, Ay, Ax */
-	const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy);
-
-	const avxf negMask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000);
-
-	/* W           U                             V
-	 * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX
-	 */
-	const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, negMask /* Dont care */);
-
-	const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ negMask;
-
-	/* Calculate scaled barycentric coordinates. */
-	float WUVW_array[4];
-	_mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW));
-
-	const float W = WUVW_array[0];
-	const float U = WUVW_array[1];
-	const float V = WUVW_array[2];
-
-	const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW);
-	const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW,
-	                                               _mm256_setzero_ps(), 0));
-
-	if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) {
-		return;
+	/* Only intersect with matching object. For instanced objects we
+	 * already know we are only intersecting the right object. */
+	if(object == OBJECT_NONE) {
+		if(kernel_tex_fetch(__prim_object, prim_addr) != local_object) {
+			return;
+		}
 	}
+
+	const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr);
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	const ssef *ssef_verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex];
 #else
-	const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z);
-	const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z);
-	const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z);
-
-	const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz);
-	const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz);
-	const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz);
-
-	/* Perform shear and scale of vertices. */
-	const float Ax = A_kx - Sx * A_kz;
-	const float Ay = A_ky - Sy * A_kz;
-	const float Bx = B_kx - Sx * B_kz;
-	const float By = B_ky - Sy * B_kz;
-	const float Cx = C_kx - Sx * C_kz;
-	const float Cy = C_ky - Sy * C_kz;
-
-	/* Calculate scaled barycentric coordinates. */
-	float U = Cx * By - Cy * Bx;
-	float V = Ax * Cy - Ay * Cx;
-	float W = Bx * Ay - By * Ax;
-
-	if((U < 0.0f || V < 0.0f || W < 0.0f) &&
-	   (U > 0.0f || V > 0.0f || W > 0.0f))
-	{
-		return;
-	}
+	const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+0)),
+	             tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+1)),
+	             tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+2));
 #endif
-
-	/* Calculate determinant. */
-	float det = U + V + W;
-	if(UNLIKELY(det == 0.0f)) {
-		return;
-	}
-
-	/* Calculate scaled z−coordinates of vertices and use them to calculate
-	 * the hit distance.
-	 */
-	const int sign_det = (__float_as_int(det) & 0x80000000);
-	const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz;
-	const float sign_T = xor_signmask(T, sign_det);
-	if((sign_T < 0.0f) ||
-	   (sign_T > tmax * xor_signmask(det, sign_det)))
+	float t, u, v;
+	if(!ray_triangle_intersect(P,
+	                           dir,
+	                           tmax,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	                           ssef_verts,
+#else
+	                           tri_a, tri_b, tri_c,
+#endif
+	                           &u, &v, &t))
 	{
 		return;
 	}
 
-	/* Normalize U, V, W, and T. */
-	const float inv_det = 1.0f / det;
-
-	const float t = T * inv_det;
-	for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) {
-		if(ss_isect->hits[i].t == t) {
-			return;
+	int hit;
+	if(lcg_state) {
+		/* Record up to max_hits intersections. */
+		for(int i = min(max_hits, local_isect->num_hits) - 1; i >= 0; --i) {
+			if(local_isect->hits[i].t == t) {
+				return;
+			}
 		}
-	}
 
-	ss_isect->num_hits++;
-	int hit;
+		local_isect->num_hits++;
 
-	if(ss_isect->num_hits <= max_hits) {
-		hit = ss_isect->num_hits - 1;
+		if(local_isect->num_hits <= max_hits) {
+			hit = local_isect->num_hits - 1;
+		}
+		else {
+			/* Reservoir sampling: if we are at the maximum number of
+			 * hits, randomly replace element or skip it. */
+			hit = lcg_step_uint(lcg_state) % local_isect->num_hits;
+
+			if(hit >= max_hits)
+				return;
+		}
 	}
 	else {
-		/* reservoir sampling: if we are at the maximum number of
-		 * hits, randomly replace element or skip it */
-		hit = lcg_step_uint(lcg_state) % ss_isect->num_hits;
-
-		if(hit >= max_hits)
+		/* Record closest intersection only. */
+		if(local_isect->num_hits && t > local_isect->hits[0].t) {
 			return;
+		}
+
+		hit = 0;
+		local_isect->num_hits = 1;
 	}
 
-	/* record intersection */
-	Intersection *isect = &ss_isect->hits[hit];
+	/* Record intersection. */
+	Intersection *isect = &local_isect->hits[hit];
 	isect->prim = prim_addr;
 	isect->object = object;
 	isect->type = PRIMITIVE_TRIANGLE;
-	isect->u = U * inv_det;
-	isect->v = V * inv_det;
+	isect->u = u;
+	isect->v = v;
 	isect->t = t;
 
 	/* Record geometric normal. */
-	/* TODO(sergey): Use float4_to_float3() on just an edges. */
-	const float3 v0 = float4_to_float3(tri_a);
-	const float3 v1 = float4_to_float3(tri_b);
-	const float3 v2 = float4_to_float3(tri_c);
-	ss_isect->Ng[hit] = normalize(cross(v1 - v0, v2 - v0));
-}
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+0)),
+	             tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+1)),
+	             tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+2));
 #endif
+	local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
+}
+#endif  /* __BVH_LOCAL__ */
 
 /* Refine triangle intersection to more precise hit point. For rays that travel
  * far the precision is often not so good, this reintersects the primitive from
@@ -457,7 +196,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
 			return P;
 		}
 #  ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
+		Transform tfm = sd->ob_itfm;
 #  else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
 #  endif
@@ -491,7 +230,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #  ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
+		Transform tfm = sd->ob_tfm;
 #  else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
 #  endif
@@ -508,10 +247,10 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
 /* Same as above, except that isect->t is assumed to be in object space for
  * instancing.
  */
-ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
-                                                    ShaderData *sd,
-                                                    const Intersection *isect,
-                                                    const Ray *ray)
+ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg,
+                                               ShaderData *sd,
+                                               const Intersection *isect,
+                                               const Ray *ray)
 {
 	float3 P = ray->P;
 	float3 D = ray->D;
@@ -519,7 +258,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
+		Transform tfm = sd->ob_itfm;
 #else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -557,7 +296,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
+		Transform tfm = sd->ob_tfm;
 #else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -570,6 +309,4 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
 	return P;
 }
 
-#undef IDX
-
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 03724c955be..346f228e961 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -29,21 +29,6 @@ CCL_NAMESPACE_BEGIN
 
 /* Return position normalized to 0..1 in mesh bounds */
 
-#if defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300
-ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z)
-{
-	float4 r;
-	switch(id) {
-		case 0: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_000, x, y, z); break;
-		case 1: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_001, x, y, z); break;
-		case 2: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_002, x, y, z); break;
-		case 3: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_003, x, y, z); break;
-		case 4: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_004, x, y, z); break;
-	}
-	return r;
-}
-#endif  /* __KERNEL_CUDA__ */
-
 ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg,
                                                     const ShaderData *sd,
                                                     float3 P)
@@ -64,24 +49,9 @@ ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg,
 
 ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
 {
-	float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P));
-#ifdef __KERNEL_CUDA__
-#  if __CUDA_ARCH__ >= 300
-	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset);
-	float f = kernel_tex_image_interp_3d_float(tex, P.x, P.y, P.z);
-	float4 r = make_float4(f, f, f, 1.0f);
-#  else
-	float4 r = volume_image_texture_3d(desc.offset, P.x, P.y, P.z);
-#  endif
-#elif defined(__KERNEL_OPENCL__)
-	float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z);
-#else
-	float4 r;
-	if(sd->flag & SD_VOLUME_CUBIC)
-		r = kernel_tex_image_interp_3d_ex(desc.offset, P.x, P.y, P.z, INTERPOLATION_CUBIC);
-	else
-		r = kernel_tex_image_interp_3d(desc.offset, P.x, P.y, P.z);
-#endif
+	float3 P = volume_normalized_position(kg, sd, sd->P);
+	InterpolationType interp = (sd->flag & SD_VOLUME_CUBIC)? INTERPOLATION_CUBIC: INTERPOLATION_NONE;
+	float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z, interp);
 
 	if(dx) *dx = 0.0f;
 	if(dy) *dy = 0.0f;
@@ -91,28 +61,20 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 
 ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy)
 {
-	float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P));
-#ifdef __KERNEL_CUDA__
-#  if __CUDA_ARCH__ >= 300
-	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset);
-	float4 r = kernel_tex_image_interp_3d_float4(tex, P.x, P.y, P.z);
-#  else
-	float4 r = volume_image_texture_3d(desc.offset, P.x, P.y, P.z);
-#  endif
-#elif defined(__KERNEL_OPENCL__)
-	float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z);
-#else
-	float4 r;
-	if(sd->flag & SD_VOLUME_CUBIC)
-		r = kernel_tex_image_interp_3d_ex(desc.offset, P.x, P.y, P.z, INTERPOLATION_CUBIC);
-	else
-		r = kernel_tex_image_interp_3d(desc.offset, P.x, P.y, P.z);
-#endif
+	float3 P = volume_normalized_position(kg, sd, sd->P);
+	InterpolationType interp = (sd->flag & SD_VOLUME_CUBIC)? INTERPOLATION_CUBIC: INTERPOLATION_NONE;
+	float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z, interp);
 
 	if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
 	if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 
-	return float4_to_float3(r);
+	if(r.w > 1e-6f && r.w != 1.0f) {
+		/* For RGBA colors, unpremultiply after interpolation. */
+		return float4_to_float3(r) / r.w;
+	}
+	else {
+		return float4_to_float3(r);
+	}
 }
 
 #endif
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 9279a94c13a..cfb35dd33f5 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -19,7 +19,8 @@
 
 /* CPU Kernel Interface */
 
-#include "util_types.h"
+#include "util/util_types.h"
+#include "kernel/kernel_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -28,6 +29,7 @@ CCL_NAMESPACE_BEGIN
 #define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
 
 struct KernelGlobals;
+struct KernelData;
 
 KernelGlobals *kernel_globals_create();
 void kernel_globals_free(KernelGlobals *kg);
@@ -38,40 +40,26 @@ bool kernel_osl_use(KernelGlobals *kg);
 void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size);
 void kernel_tex_copy(KernelGlobals *kg,
                      const char *name,
-                     device_ptr mem,
-                     size_t width,
-                     size_t height,
-                     size_t depth,
-                     InterpolationType interpolation=INTERPOLATION_LINEAR,
-                     ExtensionType extension = EXTENSION_REPEAT);
+                     void *mem,
+                     size_t size);
 
 #define KERNEL_ARCH cpu
-#include "kernels/cpu/kernel_cpu.h"
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-#  define KERNEL_ARCH cpu_sse2
-#  include "kernels/cpu/kernel_cpu.h"
-#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-#  define KERNEL_ARCH cpu_sse3
-#  include "kernels/cpu/kernel_cpu.h"
-#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-#  define KERNEL_ARCH cpu_sse41
-#  include "kernels/cpu/kernel_cpu.h"
-#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-#  define KERNEL_ARCH cpu_avx
-#  include "kernels/cpu/kernel_cpu.h"
-#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-#  define KERNEL_ARCH cpu_avx2
-#  include "kernels/cpu/kernel_cpu.h"
-#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu.h"
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 6c3ee6b8098..c0f281cae97 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -21,6 +21,9 @@ CCL_NAMESPACE_BEGIN
  * BSDF evaluation result, split per BSDF type. This is used to accumulate
  * render passes separately. */
 
+ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg,
+                                           const ShaderData *sd);
+
 ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 value, int use_light_pass)
 {
 #ifdef __PASSES__
@@ -52,10 +55,17 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v
 	{
 		eval->diffuse = value;
 	}
+#ifdef __SHADOW_TRICKS__
+	eval->sum_no_mis = make_float3(0.0f, 0.0f, 0.0f);
+#endif
 }
 
-ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
+ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value, float mis_weight)
 {
+#ifdef __SHADOW_TRICKS__
+	eval->sum_no_mis += value;
+#endif
+	value *= mis_weight;
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
 		if(CLOSURE_IS_BSDF_DIFFUSE(type))
@@ -96,7 +106,7 @@ ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
 	}
 }
 
-ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value)
+ccl_device_inline void bsdf_eval_mis(BsdfEval *eval, float value)
 {
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
@@ -115,8 +125,19 @@ ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value)
 	}
 }
 
+ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value)
+{
+#ifdef __SHADOW_TRICKS__
+	eval->sum_no_mis *= value;
+#endif
+	bsdf_eval_mis(eval, value);
+}
+
 ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value)
 {
+#ifdef __SHADOW_TRICKS__
+	eval->sum_no_mis *= value;
+#endif
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
 		eval->diffuse *= value;
@@ -134,7 +155,7 @@ ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value)
 #endif
 }
 
-ccl_device_inline float3 bsdf_eval_sum(BsdfEval *eval)
+ccl_device_inline float3 bsdf_eval_sum(const BsdfEval *eval)
 {
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
@@ -160,14 +181,12 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
 
 	if(use_light_pass) {
 		L->indirect = make_float3(0.0f, 0.0f, 0.0f);
-		L->direct_throughput = make_float3(0.0f, 0.0f, 0.0f);
 		L->direct_emission = make_float3(0.0f, 0.0f, 0.0f);
 
 		L->color_diffuse = make_float3(0.0f, 0.0f, 0.0f);
 		L->color_glossy = make_float3(0.0f, 0.0f, 0.0f);
 		L->color_transmission = make_float3(0.0f, 0.0f, 0.0f);
 		L->color_subsurface = make_float3(0.0f, 0.0f, 0.0f);
-		L->color_scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 		L->direct_diffuse = make_float3(0.0f, 0.0f, 0.0f);
 		L->direct_glossy = make_float3(0.0f, 0.0f, 0.0f);
@@ -181,45 +200,78 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
 		L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f);
 		L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f);
 
-		L->path_diffuse = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_glossy = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_transmission = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_scatter = make_float3(0.0f, 0.0f, 0.0f);
-
+		L->transparent = 0.0f;
 		L->emission = make_float3(0.0f, 0.0f, 0.0f);
 		L->background = make_float3(0.0f, 0.0f, 0.0f);
 		L->ao = make_float3(0.0f, 0.0f, 0.0f);
 		L->shadow = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 		L->mist = 0.0f;
+
+		L->state.diffuse = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.glossy = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.transmission = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.subsurface = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.scatter = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.direct = make_float3(0.0f, 0.0f, 0.0f);
 	}
 	else
 #endif
 	{
+		L->transparent = 0.0f;
 		L->emission = make_float3(0.0f, 0.0f, 0.0f);
 	}
+
+#ifdef __SHADOW_TRICKS__
+	L->path_total = make_float3(0.0f, 0.0f, 0.0f);
+	L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f);
+	L->shadow_background_color = make_float3(0.0f, 0.0f, 0.0f);
+	L->shadow_throughput = 0.0f;
+	L->shadow_transparency = 1.0f;
+	L->has_shadow_catcher = 0;
+#endif
+
+#ifdef __DENOISING_FEATURES__
+	L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f);
+	L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f);
+	L->denoising_depth = 0.0f;
+#endif
+
+#ifdef __KERNEL_DEBUG__
+	L->debug_data.num_bvh_traversed_nodes = 0;
+	L->debug_data.num_bvh_traversed_instances = 0;
+	L->debug_data.num_bvh_intersections = 0;
+	L->debug_data.num_ray_bounces = 0;
+#endif
 }
 
-ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
-	BsdfEval *bsdf_eval, float bsdf_pdf, int bounce, int bsdf_label)
+ccl_device_inline void path_radiance_bsdf_bounce(
+	KernelGlobals *kg,
+	PathRadianceState *L_state,
+	ccl_addr_space float3 *throughput,
+	BsdfEval *bsdf_eval,
+	float bsdf_pdf, int bounce, int bsdf_label)
 {
 	float inverse_pdf = 1.0f/bsdf_pdf;
 
 #ifdef __PASSES__
-	if(L->use_light_pass) {
+	if(kernel_data.film.use_light_pass) {
 		if(bounce == 0 && !(bsdf_label & LABEL_TRANSPARENT)) {
 			/* first on directly visible surface */
 			float3 value = *throughput*inverse_pdf;
 
-			L->path_diffuse = bsdf_eval->diffuse*value;
-			L->path_glossy = bsdf_eval->glossy*value;
-			L->path_transmission = bsdf_eval->transmission*value;
-			L->path_subsurface = bsdf_eval->subsurface*value;
-			L->path_scatter = bsdf_eval->scatter*value;
-
-			*throughput = L->path_diffuse + L->path_glossy + L->path_transmission + L->path_subsurface + L->path_scatter;
+			L_state->diffuse = bsdf_eval->diffuse*value;
+			L_state->glossy = bsdf_eval->glossy*value;
+			L_state->transmission = bsdf_eval->transmission*value;
+			L_state->subsurface = bsdf_eval->subsurface*value;
+			L_state->scatter = bsdf_eval->scatter*value;
+
+			*throughput = L_state->diffuse +
+			              L_state->glossy +
+			              L_state->transmission +
+			              L_state->subsurface +
+			              L_state->scatter;
 			
-			L->direct_throughput = *throughput;
+			L_state->direct = *throughput;
 		}
 		else {
 			/* transparent bounce before first hit, or indirectly visible through BSDF */
@@ -234,13 +286,22 @@ ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space
 	}
 }
 
-ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 throughput, float3 value, int bounce)
+ccl_device_inline void path_radiance_accum_emission(PathRadiance *L,
+                                                    ccl_addr_space PathState *state,
+                                                    float3 throughput,
+                                                    float3 value)
 {
+#ifdef __SHADOW_TRICKS__
+	if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+		return;
+	}
+#endif
+
 #ifdef __PASSES__
 	if(L->use_light_pass) {
-		if(bounce == 0)
+		if(state->bounce == 0)
 			L->emission += throughput*value;
-		else if(bounce == 1)
+		else if(state->bounce == 1)
 			L->direct_emission += throughput*value;
 		else
 			L->indirect += throughput*value;
@@ -252,11 +313,28 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro
 	}
 }
 
-ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, float3 throughput, float3 alpha, float3 bsdf, float3 ao, int bounce)
+ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
+                                              ccl_addr_space PathState *state,
+                                              float3 throughput,
+                                              float3 alpha,
+                                              float3 bsdf,
+                                              float3 ao)
 {
+#ifdef __SHADOW_TRICKS__
+	if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+		float3 light = throughput * bsdf;
+		L->path_total += light;
+		L->path_total_shaded += ao * light;
+
+		if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+			return;
+		}
+	}
+#endif
+
 #ifdef __PASSES__
 	if(L->use_light_pass) {
-		if(bounce == 0) {
+		if(state->bounce == 0) {
 			/* directly visible lighting */
 			L->direct_diffuse += throughput*bsdf*ao;
 			L->ao += alpha*throughput*ao;
@@ -273,11 +351,47 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, float3 throughput
 	}
 }
 
-ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp)
+/* Add throughput*bsdf to L->path_total for paths flagged with
+ * PATH_RAY_STORE_SHADOW_INFO; compiles to a no-op without __SHADOW_TRICKS__. */
+ccl_device_inline void path_radiance_accum_total_ao(
+        PathRadiance *L,
+        ccl_addr_space PathState *state,
+        float3 throughput, float3 bsdf)
+{
+#ifdef __SHADOW_TRICKS__
+	if(!(state->flag & PATH_RAY_STORE_SHADOW_INFO)) {
+		return;
+	}
+	L->path_total += throughput * bsdf;
+#else
+	(void) L; (void) state;
+	(void) throughput; (void) bsdf;
+#endif  /* __SHADOW_TRICKS__ */
+}
+
+ccl_device_inline void path_radiance_accum_light(PathRadiance *L,
+                                                 ccl_addr_space PathState *state,
+                                                 float3 throughput,
+                                                 BsdfEval *bsdf_eval,
+                                                 float3 shadow,
+                                                 float shadow_fac,
+                                                 bool is_lamp)
 {
+#ifdef __SHADOW_TRICKS__
+	if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+		float3 light = throughput * bsdf_eval->sum_no_mis;
+		L->path_total += light;
+		L->path_total_shaded += shadow * light;
+
+		if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+			return;
+		}
+	}
+#endif
+
 #ifdef __PASSES__
 	if(L->use_light_pass) {
-		if(bounce == 0) {
+		if(state->bounce == 0) {
 			/* directly visible lighting */
 			L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow;
 			L->direct_glossy += throughput*bsdf_eval->glossy*shadow;
@@ -303,13 +417,47 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
 	}
 }
 
-ccl_device_inline void path_radiance_accum_background(PathRadiance *L, float3 throughput, float3 value, int bounce)
+/* Add throughput*sum_no_mis of the BSDF eval to L->path_total for paths
+ * flagged with PATH_RAY_STORE_SHADOW_INFO; a no-op without __SHADOW_TRICKS__. */
+ccl_device_inline void path_radiance_accum_total_light(
+        PathRadiance *L,
+        ccl_addr_space PathState *state,
+        float3 throughput, const BsdfEval *bsdf_eval)
 {
+#ifdef __SHADOW_TRICKS__
+	if(!(state->flag & PATH_RAY_STORE_SHADOW_INFO)) {
+		return;
+	}
+	L->path_total += throughput * bsdf_eval->sum_no_mis;
+#else
+	(void) L; (void) state;
+	(void) throughput; (void) bsdf_eval;
+#endif  /* __SHADOW_TRICKS__ */
+}
+
+ccl_device_inline void path_radiance_accum_background(
+        PathRadiance *L,
+        ccl_addr_space PathState *state,
+        float3 throughput,
+        float3 value)
+{
+
+#ifdef __SHADOW_TRICKS__
+	if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+		L->path_total += throughput * value;
+		L->path_total_shaded += throughput * value * L->shadow_transparency;
+
+		if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+			return;
+		}
+	}
+#endif
+
 #ifdef __PASSES__
 	if(L->use_light_pass) {
-		if(bounce == 0)
+		if(state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)
 			L->background += throughput*value;
-		else if(bounce == 1)
+		else if(state->bounce == 1)
 			L->direct_emission += throughput*value;
 		else
 			L->indirect += throughput*value;
@@ -319,7 +467,31 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L, float3 th
 	{
 		L->emission += throughput*value;
 	}
+
+#ifdef __DENOISING_FEATURES__
+	L->denoising_albedo += state->denoising_feature_weight * value;
+#endif  /* __DENOISING_FEATURES__ */
+}
+
+/* Accumulate the average of throughput into L->transparent. */
+ccl_device_inline void path_radiance_accum_transparent(
+        PathRadiance *L, ccl_addr_space PathState *state, float3 throughput)
+{
+	(void) state;  /* not read here; avoids an unused-parameter warning */
+	L->transparent += average(throughput);
+}
+
+#ifdef __SHADOW_TRICKS__
+/* Register a shadow catcher hit in L: accumulates the averaged throughput
+ * and the throughput-weighted background color, and raises the catcher flag. */
+ccl_device_inline void path_radiance_accum_shadowcatcher(
+        PathRadiance *L, float3 throughput, float3 background)
+{
+	L->has_shadow_catcher = 1;
+	L->shadow_background_color += throughput * background;
+	L->shadow_throughput += average(throughput);
 }
+#endif  /* __SHADOW_TRICKS__ */
 
 ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
 {
@@ -328,19 +500,19 @@ ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
 	 * only a single throughput further along the path, here we recover just
 	 * the indirect path that is not influenced by any particular BSDF type */
 	if(L->use_light_pass) {
-		L->direct_emission = safe_divide_color(L->direct_emission, L->direct_throughput);
-		L->direct_diffuse += L->path_diffuse*L->direct_emission;
-		L->direct_glossy += L->path_glossy*L->direct_emission;
-		L->direct_transmission += L->path_transmission*L->direct_emission;
-		L->direct_subsurface += L->path_subsurface*L->direct_emission;
-		L->direct_scatter += L->path_scatter*L->direct_emission;
-
-		L->indirect = safe_divide_color(L->indirect, L->direct_throughput);
-		L->indirect_diffuse += L->path_diffuse*L->indirect;
-		L->indirect_glossy += L->path_glossy*L->indirect;
-		L->indirect_transmission += L->path_transmission*L->indirect;
-		L->indirect_subsurface += L->path_subsurface*L->indirect;
-		L->indirect_scatter += L->path_scatter*L->indirect;
+		L->direct_emission = safe_divide_color(L->direct_emission, L->state.direct);
+		L->direct_diffuse += L->state.diffuse*L->direct_emission;
+		L->direct_glossy += L->state.glossy*L->direct_emission;
+		L->direct_transmission += L->state.transmission*L->direct_emission;
+		L->direct_subsurface += L->state.subsurface*L->direct_emission;
+		L->direct_scatter += L->state.scatter*L->direct_emission;
+
+		L->indirect = safe_divide_color(L->indirect, L->state.direct);
+		L->indirect_diffuse += L->state.diffuse*L->indirect;
+		L->indirect_glossy += L->state.glossy*L->indirect;
+		L->indirect_transmission += L->state.transmission*L->indirect;
+		L->indirect_subsurface += L->state.subsurface*L->indirect;
+		L->indirect_scatter += L->state.scatter*L->indirect;
 	}
 #endif
 }
@@ -349,11 +521,11 @@ ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L)
 {
 #ifdef __PASSES__
 	if(L->use_light_pass) {
-		L->path_diffuse = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_glossy = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_transmission = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_scatter = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.diffuse = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.glossy = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.transmission = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.subsurface = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 		L->direct_emission = make_float3(0.0f, 0.0f, 0.0f);
 		L->indirect = make_float3(0.0f, 0.0f, 0.0f);
@@ -366,11 +538,7 @@ ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L,
 {
 #ifdef __PASSES__
 	if(L->use_light_pass) {
-		L->path_diffuse = L_src->path_diffuse;
-		L->path_glossy = L_src->path_glossy;
-		L->path_transmission = L_src->path_transmission;
-		L->path_subsurface = L_src->path_subsurface;
-		L->path_scatter = L_src->path_scatter;
+		L->state = L_src->state;
 
 		L->direct_emission = L_src->direct_emission;
 		L->indirect = L_src->indirect;
@@ -378,7 +546,40 @@ ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L,
 #endif
 }
 
-ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadiance *L)
+#ifdef __SHADOW_TRICKS__
+/* Fold the shadow catcher data in L into the result. The shadow factor is
+ * average(path_total_shaded)/average(path_total), or shadow_transparency for a
+ * zero total; it reduces *alpha or scales the background added to *L_sum. */
+ccl_device_inline void path_radiance_sum_shadowcatcher(KernelGlobals *kg,
+                                                       PathRadiance *L,
+                                                       float3 *L_sum,
+                                                       float *alpha)
+{
+	const float total = average(L->path_total);
+	float shadow_factor;
+
+	if(UNLIKELY(!isfinite_safe(total))) {
+		kernel_assert(!"Non-finite total radiance along the path");
+		shadow_factor = 0.0f;
+	}
+	else if(total != 0.0f) {
+		shadow_factor = average(L->path_total_shaded) / total;
+	}
+	else {
+		shadow_factor = L->shadow_transparency;
+	}
+
+	if(!kernel_data.background.transparent) {
+		L->shadow_background_color *= shadow_factor;
+		*L_sum += L->shadow_background_color;
+	}
+	else {
+		*alpha -= L->shadow_throughput * shadow_factor;
+	}
+}
+#endif  /* __SHADOW_TRICKS__ */
+
+ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadiance *L, float *alpha)
 {
 	float3 L_sum;
 	/* Light Passes are used */
@@ -399,7 +600,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 		float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
 
 		/* Reject invalid value */
-		if(!isfinite(sum)) {
+		if(!isfinite_safe(sum)) {
 			kernel_assert(!"Non-finite sum in path_radiance_clamp_and_sum!");
 			L_sum = make_float3(0.0f, 0.0f, 0.0f);
 
@@ -455,8 +656,6 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 			L_sum = L_direct + L_indirect;
 		}
 #endif
-
-		return L_sum;
 	}
 
 	/* No Light Passes */
@@ -464,42 +663,105 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 #endif
 	{
 		L_sum = L->emission;
+
+		/* Reject invalid value */
+		float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
+		if(!isfinite_safe(sum)) {
+			kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!");
+			L_sum = make_float3(0.0f, 0.0f, 0.0f);
+		}
 	}
 
-	/* Reject invalid value */
-	float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
-	if(!isfinite(sum)) {
-		kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!");
-		L_sum = make_float3(0.0f, 0.0f, 0.0f);
+	/* Compute alpha. */
+	*alpha = 1.0f - L->transparent;
+
+	/* Add shadow catcher contributions. */
+#ifdef __SHADOW_TRICKS__
+	if(L->has_shadow_catcher) {
+		path_radiance_sum_shadowcatcher(kg, L, &L_sum, alpha);
 	}
+#endif  /* __SHADOW_TRICKS__ */
 
 	return L_sum;
 }
 
-ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples)
+/* Split the radiance in L into *noisy and *clean. A direct/indirect component
+ * goes to *clean if its DENOISING_CLEAN_* bit is set in denoising_flags. */
+ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadiance *L, float3 *noisy, float3 *clean)
+{
+#ifdef __PASSES__
+	kernel_assert(L->use_light_pass);
+	*clean = L->emission + L->background;
+	*noisy = L->direct_scatter + L->indirect_scatter;
+
+#  define ADD_COMPONENT(flag, component) \
+	do { \
+		if(kernel_data.film.denoising_flags & (flag)) *clean += (component); \
+		else *noisy += (component); \
+	} while(0)
+
+	ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR,      L->direct_diffuse);
+	ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND,      L->indirect_diffuse);
+	ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR,       L->direct_glossy);
+	ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND,       L->indirect_glossy);
+	ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission);
+	ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission);
+	ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR,   L->direct_subsurface);
+	ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND,   L->indirect_subsurface);
+#  undef ADD_COMPONENT
+#else
+	*noisy = L->emission;
+	*clean = make_float3(0.0f, 0.0f, 0.0f);
+#endif  /* __PASSES__ */
+#ifdef __SHADOW_TRICKS__
+	if(L->has_shadow_catcher) {
+		*noisy += L->shadow_background_color;
+	}
+#endif  /* __SHADOW_TRICKS__ */
+
+	*noisy = ensure_finite3(*noisy);
+	*clean = ensure_finite3(*clean);
+}
+
+ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample)
 {
-	float fac = 1.0f/num_samples;
+#ifdef __SPLIT_KERNEL__  /* fields of L are accumulated with atomic float adds */
+#  define safe_float3_add(f, v) \
+	do { \
+		ccl_global float *p = (ccl_global float*)(&(f)); \
+		atomic_add_and_fetch_float(p+0, (v).x); \
+		atomic_add_and_fetch_float(p+1, (v).y); \
+		atomic_add_and_fetch_float(p+2, (v).z); \
+	} while(0)
+#  define safe_float_add(f, v) \
+		atomic_add_and_fetch_float(&(f), (v))
+#else  /* plain in-place addition */
+#  define safe_float3_add(f, v) (f) += (v)
+#  define safe_float_add(f, v) (f) += (v)
+#endif  /* __SPLIT_KERNEL__ */
 
 #ifdef __PASSES__
-	L->direct_diffuse += L_sample->direct_diffuse*fac;
-	L->direct_glossy += L_sample->direct_glossy*fac;
-	L->direct_transmission += L_sample->direct_transmission*fac;
-	L->direct_subsurface += L_sample->direct_subsurface*fac;
-	L->direct_scatter += L_sample->direct_scatter*fac;
-
-	L->indirect_diffuse += L_sample->indirect_diffuse*fac;
-	L->indirect_glossy += L_sample->indirect_glossy*fac;
-	L->indirect_transmission += L_sample->indirect_transmission*fac;
-	L->indirect_subsurface += L_sample->indirect_subsurface*fac;
-	L->indirect_scatter += L_sample->indirect_scatter*fac;
-
-	L->background += L_sample->background*fac;
-	L->ao += L_sample->ao*fac;
-	L->shadow += L_sample->shadow*fac;
-	L->mist += L_sample->mist*fac;
-#endif
-	L->emission += L_sample->emission * fac;
+	safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse);
+	safe_float3_add(L->direct_glossy, L_sample->direct_glossy);
+	safe_float3_add(L->direct_transmission, L_sample->direct_transmission);
+	safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface);
+	safe_float3_add(L->direct_scatter, L_sample->direct_scatter);
+
+	safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse);
+	safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy);
+	safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission);
+	safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface);
+	safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter);
+
+	safe_float3_add(L->background, L_sample->background);
+	safe_float3_add(L->ao, L_sample->ao);
+	safe_float3_add(L->shadow, L_sample->shadow);  /* NOTE(review): shadow is initialized with make_float4 in this file, but the split-kernel helper adds only x/y/z -- confirm w is not needed */
+	safe_float_add(L->mist, L_sample->mist);
+#endif  /* __PASSES__ */
+	safe_float3_add(L->emission, L_sample->emission);
+
+#undef safe_float_add
+#undef safe_float3_add
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index 5bcc57cdcdf..79e6d1b4862 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void compute_light_pass(KernelGlobals *kg,
                                           ShaderData *sd,
                                           PathRadiance *L,
-                                          RNG rng,
+                                          uint rng_hash,
                                           int pass_filter,
                                           int sample)
 {
@@ -48,13 +48,13 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 	path_radiance_init(&L_sample, kernel_data.film.use_light_pass);
 
 	/* init path state */
-	path_state_init(kg, &emission_sd, &state, &rng, sample, NULL);
+	path_state_init(kg, &emission_sd, &state, rng_hash, sample, NULL);
 
 	/* evaluate surface shader */
-	float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF);
-	shader_eval_surface(kg, sd, &rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
+	shader_eval_surface(kg, sd, &state, state.flag);
 
-	/* TODO, disable the closures we won't need */
+	/* TODO, disable more closures we don't need besides transparent */
+	shader_bsdf_disable_transparency(kg, sd);
 
 #ifdef __BRANCHED_PATH__
 	if(!kernel_data.integrator.branched) {
@@ -63,13 +63,13 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 
 		/* sample ambient occlusion */
 		if(pass_filter & BAKE_FILTER_AO) {
-			kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput, shader_bsdf_alpha(kg, sd));
+			kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, throughput, shader_bsdf_alpha(kg, sd));
 		}
 
 		/* sample emission */
 		if((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) {
 			float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
-			path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
+			path_radiance_accum_emission(&L_sample, &state, throughput, emission);
 		}
 
 		bool is_sss_sample = false;
@@ -85,7 +85,6 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 			                                  &emission_sd,
 			                                  &L_sample,
 			                                  &state,
-			                                  &rng,
 			                                  &ray,
 			                                  &throughput,
 			                                  &ss_indirect))
@@ -100,13 +99,10 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 					kernel_path_indirect(kg,
 					                     &indirect_sd,
 					                     &emission_sd,
-					                     &rng,
 					                     &ray,
 					                     throughput,
-					                     state.num_samples,
 					                     &state,
 					                     &L_sample);
-					kernel_path_subsurface_accum_indirect(&ss_indirect, &L_sample);
 				}
 				is_sss_sample = true;
 			}
@@ -115,14 +111,14 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 
 		/* sample light and BSDF */
 		if(!is_sss_sample && (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT))) {
-			kernel_path_surface_connect_light(kg, &rng, sd, &emission_sd, throughput, &state, &L_sample);
+			kernel_path_surface_connect_light(kg, sd, &emission_sd, throughput, &state, &L_sample);
 
-			if(kernel_path_surface_bounce(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) {
+			if(kernel_path_surface_bounce(kg, sd, &throughput, &state, &L_sample.state, &ray)) {
 #ifdef __LAMP_MIS__
 				state.ray_t = 0.0f;
 #endif
 				/* compute indirect light */
-				kernel_path_indirect(kg, &indirect_sd, &emission_sd, &rng, &ray, throughput, 1, &state, &L_sample);
+				kernel_path_indirect(kg, &indirect_sd, &emission_sd, &ray, throughput, &state, &L_sample);
 
 				/* sum and reset indirect light pass variables for the next samples */
 				path_radiance_sum_indirect(&L_sample);
@@ -136,13 +132,13 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 
 		/* sample ambient occlusion */
 		if(pass_filter & BAKE_FILTER_AO) {
-			kernel_branched_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput);
+			kernel_branched_path_ao(kg, sd, &emission_sd, &L_sample, &state, throughput);
 		}
 
 		/* sample emission */
 		if((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) {
 			float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
-			path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
+			path_radiance_accum_emission(&L_sample, &state, throughput, emission);
 		}
 
 #ifdef __SUBSURFACE__
@@ -150,7 +146,7 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 		if((pass_filter & BAKE_FILTER_SUBSURFACE) && (sd->flag & SD_BSSRDF)) {
 			/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
 			kernel_branched_path_subsurface_scatter(kg, sd, &indirect_sd,
-				&emission_sd, &L_sample, &state, &rng, &ray, throughput);
+				&emission_sd, &L_sample, &state, &ray, throughput);
 		}
 #endif
 
@@ -160,31 +156,20 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 			/* direct light */
 			if(kernel_data.integrator.use_direct_light) {
 				int all = kernel_data.integrator.sample_all_lights_direct;
-				kernel_branched_path_surface_connect_light(kg, &rng,
+				kernel_branched_path_surface_connect_light(kg,
 					sd, &emission_sd, &state, throughput, 1.0f, &L_sample, all);
 			}
 #endif
 
 			/* indirect light */
-			kernel_branched_path_surface_indirect_light(kg, &rng,
+			kernel_branched_path_surface_indirect_light(kg,
 				sd, &indirect_sd, &emission_sd, throughput, 1.0f, &state, &L_sample);
 		}
 	}
 #endif
 
 	/* accumulate into master L */
-	path_radiance_accum_sample(L, &L_sample, 1);
-}
-
-ccl_device bool is_aa_pass(ShaderEvalType type)
-{
-	switch(type) {
-		case SHADER_EVAL_UV:
-		case SHADER_EVAL_NORMAL:
-			return false;
-		default:
-			return true;
-	}
+	path_radiance_accum_sample(L, &L_sample);
 }
 
 /* this helps with AA but it's not the real solution as it does not AA the geometry
@@ -224,7 +209,6 @@ ccl_device_inline float3 kernel_bake_shader_bsdf(KernelGlobals *kg,
 
 ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg,
                                                        ShaderData *sd,
-                                                       RNG *rng,
                                                        PathState *state,
                                                        float3 direct,
                                                        float3 indirect,
@@ -244,12 +228,12 @@ ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg,
 		}
 		else {
 			/* surface color of the pass only */
-			shader_eval_surface(kg, sd, rng, state, 0.0f, 0, SHADER_CONTEXT_MAIN);
+			shader_eval_surface(kg, sd, state, 0);
 			return kernel_bake_shader_bsdf(kg, sd, type);
 		}
 	}
 	else {
-		shader_eval_surface(kg, sd, rng, state, 0.0f, 0, SHADER_CONTEXT_MAIN);
+		shader_eval_surface(kg, sd, state, 0);
 		color = kernel_bake_shader_bsdf(kg, sd, type);
 	}
 
@@ -291,14 +275,14 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	int num_samples = kernel_data.integrator.aa_samples;
 
 	/* random number generator */
-	RNG rng = cmj_hash(offset + i, kernel_data.integrator.seed);
+	uint rng_hash = cmj_hash(offset + i, kernel_data.integrator.seed);
 
 	float filter_x, filter_y;
 	if(sample == 0) {
 		filter_x = filter_y = 0.5f;
 	}
 	else {
-		path_rng_2D(kg, &rng, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y);
+		path_rng_2D(kg, rng_hash, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y);
 	}
 
 	/* subpixel u/v offset */
@@ -332,20 +316,44 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	sd.dv.dx = dvdx;
 	sd.dv.dy = dvdy;
 
+	/* set RNG state for shaders that use sampling */
+	state.rng_hash = rng_hash;
+	state.rng_offset = 0;
+	state.sample = sample;
+	state.num_samples = num_samples;
+	state.min_ray_pdf = FLT_MAX;
+
 	/* light passes if we need more than color */
 	if(pass_filter & ~BAKE_FILTER_COLOR)
-		compute_light_pass(kg, &sd, &L, rng, pass_filter, sample);
+		compute_light_pass(kg, &sd, &L, rng_hash, pass_filter, sample);
 
 	switch(type) {
 		/* data passes */
 		case SHADER_EVAL_NORMAL:
+		case SHADER_EVAL_ROUGHNESS:
+		case SHADER_EVAL_EMISSION:
 		{
-			if((sd.flag & SD_HAS_BUMP)) {
-				shader_eval_surface(kg, &sd, &rng, &state, 0.f, 0, SHADER_CONTEXT_MAIN);
+			if(type != SHADER_EVAL_NORMAL || (sd.flag & SD_HAS_BUMP)) {
+				int path_flag = (type == SHADER_EVAL_EMISSION) ? PATH_RAY_EMISSION : 0;
+				shader_eval_surface(kg, &sd, &state, path_flag);
 			}
 
-			/* compression: normal = (2 * color) - 1 */
-			out = sd.N * 0.5f + make_float3(0.5f, 0.5f, 0.5f);
+			if(type == SHADER_EVAL_NORMAL) {
+				float3 N = sd.N;
+				if(sd.flag & SD_HAS_BUMP) {
+					N = shader_bsdf_average_normal(kg, &sd);
+				}
+
+				/* encoding: normal = (2 * color) - 1 */
+				out = N * 0.5f + make_float3(0.5f, 0.5f, 0.5f);
+			}
+			else if(type == SHADER_EVAL_ROUGHNESS) {
+				float roughness = shader_bsdf_average_roughness(&sd);
+				out = make_float3(roughness, roughness, roughness);
+			}
+			else {
+				out = shader_emissive_eval(kg, &sd);
+			}
 			break;
 		}
 		case SHADER_EVAL_UV:
@@ -353,13 +361,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 			out = primitive_uv(kg, &sd);
 			break;
 		}
-		case SHADER_EVAL_EMISSION:
-		{
-			shader_eval_surface(kg, &sd, &rng, &state, 0.f, 0, SHADER_CONTEXT_EMISSION);
-			out = shader_emissive_eval(kg, &sd);
-			break;
-		}
-
 #ifdef __PASSES__
 		/* light passes */
 		case SHADER_EVAL_AO:
@@ -370,7 +371,8 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		case SHADER_EVAL_COMBINED:
 		{
 			if((pass_filter & BAKE_FILTER_COMBINED) == BAKE_FILTER_COMBINED) {
-				out = path_radiance_clamp_and_sum(kg, &L);
+				float alpha;
+				out = path_radiance_clamp_and_sum(kg, &L, &alpha);
 				break;
 			}
 
@@ -408,7 +410,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		{
 			out = kernel_bake_evaluate_direct_indirect(kg,
 			                                           &sd,
-			                                           &rng,
 			                                           &state,
 			                                           L.direct_diffuse,
 			                                           L.indirect_diffuse,
@@ -420,7 +421,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		{
 			out = kernel_bake_evaluate_direct_indirect(kg,
 			                                           &sd,
-			                                           &rng,
 			                                           &state,
 			                                           L.direct_glossy,
 			                                           L.indirect_glossy,
@@ -432,7 +432,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		{
 			out = kernel_bake_evaluate_direct_indirect(kg,
 			                                           &sd,
-			                                           &rng,
 			                                           &state,
 			                                           L.direct_transmission,
 			                                           L.indirect_transmission,
@@ -445,7 +444,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 #ifdef __SUBSURFACE__
 			out = kernel_bake_evaluate_direct_indirect(kg,
 			                                           &sd,
-			                                           &rng,
 			                                           &state,
 			                                           L.direct_subsurface,
 			                                           L.indirect_subsurface,
@@ -479,7 +477,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 
 			/* evaluate */
 			int flag = 0; /* we can't know which type of BSDF this is for */
-			out = shader_eval_background(kg, &sd, &state, flag, SHADER_CONTEXT_MAIN);
+			out = shader_eval_background(kg, &sd, &state, flag);
 			break;
 		}
 		default:
@@ -491,86 +489,77 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	}
 
 	/* write output */
-	const float output_fac = is_aa_pass(type)? 1.0f/num_samples: 1.0f;
+	const float output_fac = 1.0f/num_samples;
 	const float4 scaled_result = make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
 
-	output[i] = (sample == 0)?  scaled_result: output[i] + scaled_result;
+	output[i] = (sample == 0)? scaled_result: output[i] + scaled_result;
 }
 
 #endif  /* __BAKING__ */
 
-ccl_device void kernel_shader_evaluate(KernelGlobals *kg,
-                                       ccl_global uint4 *input,
-                                       ccl_global float4 *output,
-                                       ccl_global float *output_luma,
-                                       ShaderEvalType type,
-                                       int i,
-                                       int sample)
+ccl_device void kernel_displace_evaluate(KernelGlobals *kg,
+                                         ccl_global uint4 *input,
+                                         ccl_global float4 *output,
+                                         int i)
 {
 	ShaderData sd;
 	PathState state = {0};
 	uint4 in = input[i];
-	float3 out;
 
-	if(type == SHADER_EVAL_DISPLACE) {
-		/* setup shader data */
-		int object = in.x;
-		int prim = in.y;
-		float u = __uint_as_float(in.z);
-		float v = __uint_as_float(in.w);
+	/* setup shader data */
+	int object = in.x;
+	int prim = in.y;
+	float u = __uint_as_float(in.z);
+	float v = __uint_as_float(in.w);
 
-		shader_setup_from_displace(kg, &sd, object, prim, u, v);
+	shader_setup_from_displace(kg, &sd, object, prim, u, v);
 
-		/* evaluate */
-		float3 P = sd.P;
-		shader_eval_displacement(kg, &sd, &state, SHADER_CONTEXT_MAIN);
-		out = sd.P - P;
+	/* evaluate */
+	float3 P = sd.P;
+	shader_eval_displacement(kg, &sd, &state);
+	float3 D = sd.P - P;
 
-		object_inverse_dir_transform(kg, &sd, &out);
-	}
-	else { // SHADER_EVAL_BACKGROUND
-		/* setup ray */
-		Ray ray;
-		float u = __uint_as_float(in.x);
-		float v = __uint_as_float(in.y);
-
-		ray.P = make_float3(0.0f, 0.0f, 0.0f);
-		ray.D = equirectangular_to_direction(u, v);
-		ray.t = 0.0f;
+	object_inverse_dir_transform(kg, &sd, &D);
+
+	/* write output */
+	output[i] += make_float4(D.x, D.y, D.z, 0.0f);
+}
+
+ccl_device void kernel_background_evaluate(KernelGlobals *kg,
+                                           ccl_global uint4 *input,
+                                           ccl_global float4 *output,
+                                           int i)
+{
+	ShaderData sd;
+	PathState state = {0};
+	uint4 in = input[i];
+
+	/* setup ray */
+	Ray ray;
+	float u = __uint_as_float(in.x);
+	float v = __uint_as_float(in.y);
+
+	ray.P = make_float3(0.0f, 0.0f, 0.0f);
+	ray.D = equirectangular_to_direction(u, v);
+	ray.t = 0.0f;
 #ifdef __CAMERA_MOTION__
-		ray.time = 0.5f;
+	ray.time = 0.5f;
 #endif
 
 #ifdef __RAY_DIFFERENTIALS__
-		ray.dD = differential3_zero();
-		ray.dP = differential3_zero();
+	ray.dD = differential3_zero();
+	ray.dP = differential3_zero();
 #endif
 
-		/* setup shader data */
-		shader_setup_from_background(kg, &sd, &ray);
+	/* setup shader data */
+	shader_setup_from_background(kg, &sd, &ray);
+
+	/* evaluate */
+	int flag = 0; /* we can't know which type of BSDF this is for */
+	float3 color = shader_eval_background(kg, &sd, &state, flag);
 
-		/* evaluate */
-		int flag = 0; /* we can't know which type of BSDF this is for */
-		out = shader_eval_background(kg, &sd, &state, flag, SHADER_CONTEXT_MAIN);
-	}
-	
 	/* write output */
-	if(sample == 0) {
-		if(output != NULL) {
-			output[i] = make_float4(out.x, out.y, out.z, 0.0f);
-		}
-		if(output_luma != NULL) {
-			output_luma[i] = average(out);
-		}
-	}
-	else {
-		if(output != NULL) {
-			output[i] += make_float4(out.x, out.y, out.z, 0.0f);
-		}
-		if(output_luma != NULL) {
-			output_luma[i] += average(out);
-		}
-	}
+	output[i] += make_float4(color.x, color.y, color.z, 0.0f);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index dedac6b1465..b73ad47dad3 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -18,9 +18,9 @@ CCL_NAMESPACE_BEGIN
 
 /* Perspective Camera */
 
-ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v)
+ccl_device float2 camera_sample_aperture(ccl_constant KernelCamera *cam, float u, float v)
 {
-	float blades = kernel_data.cam.blades;
+	float blades = cam->blades;
 	float2 bokeh;
 
 	if(blades == 0.0f) {
@@ -29,12 +29,12 @@ ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v)
 	}
 	else {
 		/* sample polygon */
-		float rotation = kernel_data.cam.bladesrotation;
+		float rotation = cam->bladesrotation;
 		bokeh = regular_polygon_sample(blades, rotation, u, v);
 	}
 
 	/* anamorphic lens bokeh */
-	bokeh.x *= kernel_data.cam.inv_aperture_ratio;
+	bokeh.x *= cam->inv_aperture_ratio;
 
 	return bokeh;
 }
@@ -42,7 +42,7 @@ ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v)
 ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
 {
 	/* create ray form raster position */
-	Transform rastertocamera = kernel_data.cam.rastertocamera;
+	ProjectionTransform rastertocamera = kernel_data.cam.rastertocamera;
 	float3 raster = make_float3(raster_x, raster_y, 0.0f);
 	float3 Pcamera = transform_perspective(&rastertocamera, raster);
 
@@ -54,13 +54,13 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo
 		 * interpolated field of view.
 		 */
 		if(ray->time < 0.5f) {
-			Transform rastertocamera_pre = kernel_data.cam.perspective_motion.pre;
+			ProjectionTransform rastertocamera_pre = kernel_data.cam.perspective_pre;
 			float3 Pcamera_pre =
 			        transform_perspective(&rastertocamera_pre, raster);
 			Pcamera = interp(Pcamera_pre, Pcamera, ray->time * 2.0f);
 		}
 		else {
-			Transform rastertocamera_post = kernel_data.cam.perspective_motion.post;
+			ProjectionTransform rastertocamera_post = kernel_data.cam.perspective_post;
 			float3 Pcamera_post =
 			        transform_perspective(&rastertocamera_post, raster);
 			Pcamera = interp(Pcamera, Pcamera_post, (ray->time - 0.5f) * 2.0f);
@@ -76,7 +76,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo
 
 	if(aperturesize > 0.0f) {
 		/* sample point on aperture */
-		float2 lensuv = camera_sample_aperture(kg, lens_u, lens_v)*aperturesize;
+		float2 lensuv = camera_sample_aperture(&kernel_data.cam, lens_u, lens_v)*aperturesize;
 
 		/* compute point on plane of focus */
 		float ft = kernel_data.cam.focaldistance/D.z;
@@ -91,17 +91,12 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo
 	Transform cameratoworld = kernel_data.cam.cameratoworld;
 
 #ifdef __CAMERA_MOTION__
-	if(kernel_data.cam.have_motion) {
-#  ifdef __KERNEL_OPENCL__
-		const MotionTransform tfm = kernel_data.cam.motion;
-		transform_motion_interpolate(&cameratoworld,
-		                             ((const DecompMotionTransform*)&tfm),
-		                             ray->time);
-#  else
-		transform_motion_interpolate(&cameratoworld,
-		                             ((const DecompMotionTransform*)&kernel_data.cam.motion),
-		                             ray->time);
-#  endif
+	if(kernel_data.cam.num_motion_steps) {
+		transform_motion_array_interpolate(
+			&cameratoworld,
+			kernel_tex_array(__camera_motion),
+			kernel_data.cam.num_motion_steps,
+			ray->time);
 	}
 #endif
 
@@ -124,7 +119,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo
 	}
 	else {
 		/* Spherical stereo */
-		spherical_stereo_transform(kg, &P, &D);
+		spherical_stereo_transform(&kernel_data.cam, &P, &D);
 		ray->P = P;
 		ray->D = D;
 
@@ -138,12 +133,12 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo
 		float3 Pcenter = Pnostereo;
 		float3 Dcenter = Pcamera;
 		Dcenter = normalize(transform_direction(&cameratoworld, Dcenter));
-		spherical_stereo_transform(kg, &Pcenter, &Dcenter);
+		spherical_stereo_transform(&kernel_data.cam, &Pcenter, &Dcenter);
 
 		float3 Px = Pnostereo;
 		float3 Dx = transform_perspective(&rastertocamera, make_float3(raster_x + 1.0f, raster_y, 0.0f));
 		Dx = normalize(transform_direction(&cameratoworld, Dx));
-		spherical_stereo_transform(kg, &Px, &Dx);
+		spherical_stereo_transform(&kernel_data.cam, &Px, &Dx);
 
 		ray->dP.dx = Px - Pcenter;
 		ray->dD.dx = Dx - Dcenter;
@@ -151,7 +146,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo
 		float3 Py = Pnostereo;
 		float3 Dy = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y + 1.0f, 0.0f));
 		Dy = normalize(transform_direction(&cameratoworld, Dy));
-		spherical_stereo_transform(kg, &Py, &Dy);
+		spherical_stereo_transform(&kernel_data.cam, &Py, &Dy);
 
 		ray->dP.dy = Py - Pcenter;
 		ray->dD.dy = Dy - Dcenter;
@@ -175,7 +170,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo
 ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
 {
 	/* create ray form raster position */
-	Transform rastertocamera = kernel_data.cam.rastertocamera;
+	ProjectionTransform rastertocamera = kernel_data.cam.rastertocamera;
 	float3 Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f));
 
 	float3 P;
@@ -186,7 +181,7 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl
 
 	if(aperturesize > 0.0f) {
 		/* sample point on aperture */
-		float2 lensuv = camera_sample_aperture(kg, lens_u, lens_v)*aperturesize;
+		float2 lensuv = camera_sample_aperture(&kernel_data.cam, lens_u, lens_v)*aperturesize;
 
 		/* compute point on plane of focus */
 		float3 Pfocus = D * kernel_data.cam.focaldistance;
@@ -203,17 +198,12 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl
 	Transform cameratoworld = kernel_data.cam.cameratoworld;
 
 #ifdef __CAMERA_MOTION__
-	if(kernel_data.cam.have_motion) {
-#  ifdef __KERNEL_OPENCL__
-		const MotionTransform tfm = kernel_data.cam.motion;
-		transform_motion_interpolate(&cameratoworld,
-		                             (const DecompMotionTransform*)&tfm,
-		                             ray->time);
-#  else
-		transform_motion_interpolate(&cameratoworld,
-		                             (const DecompMotionTransform*)&kernel_data.cam.motion,
-		                             ray->time);
-#  endif
+	if(kernel_data.cam.num_motion_steps) {
+		transform_motion_array_interpolate(
+			&cameratoworld,
+			kernel_tex_array(__camera_motion),
+			kernel_data.cam.num_motion_steps,
+			ray->time);
 	}
 #endif
 
@@ -238,17 +228,18 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl
 
 /* Panorama Camera */
 
-ccl_device_inline void camera_sample_panorama(KernelGlobals *kg,
+ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
+                                              const ccl_global DecomposedTransform *cam_motion,
                                               float raster_x, float raster_y,
                                               float lens_u, float lens_v,
                                               ccl_addr_space Ray *ray)
 {
-	Transform rastertocamera = kernel_data.cam.rastertocamera;
+	ProjectionTransform rastertocamera = cam->rastertocamera;
 	float3 Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f));
 
 	/* create ray form raster position */
 	float3 P = make_float3(0.0f, 0.0f, 0.0f);
-	float3 D = panorama_to_direction(kg, Pcamera.x, Pcamera.y);
+	float3 D = panorama_to_direction(cam, Pcamera.x, Pcamera.y);
 
 	/* indicates ray should not receive any light, outside of the lens */
 	if(is_zero(D)) {
@@ -257,15 +248,15 @@ ccl_device_inline void camera_sample_panorama(KernelGlobals *kg,
 	}
 
 	/* modify ray for depth of field */
-	float aperturesize = kernel_data.cam.aperturesize;
+	float aperturesize = cam->aperturesize;
 
 	if(aperturesize > 0.0f) {
 		/* sample point on aperture */
-		float2 lensuv = camera_sample_aperture(kg, lens_u, lens_v)*aperturesize;
+		float2 lensuv = camera_sample_aperture(cam, lens_u, lens_v)*aperturesize;
 
 		/* compute point on plane of focus */
 		float3 Dfocus = normalize(D);
-		float3 Pfocus = Dfocus * kernel_data.cam.focaldistance;
+		float3 Pfocus = Dfocus * cam->focaldistance;
 
 		/* calculate orthonormal coordinates perpendicular to Dfocus */
 		float3 U, V;
@@ -278,20 +269,15 @@ ccl_device_inline void camera_sample_panorama(KernelGlobals *kg,
 	}
 
 	/* transform ray from camera to world */
-	Transform cameratoworld = kernel_data.cam.cameratoworld;
+	Transform cameratoworld = cam->cameratoworld;
 
 #ifdef __CAMERA_MOTION__
-	if(kernel_data.cam.have_motion) {
-#  ifdef __KERNEL_OPENCL__
-		const MotionTransform tfm = kernel_data.cam.motion;
-		transform_motion_interpolate(&cameratoworld,
-		                             (const DecompMotionTransform*)&tfm,
-		                             ray->time);
-#  else
-		transform_motion_interpolate(&cameratoworld,
-		                             (const DecompMotionTransform*)&kernel_data.cam.motion,
-		                             ray->time);
-#  endif
+	if(cam->num_motion_steps) {
+		transform_motion_array_interpolate(
+			&cameratoworld,
+			cam_motion,
+			cam->num_motion_steps,
+			ray->time);
 	}
 #endif
 
@@ -299,9 +285,9 @@ ccl_device_inline void camera_sample_panorama(KernelGlobals *kg,
 	D = normalize(transform_direction(&cameratoworld, D));
 
 	/* Stereo transform */
-	bool use_stereo = kernel_data.cam.interocular_offset != 0.0f;
+	bool use_stereo = cam->interocular_offset != 0.0f;
 	if(use_stereo) {
-		spherical_stereo_transform(kg, &P, &D);
+		spherical_stereo_transform(cam, &P, &D);
 	}
 
 	ray->P = P;
@@ -313,30 +299,30 @@ ccl_device_inline void camera_sample_panorama(KernelGlobals *kg,
 	 * ray origin and direction for the center and two neighbouring pixels
 	 * and simply take their differences. */
 	float3 Pcenter = Pcamera;
-	float3 Dcenter = panorama_to_direction(kg, Pcenter.x, Pcenter.y);
+	float3 Dcenter = panorama_to_direction(cam, Pcenter.x, Pcenter.y);
 	Pcenter = transform_point(&cameratoworld, Pcenter);
 	Dcenter = normalize(transform_direction(&cameratoworld, Dcenter));
 	if(use_stereo) {
-		spherical_stereo_transform(kg, &Pcenter, &Dcenter);
+		spherical_stereo_transform(cam, &Pcenter, &Dcenter);
 	}
 
 	float3 Px = transform_perspective(&rastertocamera, make_float3(raster_x + 1.0f, raster_y, 0.0f));
-	float3 Dx = panorama_to_direction(kg, Px.x, Px.y);
+	float3 Dx = panorama_to_direction(cam, Px.x, Px.y);
 	Px = transform_point(&cameratoworld, Px);
 	Dx = normalize(transform_direction(&cameratoworld, Dx));
 	if(use_stereo) {
-		spherical_stereo_transform(kg, &Px, &Dx);
+		spherical_stereo_transform(cam, &Px, &Dx);
 	}
 
 	ray->dP.dx = Px - Pcenter;
 	ray->dD.dx = Dx - Dcenter;
 
 	float3 Py = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y + 1.0f, 0.0f));
-	float3 Dy = panorama_to_direction(kg, Py.x, Py.y);
+	float3 Dy = panorama_to_direction(cam, Py.x, Py.y);
 	Py = transform_point(&cameratoworld, Py);
 	Dy = normalize(transform_direction(&cameratoworld, Dy));
 	if(use_stereo) {
-		spherical_stereo_transform(kg, &Py, &Dy);
+		spherical_stereo_transform(cam, &Py, &Dy);
 	}
 
 	ray->dP.dy = Py - Pcenter;
@@ -345,11 +331,11 @@ ccl_device_inline void camera_sample_panorama(KernelGlobals *kg,
 
 #ifdef __CAMERA_CLIPPING__
 	/* clipping */
-	float nearclip = kernel_data.cam.nearclip;
+	float nearclip = cam->nearclip;
 	ray->P += nearclip * ray->D;
 	ray->dP.dx += nearclip * ray->dD.dx;
 	ray->dP.dy += nearclip * ray->dD.dy;
-	ray->t = kernel_data.cam.cliplength;
+	ray->t = cam->cliplength;
 #else
 	ray->t = FLT_MAX;
 #endif
@@ -410,12 +396,16 @@ ccl_device_inline void camera_sample(KernelGlobals *kg,
 #endif
 
 	/* sample */
-	if(kernel_data.cam.type == CAMERA_PERSPECTIVE)
+	if(kernel_data.cam.type == CAMERA_PERSPECTIVE) {
 		camera_sample_perspective(kg, raster_x, raster_y, lens_u, lens_v, ray);
-	else if(kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+	}
+	else if(kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) {
 		camera_sample_orthographic(kg, raster_x, raster_y, lens_u, lens_v, ray);
-	else
-		camera_sample_panorama(kg, raster_x, raster_y, lens_u, lens_v, ray);
+	}
+	else {
+		const ccl_global DecomposedTransform *cam_motion = kernel_tex_array(__camera_motion);
+		camera_sample_panorama(&kernel_data.cam, cam_motion, raster_x, raster_y, lens_u, lens_v, ray);
+	}
 }
 
 /* Utilities */
@@ -457,22 +447,22 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 {
 	if(kernel_data.cam.type != CAMERA_PANORAMA) {
 		/* perspective / ortho */
-		if(ccl_fetch(sd, object) == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
+		if(sd->object == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
 			P += camera_position(kg);
 
-		Transform tfm = kernel_data.cam.worldtondc;
+		ProjectionTransform tfm = kernel_data.cam.worldtondc;
 		return transform_perspective(&tfm, P);
 	}
 	else {
 		/* panorama */
 		Transform tfm = kernel_data.cam.worldtocamera;
 
-		if(ccl_fetch(sd, object) != OBJECT_NONE)
+		if(sd->object != OBJECT_NONE)
 			P = normalize(transform_point(&tfm, P));
 		else
 			P = normalize(transform_direction(&tfm, P));
 
-		float2 uv = direction_to_panorama(kg, P);
+		float2 uv = direction_to_panorama(&kernel_data.cam, P);
 
 		return make_float3(uv.x, uv.y, 0.0f);
 	}
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 9d1f3bdc918..d26b668cb11 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -35,15 +35,23 @@
 #  define __NODES_FEATURES__ NODE_FEATURE_ALL
 #endif
 
-#include "util_debug.h"
-#include "util_math.h"
-#include "util_simd.h"
-#include "util_half.h"
-#include "util_types.h"
-#include "util_texture.h"
+#include "util/util_math.h"
+#include "util/util_simd.h"
+#include "util/util_half.h"
+#include "util/util_types.h"
+#include "util/util_texture.h"
 
 #define ccl_addr_space
 
+#define ccl_local_id(d) 0
+#define ccl_global_id(d) (kg->global_id[d])
+
+#define ccl_local_size(d) 1
+#define ccl_global_size(d) (kg->global_size[d])
+
+#define ccl_group_id(d) ccl_global_id(d)
+#define ccl_num_groups(d) ccl_global_size(d)
+
 /* On x86_64, versions of glibc < 2.16 have an issue where expf is
  * much slower than the double version.  This was fixed in glibc 2.16.
  */
@@ -65,7 +73,7 @@ CCL_NAMESPACE_BEGIN
  * pointer lookup. */
 
 template<typename T> struct texture  {
-	ccl_always_inline T fetch(int index)
+	ccl_always_inline const T& fetch(int index)
 	{
 		kernel_assert(index >= 0 && index < width);
 		return data[index];
@@ -78,9 +86,9 @@ template<typename T> struct texture  {
 	ccl_always_inline avxf fetch_avxf(const int index)
 	{
 		kernel_assert(index >= 0 && (index+1) < width);
-		ssef *ssefData = (ssef*)data;
-		ssef *ssefNodeData = &ssefData[index];
-		return _mm256_loadu_ps((float *)ssefNodeData);
+		ssef *ssef_data = (ssef*)data;
+		ssef *ssef_node_data = &ssef_data[index];
+		return _mm256_loadu_ps((float *)ssef_node_data);
 	}
 
 #endif
@@ -103,420 +111,6 @@ template<typename T> struct texture  {
 	int width;
 };
 
-template<typename T> struct texture_image  {
-#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
-	{ \
-		u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
-		u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
-		u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
-		u[3] = (1.0f / 6.0f) * t * t * t; \
-	} (void)0
-
-	ccl_always_inline float4 read(float4 r)
-	{
-		return r;
-	}
-
-	ccl_always_inline float4 read(uchar4 r)
-	{
-		float f = 1.0f/255.0f;
-		return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
-	}
-
-	ccl_always_inline float4 read(uchar r)
-	{
-		float f = r*(1.0f/255.0f);
-		return make_float4(f, f, f, 1.0f);
-	}
-
-	ccl_always_inline float4 read(float r)
-	{
-		/* TODO(dingto): Optimize this, so interpolation
-		 * happens on float instead of float4 */
-		return make_float4(r, r, r, 1.0f);
-	}
-
-	ccl_always_inline float4 read(half4 r)
-	{
-		return half4_to_float4(r);
-	}
-
-	ccl_always_inline float4 read(half r)
-	{
-		float f = half_to_float(r);
-		return make_float4(f, f, f, 1.0f);
-	}
-
-	ccl_always_inline int wrap_periodic(int x, int width)
-	{
-		x %= width;
-		if(x < 0)
-			x += width;
-		return x;
-	}
-
-	ccl_always_inline int wrap_clamp(int x, int width)
-	{
-		return clamp(x, 0, width-1);
-	}
-
-	ccl_always_inline float frac(float x, int *ix)
-	{
-		int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
-		*ix = i;
-		return x - (float)i;
-	}
-
-	ccl_always_inline float4 interp(float x, float y)
-	{
-		if(UNLIKELY(!data))
-			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
-		int ix, iy, nix, niy;
-
-		if(interpolation == INTERPOLATION_CLOSEST) {
-			frac(x*(float)width, &ix);
-			frac(y*(float)height, &iy);
-			switch(extension) {
-				case EXTENSION_REPEAT:
-					ix = wrap_periodic(ix, width);
-					iy = wrap_periodic(iy, height);
-					break;
-				case EXTENSION_CLIP:
-					if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
-						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-					}
-					/* Fall through. */
-				case EXTENSION_EXTEND:
-					ix = wrap_clamp(ix, width);
-					iy = wrap_clamp(iy, height);
-					break;
-				default:
-					kernel_assert(0);
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			}
-			return read(data[ix + iy*width]);
-		}
-		else if(interpolation == INTERPOLATION_LINEAR) {
-			float tx = frac(x*(float)width - 0.5f, &ix);
-			float ty = frac(y*(float)height - 0.5f, &iy);
-
-			switch(extension) {
-				case EXTENSION_REPEAT:
-					ix = wrap_periodic(ix, width);
-					iy = wrap_periodic(iy, height);
-
-					nix = wrap_periodic(ix+1, width);
-					niy = wrap_periodic(iy+1, height);
-					break;
-				case EXTENSION_CLIP:
-					if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
-						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-					}
-					/* Fall through. */
-				case EXTENSION_EXTEND:
-					nix = wrap_clamp(ix+1, width);
-					niy = wrap_clamp(iy+1, height);
-
-					ix = wrap_clamp(ix, width);
-					iy = wrap_clamp(iy, height);
-					break;
-				default:
-					kernel_assert(0);
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			}
-
-			float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]);
-			r += (1.0f - ty)*tx*read(data[nix + iy*width]);
-			r += ty*(1.0f - tx)*read(data[ix + niy*width]);
-			r += ty*tx*read(data[nix + niy*width]);
-
-			return r;
-		}
-		else {
-			/* Bicubic b-spline interpolation. */
-			float tx = frac(x*(float)width - 0.5f, &ix);
-			float ty = frac(y*(float)height - 0.5f, &iy);
-			int pix, piy, nnix, nniy;
-			switch(extension) {
-				case EXTENSION_REPEAT:
-					ix = wrap_periodic(ix, width);
-					iy = wrap_periodic(iy, height);
-
-					pix = wrap_periodic(ix-1, width);
-					piy = wrap_periodic(iy-1, height);
-
-					nix = wrap_periodic(ix+1, width);
-					niy = wrap_periodic(iy+1, height);
-
-					nnix = wrap_periodic(ix+2, width);
-					nniy = wrap_periodic(iy+2, height);
-					break;
-				case EXTENSION_CLIP:
-					if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
-						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-					}
-					/* Fall through. */
-				case EXTENSION_EXTEND:
-					pix = wrap_clamp(ix-1, width);
-					piy = wrap_clamp(iy-1, height);
-
-					nix = wrap_clamp(ix+1, width);
-					niy = wrap_clamp(iy+1, height);
-
-					nnix = wrap_clamp(ix+2, width);
-					nniy = wrap_clamp(iy+2, height);
-
-					ix = wrap_clamp(ix, width);
-					iy = wrap_clamp(iy, height);
-					break;
-				default:
-					kernel_assert(0);
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			}
-
-			const int xc[4] = {pix, ix, nix, nnix};
-			const int yc[4] = {width * piy,
-			                   width * iy,
-			                   width * niy,
-			                   width * nniy};
-			float u[4], v[4];
-			/* Some helper macro to keep code reasonable size,
-			 * let compiler to inline all the matrix multiplications.
-			 */
-#define DATA(x, y) (read(data[xc[x] + yc[y]]))
-#define TERM(col) \
-			(v[col] * (u[0] * DATA(0, col) + \
-			           u[1] * DATA(1, col) + \
-			           u[2] * DATA(2, col) + \
-			           u[3] * DATA(3, col)))
-
-			SET_CUBIC_SPLINE_WEIGHTS(u, tx);
-			SET_CUBIC_SPLINE_WEIGHTS(v, ty);
-
-			/* Actual interpolation. */
-			return TERM(0) + TERM(1) + TERM(2) + TERM(3);
-
-#undef TERM
-#undef DATA
-		}
-	}
-
-	ccl_always_inline float4 interp_3d(float x, float y, float z)
-	{
-		return interp_3d_ex(x, y, z, interpolation);
-	}
-
-	ccl_always_inline float4 interp_3d_ex(float x, float y, float z,
-	                                      int interpolation = INTERPOLATION_LINEAR)
-	{
-		if(UNLIKELY(!data))
-			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
-		int ix, iy, iz, nix, niy, niz;
-
-		if(interpolation == INTERPOLATION_CLOSEST) {
-			frac(x*(float)width, &ix);
-			frac(y*(float)height, &iy);
-			frac(z*(float)depth, &iz);
-
-			switch(extension) {
-				case EXTENSION_REPEAT:
-					ix = wrap_periodic(ix, width);
-					iy = wrap_periodic(iy, height);
-					iz = wrap_periodic(iz, depth);
-					break;
-				case EXTENSION_CLIP:
-					if(x < 0.0f || y < 0.0f || z < 0.0f ||
-					   x > 1.0f || y > 1.0f || z > 1.0f)
-					{
-						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-					}
-					/* Fall through. */
-				case EXTENSION_EXTEND:
-					ix = wrap_clamp(ix, width);
-					iy = wrap_clamp(iy, height);
-					iz = wrap_clamp(iz, depth);
-					break;
-				default:
-					kernel_assert(0);
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			}
-
-			return read(data[ix + iy*width + iz*width*height]);
-		}
-		else if(interpolation == INTERPOLATION_LINEAR) {
-			float tx = frac(x*(float)width - 0.5f, &ix);
-			float ty = frac(y*(float)height - 0.5f, &iy);
-			float tz = frac(z*(float)depth - 0.5f, &iz);
-
-			switch(extension) {
-				case EXTENSION_REPEAT:
-					ix = wrap_periodic(ix, width);
-					iy = wrap_periodic(iy, height);
-					iz = wrap_periodic(iz, depth);
-
-					nix = wrap_periodic(ix+1, width);
-					niy = wrap_periodic(iy+1, height);
-					niz = wrap_periodic(iz+1, depth);
-					break;
-				case EXTENSION_CLIP:
-					if(x < 0.0f || y < 0.0f || z < 0.0f ||
-					   x > 1.0f || y > 1.0f || z > 1.0f)
-					{
-						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-					}
-					/* Fall through. */
-				case EXTENSION_EXTEND:
-					nix = wrap_clamp(ix+1, width);
-					niy = wrap_clamp(iy+1, height);
-					niz = wrap_clamp(iz+1, depth);
-
-					ix = wrap_clamp(ix, width);
-					iy = wrap_clamp(iy, height);
-					iz = wrap_clamp(iz, depth);
-					break;
-				default:
-					kernel_assert(0);
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			}
-
-			float4 r;
-
-			r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]);
-			r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]);
-			r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]);
-			r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]);
-
-			r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]);
-			r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]);
-			r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]);
-			r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]);
-
-			return r;
-		}
-		else {
-			/* Tricubic b-spline interpolation. */
-			const float tx = frac(x*(float)width - 0.5f, &ix);
-			const float ty = frac(y*(float)height - 0.5f, &iy);
-			const float tz = frac(z*(float)depth - 0.5f, &iz);
-			int pix, piy, piz, nnix, nniy, nniz;
-
-			switch(extension) {
-				case EXTENSION_REPEAT:
-					ix = wrap_periodic(ix, width);
-					iy = wrap_periodic(iy, height);
-					iz = wrap_periodic(iz, depth);
-
-					pix = wrap_periodic(ix-1, width);
-					piy = wrap_periodic(iy-1, height);
-					piz = wrap_periodic(iz-1, depth);
-
-					nix = wrap_periodic(ix+1, width);
-					niy = wrap_periodic(iy+1, height);
-					niz = wrap_periodic(iz+1, depth);
-
-					nnix = wrap_periodic(ix+2, width);
-					nniy = wrap_periodic(iy+2, height);
-					nniz = wrap_periodic(iz+2, depth);
-					break;
-				case EXTENSION_CLIP:
-					if(x < 0.0f || y < 0.0f || z < 0.0f ||
-					   x > 1.0f || y > 1.0f || z > 1.0f)
-					{
-						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-					}
-					/* Fall through. */
-				case EXTENSION_EXTEND:
-					pix = wrap_clamp(ix-1, width);
-					piy = wrap_clamp(iy-1, height);
-					piz = wrap_clamp(iz-1, depth);
-
-					nix = wrap_clamp(ix+1, width);
-					niy = wrap_clamp(iy+1, height);
-					niz = wrap_clamp(iz+1, depth);
-
-					nnix = wrap_clamp(ix+2, width);
-					nniy = wrap_clamp(iy+2, height);
-					nniz = wrap_clamp(iz+2, depth);
-
-					ix = wrap_clamp(ix, width);
-					iy = wrap_clamp(iy, height);
-					iz = wrap_clamp(iz, depth);
-					break;
-				default:
-					kernel_assert(0);
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			}
-
-			const int xc[4] = {pix, ix, nix, nnix};
-			const int yc[4] = {width * piy,
-			                   width * iy,
-			                   width * niy,
-			                   width * nniy};
-			const int zc[4] = {width * height * piz,
-			                   width * height * iz,
-			                   width * height * niz,
-			                   width * height * nniz};
-			float u[4], v[4], w[4];
-
-			/* Some helper macro to keep code reasonable size,
-			 * let compiler to inline all the matrix multiplications.
-			 */
-#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
-#define COL_TERM(col, row) \
-			(v[col] * (u[0] * DATA(0, col, row) + \
-			           u[1] * DATA(1, col, row) + \
-			           u[2] * DATA(2, col, row) + \
-			           u[3] * DATA(3, col, row)))
-#define ROW_TERM(row) \
-			(w[row] * (COL_TERM(0, row) + \
-			           COL_TERM(1, row) + \
-			           COL_TERM(2, row) + \
-			           COL_TERM(3, row)))
-
-			SET_CUBIC_SPLINE_WEIGHTS(u, tx);
-			SET_CUBIC_SPLINE_WEIGHTS(v, ty);
-			SET_CUBIC_SPLINE_WEIGHTS(w, tz);
-
-			/* Actual interpolation. */
-			return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
-
-#undef COL_TERM
-#undef ROW_TERM
-#undef DATA
-		}
-	}
-
-	ccl_always_inline void dimensions_set(int width_, int height_, int depth_)
-	{
-		width = width_;
-		height = height_;
-		depth = depth_;
-	}
-
-	T *data;
-	int interpolation;
-	ExtensionType extension;
-	int width, height, depth;
-#undef SET_CUBIC_SPLINE_WEIGHTS
-};
-
-typedef texture<float4> texture_float4;
-typedef texture<float2> texture_float2;
-typedef texture<float> texture_float;
-typedef texture<uint> texture_uint;
-typedef texture<int> texture_int;
-typedef texture<uint4> texture_uint4;
-typedef texture<uchar4> texture_uchar4;
-typedef texture<uchar> texture_uchar;
-typedef texture_image<float> texture_image_float;
-typedef texture_image<uchar> texture_image_uchar;
-typedef texture_image<half> texture_image_half;
-typedef texture_image<float4> texture_image_float4;
-typedef texture_image<uchar4> texture_image_uchar4;
-typedef texture_image<half4> texture_image_half4;
-
 /* Macros to handle different memory storage on different devices */
 
 #define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
@@ -524,10 +118,7 @@ typedef texture_image<half4> texture_image_half4;
 #define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
 #define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
 #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
-
-#define kernel_tex_image_interp(tex,x,y) kernel_tex_image_interp_impl(kg,tex,x,y)
-#define kernel_tex_image_interp_3d(tex, x, y, z) kernel_tex_image_interp_3d_impl(kg,tex,x,y,z)
-#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) kernel_tex_image_interp_3d_ex_impl(kg,tex, x, y, z, interpolation)
+#define kernel_tex_array(tex) (kg->tex.data)
 
 #define kernel_data (kg->__data)
 
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index e0c7b17c6a0..ac63bcf7ac9 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -30,75 +30,114 @@
 #  define __NODES_FEATURES__ NODE_FEATURE_ALL
 #endif
 
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <float.h>
+/* Manual definitions so we can compile without CUDA toolkit. */
+
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+typedef unsigned short half;
+typedef unsigned long long CUtexObject;
+
+#define FLT_MIN 1.175494350822287507969e-38f
+#define FLT_MAX 340282346638528859811704183484516925440.0f
+
+__device__ half __float2half(const float f)
+{
+       half val;
+       asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+       return val;
+}
 
 /* Qualifier wrappers for different names on different devices */
 
 #define ccl_device  __device__ __inline__
-#  define ccl_device_forceinline  __device__ __forceinline__
-#if (__KERNEL_CUDA_VERSION__ == 80) && (__CUDA_ARCH__ < 500)
+#if __CUDA_ARCH__ < 500
 #  define ccl_device_inline  __device__ __forceinline__
+#  define ccl_device_forceinline  __device__ __forceinline__
 #else
 #  define ccl_device_inline  __device__ __inline__
+#  define ccl_device_forceinline  __device__ __forceinline__
 #endif
 #define ccl_device_noinline  __device__ __noinline__
 #define ccl_global
-#define ccl_constant
+#define ccl_static_constant __constant__
+#define ccl_constant const
+#define ccl_local __shared__
+#define ccl_local_param
+#define ccl_private
 #define ccl_may_alias
 #define ccl_addr_space
 #define ccl_restrict __restrict__
+/* TODO(sergey): In theory we might use references with CUDA, however
+ * the performance impact is yet to be investigated.
+ */
+#define ccl_ref
 #define ccl_align(n) __align__(n)
 
+#define ATTR_FALLTHROUGH
+
+#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH)
+
+
 /* No assert supported for CUDA */
 
 #define kernel_assert(cond)
 
 /* Types */
 
-#include "util_half.h"
-#include "util_types.h"
+#include "util/util_half.h"
+#include "util/util_types.h"
+
+/* Work item functions */
+
+ccl_device_inline uint ccl_local_id(uint d)
+{
+	switch(d) {
+		case 0: return threadIdx.x;
+		case 1: return threadIdx.y;
+		case 2: return threadIdx.z;
+		default: return 0;
+	}
+}
+
+#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d))
+
+ccl_device_inline uint ccl_local_size(uint d)
+{
+	switch(d) {
+		case 0: return blockDim.x;
+		case 1: return blockDim.y;
+		case 2: return blockDim.z;
+		default: return 0;
+	}
+}
+
+#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d))
+
+ccl_device_inline uint ccl_group_id(uint d)
+{
+	switch(d) {
+		case 0: return blockIdx.x;
+		case 1: return blockIdx.y;
+		case 2: return blockIdx.z;
+		default: return 0;
+	}
+}
+
+ccl_device_inline uint ccl_num_groups(uint d)
+{
+	switch(d) {
+		case 0: return gridDim.x;
+		case 1: return gridDim.y;
+		case 2: return gridDim.z;
+		default: return 0;
+	}
+}
 
 /* Textures */
 
-typedef texture<float4, 1> texture_float4;
-typedef texture<float2, 1> texture_float2;
-typedef texture<float, 1> texture_float;
-typedef texture<uint, 1> texture_uint;
-typedef texture<int, 1> texture_int;
-typedef texture<uint4, 1> texture_uint4;
-typedef texture<uchar, 1> texture_uchar;
-typedef texture<uchar4, 1> texture_uchar4;
-typedef texture<float4, 2> texture_image_float4;
-typedef texture<float4, 3> texture_image3d_float4;
-typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
-
-/* Macros to handle different memory storage on different devices */
-
-/* On Fermi cards (4xx and 5xx), we use regular textures for both data and images.
- * On Kepler (6xx) and above, we use Bindless Textures for images and arrays for data.
- *
- * Arrays are necessary in order to use the full VRAM on newer cards, and it's slightly faster.
- * Using Arrays on Fermi turned out to be slower.*/
-
-/* Fermi */
-#if __CUDA_ARCH__ < 300
-#  define __KERNEL_CUDA_TEX_STORAGE__
-#  define kernel_tex_fetch(t, index) tex1Dfetch(t, index)
-
-#  define kernel_tex_image_interp(t, x, y) tex2D(t, x, y)
-#  define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z)
-
-/* Kepler */
-#else
-#  define kernel_tex_fetch(t, index) t[(index)]
-
-#  define kernel_tex_image_interp_float4(t, x, y) tex2D<float4>(t, x, y)
-#  define kernel_tex_image_interp_float(t, x, y) tex2D<float>(t, x, y)
-#  define kernel_tex_image_interp_3d_float4(t, x, y, z) tex3D<float4>(t, x, y, z)
-#  define kernel_tex_image_interp_3d_float(t, x, y, z) tex3D<float>(t, x, y, z)
-#endif
+/* Use arrays for regular data. */
+#define kernel_tex_fetch(t, index) t[(index)]
+#define kernel_tex_array(t) (t)
 
 #define kernel_data __data
 
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index f076e3a7d37..671c47e2225 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -36,11 +36,14 @@
 #define ccl_device_forceinline ccl_device
 #define ccl_device_noinline ccl_device ccl_noinline
 #define ccl_may_alias
+#define ccl_static_constant static __constant
 #define ccl_constant __constant
 #define ccl_global __global
 #define ccl_local __local
+#define ccl_local_param __local
 #define ccl_private __private
 #define ccl_restrict restrict
+#define ccl_ref
 #define ccl_align(n) __attribute__((aligned(n)))
 
 #ifdef __SPLIT_KERNEL__
@@ -49,6 +52,17 @@
 #  define ccl_addr_space
 #endif
 
+#define ATTR_FALLTHROUGH
+
+#define ccl_local_id(d) get_local_id(d)
+#define ccl_global_id(d) get_global_id(d)
+
+#define ccl_local_size(d) get_local_size(d)
+#define ccl_global_size(d) get_global_size(d)
+
+#define ccl_group_id(d) get_group_id(d)
+#define ccl_num_groups(d) get_num_groups(d)
+
 /* Selective nodes compilation. */
 #ifndef __NODES_MAX_GROUP__
 #  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
@@ -117,6 +131,7 @@
 #  define expf(x) native_exp(((float)(x)))
 #  define sqrtf(x) native_sqrt(((float)(x)))
 #  define logf(x) native_log(((float)(x)))
+#  define rcp(x)  native_recip(x)
 #else
 #  define sinf(x) sin(((float)(x)))
 #  define cosf(x) cos(((float)(x)))
@@ -124,17 +139,19 @@
 #  define expf(x) exp(((float)(x)))
 #  define sqrtf(x) sqrt(((float)(x)))
 #  define logf(x) log(((float)(x)))
+#  define rcp(x)  (1.0f / (x)) /* No non-native recip() builtin in OpenCL C. */
 #endif
 
 /* data lookup defines */
 #define kernel_data (*kg->data)
-#define kernel_tex_fetch(t, index) kg->t[index]
+#define kernel_tex_array(tex) ((const ccl_global tex##_t*)(kg->buffers[kg->tex.cl_buffer] + kg->tex.data))
+#define kernel_tex_fetch(tex, index) kernel_tex_array(tex)[(index)]
 
 /* define NULL */
 #define NULL 0
 
-#include "util_half.h"
-#include "util_types.h"
+#include "util/util_half.h"
+#include "util/util_types.h"
 
 #endif /* __KERNEL_COMPAT_OPENCL_H__ */
 
diff --git a/intern/cycles/kernel/kernel_debug.h b/intern/cycles/kernel/kernel_debug.h
deleted file mode 100644
index 5647bbae5b5..00000000000
--- a/intern/cycles/kernel/kernel_debug.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright 2011-2014 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline void debug_data_init(DebugData *debug_data)
-{
-	debug_data->num_bvh_traversed_nodes = 0;
-	debug_data->num_bvh_traversed_instances = 0;
-	debug_data->num_bvh_intersections = 0;
-	debug_data->num_ray_bounces = 0;
-}
-
-ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
-                                                 ccl_global float *buffer,
-                                                 ccl_addr_space PathState *state,
-                                                 DebugData *debug_data,
-                                                 int sample)
-{
-	int flag = kernel_data.film.pass_flag;
-	if(flag & PASS_BVH_TRAVERSED_NODES) {
-		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_nodes,
-		                        sample,
-		                        debug_data->num_bvh_traversed_nodes);
-	}
-	if(flag & PASS_BVH_TRAVERSED_INSTANCES) {
-		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances,
-		                        sample,
-		                        debug_data->num_bvh_traversed_instances);
-	}
-	if(flag & PASS_BVH_INTERSECTIONS) {
-		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_intersections,
-		                        sample,
-		                        debug_data->num_bvh_intersections);
-	}
-	if(flag & PASS_RAY_BOUNCES) {
-		kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces,
-		                        sample,
-		                        debug_data->num_ray_bounces);
-	}
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index 8c7c651a053..a5556c3be8f 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -29,7 +29,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 	/* setup shading at emitter */
 	float3 eval;
 
-	int shader_flag = kernel_tex_fetch(__shader_flag, (ls->shader & SHADER_MASK)*SHADER_SIZE);
+	int shader_flag = kernel_tex_fetch(__shaders, (ls->shader & SHADER_MASK)).flags;
 
 #ifdef __BACKGROUND_MIS__
 	if(ls->type == LIGHT_BACKGROUND) {
@@ -37,25 +37,23 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 		ray.D = ls->D;
 		ray.P = ls->P;
 		ray.t = 1.0f;
-#  ifdef __OBJECT_MOTION__
 		ray.time = time;
-#  endif
 		ray.dP = differential3_zero();
 		ray.dD = dI;
 
 		shader_setup_from_background(kg, emission_sd, &ray);
 
 		path_state_modify_bounce(state, true);
-		eval = shader_eval_background(kg, emission_sd, state, 0, SHADER_CONTEXT_EMISSION);
+		eval = shader_eval_background(kg, emission_sd, state, 0);
 		path_state_modify_bounce(state, false);
 	}
 	else
 #endif
 	if(shader_flag & SD_HAS_CONSTANT_EMISSION)
 	{
-		eval.x = __int_as_float(kernel_tex_fetch(__shader_flag, (ls->shader & SHADER_MASK)*SHADER_SIZE + 2));
-		eval.y = __int_as_float(kernel_tex_fetch(__shader_flag, (ls->shader & SHADER_MASK)*SHADER_SIZE + 3));
-		eval.z = __int_as_float(kernel_tex_fetch(__shader_flag, (ls->shader & SHADER_MASK)*SHADER_SIZE + 4));
+		eval.x = kernel_tex_fetch(__shaders, (ls->shader & SHADER_MASK)).constant_emission[0];
+		eval.y = kernel_tex_fetch(__shaders, (ls->shader & SHADER_MASK)).constant_emission[1];
+		eval.z = kernel_tex_fetch(__shaders, (ls->shader & SHADER_MASK)).constant_emission[2];
 		if((ls->prim != PRIM_NONE) && dot(ls->Ng, I) < 0.0f) {
 			ls->Ng = -ls->Ng;
 		}
@@ -67,19 +65,16 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 		                         ls->shader, ls->object, ls->prim,
 		                         ls->u, ls->v, t, time, false, ls->lamp);
 
-		ls->Ng = ccl_fetch(emission_sd, Ng);
+		ls->Ng = emission_sd->Ng;
 
-		/* no path flag, we're evaluating this for all closures. that's weak but
-		 * we'd have to do multiple evaluations otherwise */
+		/* No proper path flag, we're evaluating this for all closures. That's
+		 * weak but we'd have to do multiple evaluations otherwise. */
 		path_state_modify_bounce(state, true);
-		shader_eval_surface(kg, emission_sd, NULL, state, 0.0f, 0, SHADER_CONTEXT_EMISSION);
+		shader_eval_surface(kg, emission_sd, state, PATH_RAY_EMISSION);
 		path_state_modify_bounce(state, false);
 
-		/* evaluate emissive closure */
-		if(ccl_fetch(emission_sd, flag) & SD_EMISSION)
-			eval = shader_emissive_eval(kg, emission_sd);
-		else
-			eval = make_float3(0.0f, 0.0f, 0.0f);
+		/* Evaluate emissive closure. */
+		eval = shader_emissive_eval(kg, emission_sd);
 	}
 	
 	eval *= ls->eval_fac;
@@ -112,7 +107,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 	                                         -ls->D,
 	                                         dD,
 	                                         ls->t,
-	                                         ccl_fetch(sd, time));
+	                                         sd->time);
 
 	if(is_zero(light_eval))
 		return false;
@@ -120,7 +115,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 	/* evaluate BSDF at shading point */
 
 #ifdef __VOLUME__
-	if(ccl_fetch(sd, prim) != PRIM_NONE)
+	if(sd->prim != PRIM_NONE)
 		shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS);
 	else {
 		float bsdf_pdf;
@@ -156,8 +151,13 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 	if(bsdf_eval_is_zero(eval))
 		return false;
 
-	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
-		float probability = max3(bsdf_eval_sum(eval)) * kernel_data.integrator.light_inv_rr_threshold;
+	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f
+#ifdef __SHADOW_TRICKS__
+	   && (state->flag & PATH_RAY_SHADOW_CATCHER) == 0
+#endif
+	  )
+	{
+		float probability = max3(fabs(bsdf_eval_sum(eval))) * kernel_data.integrator.light_inv_rr_threshold;
 		if(probability < 1.0f) {
 			if(rand_terminate >= probability) {
 				return false;
@@ -168,8 +168,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 
 	if(ls->shader & SHADER_CAST_SHADOW) {
 		/* setup ray */
-		bool transmit = (dot(ccl_fetch(sd, Ng), ls->D) < 0.0f);
-		ray->P = ray_offset(ccl_fetch(sd, P), (transmit)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
+		bool transmit = (dot(sd->Ng, ls->D) < 0.0f);
+		ray->P = ray_offset(sd->P, (transmit)? -sd->Ng: sd->Ng);
 
 		if(ls->t == FLT_MAX) {
 			/* distant light */
@@ -182,7 +182,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 			ray->D = normalize_len(ray->D, &ray->t);
 		}
 
-		ray->dP = ccl_fetch(sd, dP);
+		ray->dP = sd->dP;
 		ray->dD = differential3_zero();
 	}
 	else {
@@ -204,14 +204,14 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
 	float3 L = shader_emissive_eval(kg, sd);
 
 #ifdef __HAIR__
-	if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS) && (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE))
+	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE))
 #else
-	if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS))
+	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
 #endif
 	{
 		/* multiple importance sampling, get triangle light pdf,
 		 * and compute weight with respect to BSDF pdf */
-		float pdf = triangle_light_pdf(kg, ccl_fetch(sd, Ng), ccl_fetch(sd, I), t);
+		float pdf = triangle_light_pdf(kg, sd, t);
 		float mis_weight = power_heuristic(bsdf_pdf, pdf);
 
 		return L*mis_weight;
@@ -314,7 +314,7 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg,
 #  endif
 
 	path_state_modify_bounce(state, true);
-	float3 L = shader_eval_background(kg, emission_sd, state, state->flag, SHADER_CONTEXT_EMISSION);
+	float3 L = shader_eval_background(kg, emission_sd, state, state->flag);
 	path_state_modify_bounce(state, false);
 
 #ifdef __BACKGROUND_MIS__
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index 2b52a2d2f48..74cfacb5bc1 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -16,6 +16,17 @@
 
 /* Constant Globals */
 
+#ifndef __KERNEL_GLOBALS_H__
+#define __KERNEL_GLOBALS_H__
+
+#ifdef __KERNEL_CPU__
+#  include "util/util_vector.h"
+#endif
+
+#ifdef __KERNEL_OPENCL__
+#  include "util/util_atomic.h"
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
@@ -35,16 +46,8 @@ struct Intersection;
 struct VolumeStep;
 
 typedef struct KernelGlobals {
-	texture_image_uchar4 texture_byte4_images[TEX_NUM_BYTE4_CPU];
-	texture_image_float4 texture_float4_images[TEX_NUM_FLOAT4_CPU];
-	texture_image_half4 texture_half4_images[TEX_NUM_HALF4_CPU];
-	texture_image_float texture_float_images[TEX_NUM_FLOAT_CPU];
-	texture_image_uchar texture_byte_images[TEX_NUM_BYTE_CPU];
-	texture_image_half texture_half_images[TEX_NUM_HALF_CPU];
-
-#  define KERNEL_TEX(type, ttype, name) ttype name;
-#  define KERNEL_IMAGE_TEX(type, ttype, name)
-#  include "kernel_textures.h"
+#  define KERNEL_TEX(type, name) texture<type> name;
+#  include "kernel/kernel_textures.h"
 
 	KernelData __data;
 
@@ -64,6 +67,13 @@ typedef struct KernelGlobals {
 	/* Storage for decoupled volume steps. */
 	VolumeStep *decoupled_volume_steps[2];
 	int decoupled_volume_steps_index;
+
+	/* split kernel */
+	SplitData split_data;
+	SplitParams split_param_data;
+
+	int2 global_size;
+	int2 global_id;
 } KernelGlobals;
 
 #endif  /* __KERNEL_CPU__ */
@@ -81,13 +91,8 @@ typedef struct KernelGlobals {
 	Intersection hits_stack[64];
 } KernelGlobals;
 
-#  ifdef __KERNEL_CUDA_TEX_STORAGE__
-#    define KERNEL_TEX(type, ttype, name) ttype name;
-#  else
-#    define KERNEL_TEX(type, ttype, name) const __constant__ __device__ type *name;
-#  endif
-#  define KERNEL_IMAGE_TEX(type, ttype, name) ttype name;
-#  include "kernel_textures.h"
+#  define KERNEL_TEX(type, name) const __constant__ __device__ type *name;
+#  include "kernel/kernel_textures.h"
 
 #endif  /* __KERNEL_CUDA__ */
 
@@ -95,19 +100,75 @@ typedef struct KernelGlobals {
 
 #ifdef __KERNEL_OPENCL__
 
+#  define KERNEL_TEX(type, name) \
+typedef type name##_t;
+#  include "kernel/kernel_textures.h"
+
 typedef ccl_addr_space struct KernelGlobals {
 	ccl_constant KernelData *data;
+	ccl_global char *buffers[8];
 
-#  define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name;
-#  include "kernel_textures.h"
+#  define KERNEL_TEX(type, name) \
+	TextureInfo name;
+#  include "kernel/kernel_textures.h"
 
 #  ifdef __SPLIT_KERNEL__
-	ShaderData *sd_input;
-	Intersection *isect_shadow;
+	SplitData split_data;
+	SplitParams split_param_data;
 #  endif
 } KernelGlobals;
 
+#define KERNEL_BUFFER_PARAMS \
+	ccl_global char *buffer0, \
+	ccl_global char *buffer1, \
+	ccl_global char *buffer2, \
+	ccl_global char *buffer3, \
+	ccl_global char *buffer4, \
+	ccl_global char *buffer5, \
+	ccl_global char *buffer6, \
+	ccl_global char *buffer7
+
+#define KERNEL_BUFFER_ARGS buffer0, buffer1, buffer2, buffer3, buffer4, buffer5, buffer6, buffer7
+
+ccl_device_inline void kernel_set_buffer_pointers(KernelGlobals *kg, KERNEL_BUFFER_PARAMS)
+{
+#  ifdef __SPLIT_KERNEL__
+	if(ccl_local_id(0) + ccl_local_id(1) == 0)  /* Only local id (0, 0) stores the pointers. */
+#  endif
+	{
+		kg->buffers[0] = buffer0;
+		kg->buffers[1] = buffer1;
+		kg->buffers[2] = buffer2;
+		kg->buffers[3] = buffer3;
+		kg->buffers[4] = buffer4;
+		kg->buffers[5] = buffer5;
+		kg->buffers[6] = buffer6;
+		kg->buffers[7] = buffer7;
+	}
+
+#  ifdef __SPLIT_KERNEL__
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);  /* Local-memory barrier after the guarded stores. */
+#  endif
+}
+
+ccl_device_inline void kernel_set_buffer_info(KernelGlobals *kg)
+{
+#  ifdef __SPLIT_KERNEL__
+	if(ccl_local_id(0) + ccl_local_id(1) == 0)
+#  endif
+	{
+		ccl_global TextureInfo *info = (ccl_global TextureInfo*)kg->buffers[0];
+
+#  define KERNEL_TEX(type, name) \
+		kg->name = *(info++);
+#  include "kernel/kernel_textures.h"
+	}
+
+#  ifdef __SPLIT_KERNEL__
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+#  endif
+}
+
 #endif  /* __KERNEL_OPENCL__ */
 
 /* Interpolated lookup table access */
@@ -146,3 +207,4 @@ ccl_device float lookup_table_read_2D(KernelGlobals *kg, float x, float y, int o
 
 CCL_NAMESPACE_END
 
+#endif  /* __KERNEL_GLOBALS_H__ */
diff --git a/intern/cycles/kernel/kernel_image_opencl.h b/intern/cycles/kernel/kernel_image_opencl.h
deleted file mode 100644
index 0352c58037d..00000000000
--- a/intern/cycles/kernel/kernel_image_opencl.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * Copyright 2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-/* For OpenCL all images are packed in a single array, and we do manual lookup
- * and interpolation. */
-
-ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset)
-{
-	/* Float4 */
-	if(id < TEX_START_BYTE4_OPENCL) {
-		return kernel_tex_fetch(__tex_image_float4_packed, offset);
-	}
-	/* Byte4 */
-	else if(id < TEX_START_FLOAT_OPENCL) {
-		uchar4 r = kernel_tex_fetch(__tex_image_byte4_packed, offset);
-		float f = 1.0f/255.0f;
-		return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
-	}
-	/* Float */
-	else if(id < TEX_START_BYTE_OPENCL) {
-		float f = kernel_tex_fetch(__tex_image_float_packed, offset);
-		return make_float4(f, f, f, 1.0f);
-	}
-	/* Byte */
-	else {
-		uchar r = kernel_tex_fetch(__tex_image_byte_packed, offset);
-		float f = r * (1.0f/255.0f);
-		return make_float4(f, f, f, 1.0f);
-	}
-}
-
-ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
-{
-	x %= width;
-	if(x < 0)
-		x += width;
-	return x;
-}
-
-ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
-{
-	return clamp(x, 0, width-1);
-}
-
-ccl_device_inline float svm_image_texture_frac(float x, int *ix)
-{
-	int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
-	*ix = i;
-	return x - (float)i;
-}
-
-ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
-{
-	uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2);
-	uint width = info.x;
-	uint height = info.y;
-	uint offset = info.z;
-
-	/* Image Options */
-	uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
-	uint extension;
-	if(info.w & (1 << 1))
-		extension = EXTENSION_REPEAT;
-	else if(info.w & (1 << 2))
-		extension = EXTENSION_EXTEND;
-	else
-		extension = EXTENSION_CLIP;
-
-	float4 r;
-	int ix, iy, nix, niy;
-	if(interpolation == INTERPOLATION_CLOSEST) {
-		svm_image_texture_frac(x*width, &ix);
-		svm_image_texture_frac(y*height, &iy);
-
-		if(extension == EXTENSION_REPEAT) {
-			ix = svm_image_texture_wrap_periodic(ix, width);
-			iy = svm_image_texture_wrap_periodic(iy, height);
-		}
-		else {
-			if(extension == EXTENSION_CLIP) {
-				if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-				}
-			}
-			/* Fall through. */
-			/* EXTENSION_EXTEND */
-			ix = svm_image_texture_wrap_clamp(ix, width);
-			iy = svm_image_texture_wrap_clamp(iy, height);
-		}
-
-		r = svm_image_texture_read(kg, id, offset + ix + iy*width);
-	}
-	else { /* INTERPOLATION_LINEAR */
-		float tx = svm_image_texture_frac(x*width - 0.5f, &ix);
-		float ty = svm_image_texture_frac(y*height - 0.5f, &iy);
-
-		if(extension == EXTENSION_REPEAT) {
-			ix = svm_image_texture_wrap_periodic(ix, width);
-			iy = svm_image_texture_wrap_periodic(iy, height);
-
-			nix = svm_image_texture_wrap_periodic(ix+1, width);
-			niy = svm_image_texture_wrap_periodic(iy+1, height);
-		}
-		else {
-			if(extension == EXTENSION_CLIP) {
-				if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-				}
-			}
-			nix = svm_image_texture_wrap_clamp(ix+1, width);
-			niy = svm_image_texture_wrap_clamp(iy+1, height);
-			ix = svm_image_texture_wrap_clamp(ix, width);
-			iy = svm_image_texture_wrap_clamp(iy, height);
-		}
-
-		r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width);
-		r += (1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width);
-		r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width);
-		r += ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width);
-	}
-
-	return r;
-}
-
-
-ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z)
-{
-	uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2);
-	uint width = info.x;
-	uint height = info.y;
-	uint offset = info.z;
-	uint depth = kernel_tex_fetch(__tex_image_packed_info, id*2+1).x;
-
-	/* Image Options */
-	uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
-	uint extension;
-	if(info.w & (1 << 1))
-		extension = EXTENSION_REPEAT;
-	else if(info.w & (1 << 2))
-		extension = EXTENSION_EXTEND;
-	else
-		extension = EXTENSION_CLIP;
-
-	float4 r;
-	int ix, iy, iz, nix, niy, niz;
-	if(interpolation == INTERPOLATION_CLOSEST) {
-		svm_image_texture_frac(x*width, &ix);
-		svm_image_texture_frac(y*height, &iy);
-		svm_image_texture_frac(z*depth, &iz);
-
-		if(extension == EXTENSION_REPEAT) {
-			ix = svm_image_texture_wrap_periodic(ix, width);
-			iy = svm_image_texture_wrap_periodic(iy, height);
-			iz = svm_image_texture_wrap_periodic(iz, depth);
-		}
-		else {
-			if(extension == EXTENSION_CLIP) {
-				if(x < 0.0f || y < 0.0f || z < 0.0f ||
-				   x > 1.0f || y > 1.0f || z > 1.0f)
-				 {
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-				}
-			}
-			/* Fall through. */
-			/* EXTENSION_EXTEND */
-			ix = svm_image_texture_wrap_clamp(ix, width);
-			iy = svm_image_texture_wrap_clamp(iy, height);
-			iz = svm_image_texture_wrap_clamp(iz, depth);
-		}
-		r = svm_image_texture_read(kg, id, offset + ix + iy*width + iz*width*height);
-	}
-	else { /* INTERPOLATION_LINEAR */
-		float tx = svm_image_texture_frac(x*(float)width - 0.5f, &ix);
-		float ty = svm_image_texture_frac(y*(float)height - 0.5f, &iy);
-		float tz = svm_image_texture_frac(z*(float)depth - 0.5f, &iz);
-
-		if(extension == EXTENSION_REPEAT) {
-			ix = svm_image_texture_wrap_periodic(ix, width);
-			iy = svm_image_texture_wrap_periodic(iy, height);
-			iz = svm_image_texture_wrap_periodic(iz, depth);
-
-			nix = svm_image_texture_wrap_periodic(ix+1, width);
-			niy = svm_image_texture_wrap_periodic(iy+1, height);
-			niz = svm_image_texture_wrap_periodic(iz+1, depth);
-		}
-		else {
-			if(extension == EXTENSION_CLIP)
-				if(x < 0.0f || y < 0.0f || z < 0.0f ||
-				   x > 1.0f || y > 1.0f || z > 1.0f)
-				{
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-				}
-			/* Fall through. */
-			/*  EXTENSION_EXTEND */
-			nix = svm_image_texture_wrap_clamp(ix+1, width);
-			niy = svm_image_texture_wrap_clamp(iy+1, height);
-			niz = svm_image_texture_wrap_clamp(iz+1, depth);
-
-			ix = svm_image_texture_wrap_clamp(ix, width);
-			iy = svm_image_texture_wrap_clamp(iy, height);
-			iz = svm_image_texture_wrap_clamp(iz, depth);
-		}
-
-		r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width + iz*width*height);
-		r += (1.0f - tz)*(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width + iz*width*height);
-		r += (1.0f - tz)*ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width + iz*width*height);
-		r += (1.0f - tz)*ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width + iz*width*height);
-
-		r += tz*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width + niz*width*height);
-		r += tz*(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width + niz*width*height);
-		r += tz*ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width + niz*width*height);
-		r += tz*ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width + niz*width*height);
-
-	}
-
-	return r;
-}
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index 67546131746..f5855757d3f 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -175,15 +175,26 @@ ccl_device float cmj_sample_1D(int s, int N, int p)
 	return (x + jx)*invN;
 }
 
-ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
+/* TODO(sergey): Do some extra tests and consider moving to util_math.h. */
+ccl_device_inline int cmj_isqrt(int value)
 {
-	kernel_assert(s < N);
-
 #if defined(__KERNEL_CUDA__)
-	int m = float_to_int(__fsqrt_ru(N));
+	return float_to_int(__fsqrt_ru(value));
+#elif defined(__KERNEL_GPU__)
+	return float_to_int(sqrtf(value));
 #else
-	int m = float_to_int(sqrtf(N));
+	/* This is a workaround for fast-math on CPU which might replace sqrtf()
+	 * with an approximated version.
+	 */
+	return float_to_int(sqrtf(value) + 1e-6f);
 #endif
+}
+
+ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
+{
+	kernel_assert(s < N);
+
+	int m = cmj_isqrt(N);
 	int n = (N - 1)/m + 1;
 	float invN = 1.0f/N;
 	float invm = 1.0f/m;
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index a2909cec1a1..efab69ee37d 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -102,7 +102,7 @@ ccl_device_inline float area_light_sample(float3 P,
 		float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
 		cu = clamp(cu, -1.0f, 1.0f);
 		/* Compute xu. */
-		float xu = -(cu * z0) / sqrtf(1.0f - cu * cu);
+		float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f);
 		xu = clamp(xu, x0, x1);
 		/* Compute yv. */
 		float z0sq = z0 * z0;
@@ -255,11 +255,11 @@ ccl_device_inline bool background_portal_data_fetch_and_check_side(KernelGlobals
                                                                    float3 *lightpos,
                                                                    float3 *dir)
 {
-	float4 data0 = kernel_tex_fetch(__light_data, (index + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 0);
-	float4 data3 = kernel_tex_fetch(__light_data, (index + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 3);
+	int portal = kernel_data.integrator.portal_offset + index;
+	const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
 
-	*lightpos = make_float3(data0.y, data0.z, data0.w);
-	*dir = make_float3(data3.y, data3.z, data3.w);
+	*lightpos = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+	*dir = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
 
 	/* Check whether portal is on the right side. */
 	if(dot(*dir, P - *lightpos) > 1e-4f)
@@ -291,11 +291,10 @@ ccl_device_inline float background_portal_pdf(KernelGlobals *kg,
 		}
 		num_possible++;
 
-		float4 data1 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 1);
-		float4 data2 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 2);
-
-		float3 axisu = make_float3(data1.y, data1.z, data1.w);
-		float3 axisv = make_float3(data2.y, data2.z, data2.w);
+		int portal = kernel_data.integrator.portal_offset + p;
+		const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
+		float3 axisu = make_float3(klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
+		float3 axisv = make_float3(klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
 
 		if(!ray_quad_intersect(P, direction, 1e-4f, FLT_MAX, lightpos, axisu, axisv, dir, NULL, NULL, NULL, NULL))
 			continue;
@@ -346,10 +345,10 @@ ccl_device float3 background_portal_sample(KernelGlobals *kg,
 
 		if(portal == 0) {
 			/* p is the portal to be sampled. */
-			float4 data1 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 1);
-			float4 data2 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 2);
-			float3 axisu = make_float3(data1.y, data1.z, data1.w);
-			float3 axisv = make_float3(data2.y, data2.z, data2.w);
+			int portal = kernel_data.integrator.portal_offset + p;
+			const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
+			float3 axisu = make_float3(klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
+			float3 axisv = make_float3(klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
 
 			*pdf = area_light_sample(P, &lightpos,
 			                         axisu, axisv,
@@ -396,11 +395,13 @@ ccl_device_inline float3 background_light_sample(KernelGlobals *kg,
 					     + (1.0f - portal_sampling_pdf) * cdf_pdf);
 				}
 				return D;
-			} else {
+			}
+			else {
 				/* Sample map, but with nonzero portal_sampling_pdf for MIS. */
 				randu = (randu - portal_sampling_pdf) / (1.0f - portal_sampling_pdf);
 			}
-		} else {
+		}
+		else {
 			/* We can't sample a portal.
 			 * Check if we can sample the map instead.
 			 */
@@ -477,14 +478,10 @@ ccl_device float3 sphere_light_sample(float3 P, float3 center, float radius, flo
 	return disk_light_sample(normalize(P - center), randu, randv)*radius;
 }
 
-ccl_device float spot_light_attenuation(float4 data1, float4 data2, LightSample *ls)
+ccl_device float spot_light_attenuation(float3 dir, float spot_angle, float spot_smooth, LightSample *ls)
 {
-	float3 dir = make_float3(data2.y, data2.z, data2.w);
 	float3 I = ls->Ng;
 
-	float spot_angle = data1.w;
-	float spot_smooth = data2.x;
-
 	float attenuation = dot(dir, I);
 
 	if(attenuation <= spot_angle) {
@@ -516,12 +513,10 @@ ccl_device_inline bool lamp_light_sample(KernelGlobals *kg,
                                          float3 P,
                                          LightSample *ls)
 {
-	float4 data0 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 0);
-	float4 data1 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 1);
-
-	LightType type = (LightType)__float_as_int(data0.x);
+	const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+	LightType type = (LightType)klight->type;
 	ls->type = type;
-	ls->shader = __float_as_int(data1.x);
+	ls->shader = klight->shader_id;
 	ls->object = PRIM_NONE;
 	ls->prim = PRIM_NONE;
 	ls->lamp = lamp;
@@ -530,10 +525,10 @@ ccl_device_inline bool lamp_light_sample(KernelGlobals *kg,
 
 	if(type == LIGHT_DISTANT) {
 		/* distant light */
-		float3 lightD = make_float3(data0.y, data0.z, data0.w);
+		float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]);
 		float3 D = lightD;
-		float radius = data1.y;
-		float invarea = data1.w;
+		float radius = klight->distant.radius;
+		float invarea = klight->distant.invarea;
 
 		if(radius > 0.0f)
 			D = distant_light_sample(D, radius, randu, randv);
@@ -545,7 +540,7 @@ ccl_device_inline bool lamp_light_sample(KernelGlobals *kg,
 
 		float costheta = dot(lightD, D);
 		ls->pdf = invarea/(costheta*costheta*costheta);
-		ls->eval_fac = ls->pdf*kernel_data.integrator.inv_pdf_lights;
+		ls->eval_fac = ls->pdf;
 	}
 #ifdef __BACKGROUND_MIS__
 	else if(type == LIGHT_BACKGROUND) {
@@ -557,14 +552,13 @@ ccl_device_inline bool lamp_light_sample(KernelGlobals *kg,
 		ls->D = -D;
 		ls->t = FLT_MAX;
 		ls->eval_fac = 1.0f;
-		ls->pdf *= kernel_data.integrator.pdf_lights;
 	}
 #endif
 	else {
-		ls->P = make_float3(data0.y, data0.z, data0.w);
+		ls->P = make_float3(klight->co[0], klight->co[1], klight->co[2]);
 
 		if(type == LIGHT_POINT || type == LIGHT_SPOT) {
-			float radius = data1.y;
+			float radius = klight->spot.radius;
 
 			if(radius > 0.0f)
 				/* sphere light */
@@ -573,14 +567,19 @@ ccl_device_inline bool lamp_light_sample(KernelGlobals *kg,
 			ls->D = normalize_len(ls->P - P, &ls->t);
 			ls->Ng = -ls->D;
 
-			float invarea = data1.z;
+			float invarea = klight->spot.invarea;
 			ls->eval_fac = (0.25f*M_1_PI_F)*invarea;
 			ls->pdf = invarea;
 
 			if(type == LIGHT_SPOT) {
 				/* spot light attenuation */
-				float4 data2 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 2);
-				ls->eval_fac *= spot_light_attenuation(data1, data2, ls);
+				float3 dir = make_float3(klight->spot.dir[0],
+                                         klight->spot.dir[1],
+				                         klight->spot.dir[2]);
+				ls->eval_fac *= spot_light_attenuation(dir,
+				                                       klight->spot.spot_angle,
+				                                       klight->spot.spot_smooth,
+				                                       ls);
 				if(ls->eval_fac == 0.0f) {
 					return false;
 				}
@@ -593,12 +592,15 @@ ccl_device_inline bool lamp_light_sample(KernelGlobals *kg,
 		}
 		else {
 			/* area light */
-			float4 data2 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 2);
-			float4 data3 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 3);
-
-			float3 axisu = make_float3(data1.y, data1.z, data1.w);
-			float3 axisv = make_float3(data2.y, data2.z, data2.w);
-			float3 D = make_float3(data3.y, data3.z, data3.w);
+			float3 axisu = make_float3(klight->area.axisu[0],
+			                           klight->area.axisu[1],
+			                           klight->area.axisu[2]);
+			float3 axisv = make_float3(klight->area.axisv[0],
+			                           klight->area.axisv[1],
+			                           klight->area.axisv[2]);
+			float3 D = make_float3(klight->area.dir[0],
+			                       klight->area.dir[1],
+			                       klight->area.dir[2]);
 
 			if(dot(ls->P - P, D) > 0.0f) {
 				return false;
@@ -617,24 +619,22 @@ ccl_device_inline bool lamp_light_sample(KernelGlobals *kg,
 			ls->Ng = D;
 			ls->D = normalize_len(ls->P - P, &ls->t);
 
-			float invarea = data2.x;
+			float invarea = klight->area.invarea;
 			ls->eval_fac = 0.25f*invarea;
 		}
-
-		ls->eval_fac *= kernel_data.integrator.inv_pdf_lights;
 	}
 
+	ls->pdf *= kernel_data.integrator.pdf_lights;
+
 	return (ls->pdf > 0.0f);
 }
 
 ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, float t, LightSample *ls)
 {
-	float4 data0 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 0);
-	float4 data1 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 1);
-
-	LightType type = (LightType)__float_as_int(data0.x);
+	const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+	LightType type = (LightType)klight->type;
 	ls->type = type;
-	ls->shader = __float_as_int(data1.x);
+	ls->shader = klight->shader_id;
 	ls->object = PRIM_NONE;
 	ls->prim = PRIM_NONE;
 	ls->lamp = lamp;
@@ -647,7 +647,7 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 
 	if(type == LIGHT_DISTANT) {
 		/* distant light */
-		float radius = data1.y;
+		float radius = klight->distant.radius;
 
 		if(radius == 0.0f)
 			return false;
@@ -669,9 +669,9 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 		 *             P
 		 */
 
-		float3 lightD = make_float3(data0.y, data0.z, data0.w);
+		float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]);
 		float costheta = dot(-lightD, D);
-		float cosangle = data1.z;
+		float cosangle = klight->distant.cosangle;
 
 		if(costheta < cosangle)
 			return false;
@@ -682,13 +682,14 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 		ls->t = FLT_MAX;
 
 		/* compute pdf */
-		float invarea = data1.w;
+		float invarea = klight->distant.invarea;
 		ls->pdf = invarea/(costheta*costheta*costheta);
 		ls->eval_fac = ls->pdf;
 	}
 	else if(type == LIGHT_POINT || type == LIGHT_SPOT) {
-		float3 lightP = make_float3(data0.y, data0.z, data0.w);
-		float radius = data1.y;
+		float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+
+		float radius = klight->spot.radius;
 
 		/* sphere light */
 		if(radius == 0.0f)
@@ -703,14 +704,19 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 		ls->Ng = -D;
 		ls->D = D;
 
-		float invarea = data1.z;
+		float invarea = klight->spot.invarea;
 		ls->eval_fac = (0.25f*M_1_PI_F)*invarea;
 		ls->pdf = invarea;
 
 		if(type == LIGHT_SPOT) {
 			/* spot light attenuation */
-			float4 data2 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 2);
-			ls->eval_fac *= spot_light_attenuation(data1, data2, ls);
+			float3 dir = make_float3(klight->spot.dir[0],
+			                         klight->spot.dir[1],
+			                         klight->spot.dir[2]);
+			ls->eval_fac *= spot_light_attenuation(dir,
+			                                       klight->spot.spot_angle,
+			                                       klight->spot.spot_smooth,
+			                                       ls);
 
 			if(ls->eval_fac == 0.0f)
 				return false;
@@ -725,22 +731,25 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 	}
 	else if(type == LIGHT_AREA) {
 		/* area light */
-		float4 data2 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 2);
-		float4 data3 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 3);
-
-		float invarea = data2.x;
+		float invarea = klight->area.invarea;
 		if(invarea == 0.0f)
 			return false;
 
-		float3 axisu = make_float3(data1.y, data1.z, data1.w);
-		float3 axisv = make_float3(data2.y, data2.z, data2.w);
-		float3 Ng = make_float3(data3.y, data3.z, data3.w);
+		float3 axisu = make_float3(klight->area.axisu[0],
+		                           klight->area.axisu[1],
+		                           klight->area.axisu[2]);
+		float3 axisv = make_float3(klight->area.axisv[0],
+		                           klight->area.axisv[1],
+		                           klight->area.axisv[2]);
+		float3 Ng = make_float3(klight->area.dir[0],
+		                        klight->area.dir[1],
+		                        klight->area.dir[2]);
 
 		/* one sided */
 		if(dot(D, Ng) >= 0.0f)
 			return false;
 
-		float3 light_P = make_float3(data0.y, data0.z, data0.w);
+		float3 light_P = make_float3(klight->co[0], klight->co[1], klight->co[2]);
 
 		if(!ray_quad_intersect(P, D, 0.0f, t, light_P,
 		                       axisu, axisv, Ng,
@@ -755,86 +764,292 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 		ls->pdf = area_light_sample(P, &light_P, axisu, axisv, 0, 0, false);
 		ls->eval_fac = 0.25f*invarea;
 	}
-	else
+	else {
 		return false;
+	}
+
+	ls->pdf *= kernel_data.integrator.pdf_lights;
 
 	return true;
 }
 
 /* Triangle Light */
 
-ccl_device void object_transform_light_sample(KernelGlobals *kg, LightSample *ls, int object, float time)
+/* Returns true if the triangle has motion blur or an instancing transform applied. */
+ccl_device_inline bool triangle_world_space_vertices(KernelGlobals *kg, int object, int prim, float time, float3 V[3])
 {
+	bool has_motion = false;
+	const int object_flag = kernel_tex_fetch(__object_flag, object);
+
+	if(object_flag & SD_OBJECT_HAS_VERTEX_MOTION && time >= 0.0f) {
+		motion_triangle_vertices(kg, object, prim, time, V);
+		has_motion = true;
+	}
+	else {
+		triangle_vertices(kg, prim, V);
+	}
+
 #ifdef __INSTANCING__
-	/* instance transform */
-	if(!(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED)) {
+	if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 #  ifdef __OBJECT_MOTION__
-		Transform itfm;
-		Transform tfm = object_fetch_transform_motion_test(kg, object, time, &itfm);
+		float object_time = (time >= 0.0f) ? time : 0.5f;
+		Transform tfm = object_fetch_transform_motion_test(kg, object, object_time, NULL);
 #  else
 		Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
 #  endif
-
-		ls->P = transform_point(&tfm, ls->P);
-		ls->Ng = normalize(transform_direction(&tfm, ls->Ng));
+		V[0] = transform_point(&tfm, V[0]);
+		V[1] = transform_point(&tfm, V[1]);
+		V[2] = transform_point(&tfm, V[2]);
+		has_motion = true;
 	}
 #endif
+	return has_motion;
 }
 
-ccl_device void triangle_light_sample(KernelGlobals *kg, int prim, int object,
-	float randu, float randv, float time, LightSample *ls)
+ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg, const float3 Ng, const float3 I, float t)
 {
-	float u, v;
+	float pdf = kernel_data.integrator.pdf_triangles;
+	float cos_pi = fabsf(dot(Ng, I));
 
-	/* compute random point in triangle */
-	randu = sqrtf(randu);
+	if(cos_pi == 0.0f)
+		return 0.0f;
 
-	u = 1.0f - randu;
-	v = randv*randu;
+	return t*t*pdf/cos_pi;
+}
+
+ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *sd, float t)
+{
+	/* A naive heuristic to decide between costly solid angle sampling
+	 * and simple area sampling, comparing the distance to the triangle plane
+	 * to the length of the edges of the triangle. */
+
+	float3 V[3];
+	bool has_motion = triangle_world_space_vertices(kg, sd->object, sd->prim, sd->time, V);
+
+	const float3 e0 = V[1] - V[0];
+	const float3 e1 = V[2] - V[0];
+	const float3 e2 = V[2] - V[1];
+	const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2)));
+	const float3 N = cross(e0, e1);
+	const float distance_to_plane = fabsf(dot(N, sd->I * t))/dot(N, N);
+
+	if(longest_edge_squared > distance_to_plane*distance_to_plane) {
+		/* sd contains the point on the light source
+		 * calculate Px, the point that we're shading */
+		const float3 Px = sd->P + sd->I * t;
+		const float3 v0_p = V[0] - Px;
+		const float3 v1_p = V[1] - Px;
+		const float3 v2_p = V[2] - Px;
+
+		const float3 u01 = safe_normalize(cross(v0_p, v1_p));
+		const float3 u02 = safe_normalize(cross(v0_p, v2_p));
+		const float3 u12 = safe_normalize(cross(v1_p, v2_p));
+
+		const float alpha = fast_acosf(dot(u02, u01));
+		const float beta = fast_acosf(-dot(u01, u12));
+		const float gamma = fast_acosf(dot(u02, u12));
+		const float solid_angle =  alpha + beta + gamma - M_PI_F;
+
+		/* pdf_triangles is calculated over triangle area, but we're not sampling over its area */
+		if(UNLIKELY(solid_angle == 0.0f)) {
+			return 0.0f;
+		}
+		else {
+			float area = 1.0f;
+			if(has_motion) {
+				/* get the center frame vertices, this is what the PDF was calculated from */
+				triangle_world_space_vertices(kg, sd->object, sd->prim, -1.0f, V);
+				area = triangle_area(V[0], V[1], V[2]);
+			}
+			else {
+				area = 0.5f * len(N);
+			}
+			const float pdf = area * kernel_data.integrator.pdf_triangles;
+			return pdf / solid_angle;
+		}
+	}
+	else {
+		float pdf = triangle_light_pdf_area(kg, sd->Ng, sd->I, t);
+		if(has_motion) {
+			const float	area = 0.5f * len(N);
+			if(UNLIKELY(area == 0.0f)) {
+				return 0.0f;
+			}
+			/* scale the PDF.
+			 * area = the area the sample was taken from
+			 * area_pre = the area from which pdf_triangles was calculated */
+			triangle_world_space_vertices(kg, sd->object, sd->prim, -1.0f, V);
+			const float area_pre = triangle_area(V[0], V[1], V[2]);
+			pdf = pdf * area_pre / area;
+		}
+		return pdf;
+	}
+}
 
-	/* triangle, so get position, normal, shader */
-	triangle_point_normal(kg, object, prim, u, v, &ls->P, &ls->Ng, &ls->shader);
+ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg, int prim, int object,
+	float randu, float randv, float time, LightSample *ls, const float3 P)
+{
+	/* A naive heuristic to decide between costly solid angle sampling
+	 * and simple area sampling, comparing the distance to the triangle plane
+	 * to the length of the edges of the triangle. */
+
+	float3 V[3];
+	bool has_motion = triangle_world_space_vertices(kg, object, prim, time, V);
+
+	const float3 e0 = V[1] - V[0];
+	const float3 e1 = V[2] - V[0];
+	const float3 e2 = V[2] - V[1];
+	const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2)));
+	const float3 N0 = cross(e0, e1);
+	float Nl = 0.0f;
+	ls->Ng = safe_normalize_len(N0, &Nl);
+	float area = 0.5f * Nl;
+
+	/* flip normal if necessary */
+	const int object_flag = kernel_tex_fetch(__object_flag, object);
+	if(object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
+		ls->Ng = -ls->Ng;
+	}
+	ls->eval_fac = 1.0f;
+	ls->shader = kernel_tex_fetch(__tri_shader, prim);
 	ls->object = object;
 	ls->prim = prim;
 	ls->lamp = LAMP_NONE;
 	ls->shader |= SHADER_USE_MIS;
-	ls->t = 0.0f;
-	ls->u = u;
-	ls->v = v;
 	ls->type = LIGHT_TRIANGLE;
-	ls->eval_fac = 1.0f;
 
-	object_transform_light_sample(kg, ls, object, time);
-}
+	float distance_to_plane = fabsf(dot(N0, V[0] - P)/dot(N0, N0));
+
+	if(longest_edge_squared > distance_to_plane*distance_to_plane) {
+		/* see James Arvo, "Stratified Sampling of Spherical Triangles"
+		 * http://www.graphics.cornell.edu/pubs/1995/Arv95c.pdf */
+
+		/* project the triangle to the unit sphere
+		 * and calculate its edges and angles */
+		const float3 v0_p = V[0] - P;
+		const float3 v1_p = V[1] - P;
+		const float3 v2_p = V[2] - P;
+
+		const float3 u01 = safe_normalize(cross(v0_p, v1_p));
+		const float3 u02 = safe_normalize(cross(v0_p, v2_p));
+		const float3 u12 = safe_normalize(cross(v1_p, v2_p));
+
+		const float3 A = safe_normalize(v0_p);
+		const float3 B = safe_normalize(v1_p);
+		const float3 C = safe_normalize(v2_p);
+
+		const float cos_alpha = dot(u02, u01);
+		const float cos_beta = -dot(u01, u12);
+		const float cos_gamma = dot(u02, u12);
+
+		/* calculate dihedral angles */
+		const float alpha = fast_acosf(cos_alpha);
+		const float beta = fast_acosf(cos_beta);
+		const float gamma = fast_acosf(cos_gamma);
+		/* the area of the unit spherical triangle = solid angle */
+		const float solid_angle =  alpha + beta + gamma - M_PI_F;
+
+		/* precompute a few things
+		 * these could be re-used to take several samples
+		 * as they are independent of randu/randv */
+		const float cos_c = dot(A, B);
+		const float sin_alpha = fast_sinf(alpha);
+		const float product = sin_alpha * cos_c;
+
+		/* Select a random sub-area of the spherical triangle
+		 * and calculate the third vertex C_ of that new triangle */
+		const float phi = randu * solid_angle - alpha;
+		float s, t;
+		fast_sincosf(phi, &s, &t);
+		const float u = t - cos_alpha;
+		const float v = s + product;
+
+		const float3 U = safe_normalize(C - dot(C, A) * A);
+
+		float q = 1.0f;
+		const float det = ((v * s + u * t) * sin_alpha);
+		if(det != 0.0f) {
+			q = ((v * t - u * s) * cos_alpha - v) / det;
+		}
+		const float temp = max(1.0f - q*q, 0.0f);
 
-ccl_device float triangle_light_pdf(KernelGlobals *kg,
-	const float3 Ng, const float3 I, float t)
-{
-	float pdf = kernel_data.integrator.pdf_triangles;
-	float cos_pi = fabsf(dot(Ng, I));
+		const float3 C_ = safe_normalize(q * A + sqrtf(temp) * U);
 
-	if(cos_pi == 0.0f)
-		return 0.0f;
-	
-	return t*t*pdf/cos_pi;
+		/* Finally, select a random point along the edge of the new triangle
+		 * That point on the spherical triangle is the sampled ray direction */
+		const float z = 1.0f - randv * (1.0f - dot(C_, B));
+		ls->D = z * B + safe_sqrtf(1.0f - z*z) * safe_normalize(C_ - dot(C_, B) * B);
+
+		/* calculate intersection with the planar triangle */
+		if(!ray_triangle_intersect(P, ls->D, FLT_MAX,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+		                           (ssef*)V,
+#else
+		                           V[0], V[1], V[2],
+#endif
+		                           &ls->u, &ls->v, &ls->t)) {
+			ls->pdf = 0.0f;
+			return;
+		}
+
+		ls->P = P + ls->D * ls->t;
+
+		/* pdf_triangles is calculated over triangle area, but we're sampling over solid angle */
+		if(UNLIKELY(solid_angle == 0.0f)) {
+			ls->pdf = 0.0f;
+			return;
+		}
+		else {
+			if(has_motion) {
+				/* get the center frame vertices, this is what the PDF was calculated from */
+				triangle_world_space_vertices(kg, object, prim, -1.0f, V);
+				area = triangle_area(V[0], V[1], V[2]);
+			}
+			const float pdf = area * kernel_data.integrator.pdf_triangles;
+			ls->pdf = pdf / solid_angle;
+		}
+	}
+	else {
+		/* compute random point in triangle */
+		randu = sqrtf(randu);
+
+		const float u = 1.0f - randu;
+		const float v = randv*randu;
+		const float t = 1.0f - u - v;
+		ls->P = u * V[0] + v * V[1] + t * V[2];
+		/* compute incoming direction, distance and pdf */
+		ls->D = normalize_len(ls->P - P, &ls->t);
+		ls->pdf = triangle_light_pdf_area(kg, ls->Ng, -ls->D, ls->t);
+		if(has_motion && area != 0.0f) {
+			/* scale the PDF.
+			 * area = the area the sample was taken from
+			 * area_pre = the area from which pdf_triangles was calculated */
+			triangle_world_space_vertices(kg, object, prim, -1.0f, V);
+			const float area_pre = triangle_area(V[0], V[1], V[2]);
+			ls->pdf = ls->pdf * area_pre / area;
+		}
+		ls->u = u;
+		ls->v = v;
+	}
 }
 
 /* Light Distribution */
 
-ccl_device int light_distribution_sample(KernelGlobals *kg, float randt)
+ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu)
 {
-	/* this is basically std::upper_bound as used by pbrt, to find a point light or
+	/* This is basically std::upper_bound as used by pbrt, to find a point light or
 	 * triangle to emit from, proportional to area. a good improvement would be to
 	 * also sample proportional to power, though it's not so well defined with
-	 * OSL shaders. */
+	 * arbitrary shaders. */
 	int first = 0;
 	int len = kernel_data.integrator.num_distribution + 1;
+	float r = *randu;
 
 	while(len > 0) {
 		int half_len = len >> 1;
 		int middle = first + half_len;
 
-		if(randt < kernel_tex_fetch(__light_distribution, middle).x) {
+		if(r < kernel_tex_fetch(__light_distribution, middle).totarea) {
 			len = half_len;
 		}
 		else {
@@ -843,21 +1058,27 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float randt)
 		}
 	}
 
-	/* clamping should not be needed but float rounding errors seem to
-	 * make this fail on rare occasions */
-	return clamp(first-1, 0, kernel_data.integrator.num_distribution-1);
+	/* Clamping should not be needed but float rounding errors seem to
+	 * make this fail on rare occasions. */
+	int index = clamp(first-1, 0, kernel_data.integrator.num_distribution-1);
+
+	/* Rescale to reuse the random number. This helps the 2D samples within
+	 * each area light be stratified as well. */
+	float distr_min = kernel_tex_fetch(__light_distribution, index).totarea;
+	float distr_max = kernel_tex_fetch(__light_distribution, index+1).totarea;
+	*randu = (r - distr_min)/(distr_max - distr_min);
+
+	return index;
 }
 
 /* Generic Light */
 
 ccl_device bool light_select_reached_max_bounces(KernelGlobals *kg, int index, int bounce)
 {
-	float4 data4 = kernel_tex_fetch(__light_data, index*LIGHT_SIZE + 4);
-	return (bounce > __float_as_int(data4.x));
+	return (bounce > kernel_tex_fetch(__lights, index).max_bounces);
 }
 
 ccl_device_noinline bool light_sample(KernelGlobals *kg,
-                                      float randt,
                                       float randu,
                                       float randv,
                                       float time,
@@ -866,20 +1087,17 @@ ccl_device_noinline bool light_sample(KernelGlobals *kg,
                                       LightSample *ls)
 {
 	/* sample index */
-	int index = light_distribution_sample(kg, randt);
+	int index = light_distribution_sample(kg, &randu);
 
 	/* fetch light data */
-	float4 l = kernel_tex_fetch(__light_distribution, index);
-	int prim = __float_as_int(l.y);
+	const ccl_global KernelLightDistribution *kdistribution = &kernel_tex_fetch(__light_distribution, index);
+	int prim = kdistribution->prim;
 
 	if(prim >= 0) {
-		int object = __float_as_int(l.w);
-		int shader_flag = __float_as_int(l.z);
+		int object = kdistribution->mesh_light.object_id;
+		int shader_flag = kdistribution->mesh_light.shader_flag;
 
-		triangle_light_sample(kg, prim, object, randu, randv, time, ls);
-		/* compute incoming direction, distance and pdf */
-		ls->D = normalize_len(ls->P - P, &ls->t);
-		ls->pdf = triangle_light_pdf(kg, ls->Ng, -ls->D, ls->t);
+		triangle_light_sample(kg, prim, object, randu, randv, time, ls, P);
 		ls->shader |= shader_flag;
 		return (ls->pdf > 0.0f);
 	}
@@ -896,8 +1114,7 @@ ccl_device_noinline bool light_sample(KernelGlobals *kg,
 
 ccl_device int light_select_num_samples(KernelGlobals *kg, int index)
 {
-	float4 data3 = kernel_tex_fetch(__light_data, index*LIGHT_SIZE + 3);
-	return __float_as_int(data3.x);
+	return kernel_tex_fetch(__lights, index).samples;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h
index 9bee5603474..96391db7649 100644
--- a/intern/cycles/kernel/kernel_math.h
+++ b/intern/cycles/kernel/kernel_math.h
@@ -17,11 +17,12 @@
 #ifndef __KERNEL_MATH_H__
 #define __KERNEL_MATH_H__
 
-#include "util_color.h"
-#include "util_math.h"
-#include "util_math_fast.h"
-#include "util_texture.h"
-#include "util_transform.h"
+#include "util/util_color.h"
+#include "util/util_math.h"
+#include "util/util_math_fast.h"
+#include "util/util_math_intersect.h"
+#include "util/util_projection.h"
+#include "util/util_texture.h"
+#include "util/util_transform.h"
 
 #endif /* __KERNEL_MATH_H__ */
-
diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h
index af7b727c1ba..9995490505f 100644
--- a/intern/cycles/kernel/kernel_montecarlo.h
+++ b/intern/cycles/kernel/kernel_montecarlo.h
@@ -67,8 +67,8 @@ ccl_device_inline void sample_cos_hemisphere(const float3 N,
 
 /* sample direction uniformly distributed in hemisphere */
 ccl_device_inline void sample_uniform_hemisphere(const float3 N,
-                                               float randu, float randv,
-                                               float3 *omega_in, float *pdf)
+                                                 float randu, float randv,
+                                                 float3 *omega_in, float *pdf)
 {
 	float z = randu;
 	float r = sqrtf(max(0.0f, 1.0f - z*z));
@@ -84,8 +84,8 @@ ccl_device_inline void sample_uniform_hemisphere(const float3 N,
 
 /* sample direction uniformly distributed in cone */
 ccl_device_inline void sample_uniform_cone(const float3 N, float angle,
-                                         float randu, float randv,
-                                         float3 *omega_in, float *pdf)
+                                           float randu, float randv,
+                                           float3 *omega_in, float *pdf)
 {
 	float z = cosf(angle*randu);
 	float r = sqrtf(max(0.0f, 1.0f - z*z));
@@ -187,4 +187,3 @@ ccl_device float2 regular_polygon_sample(float corners, float rotation, float u,
 CCL_NAMESPACE_END
 
 #endif /* __KERNEL_MONTECARLO_CL__ */
-
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 7aec47e4957..a42a8e9812f 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -16,19 +16,23 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value)
+#if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__)
+#define __ATOMIC_PASS_WRITE__
+#endif
+
+ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value)
 {
 	ccl_global float *buf = buffer;
-#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+#ifdef __ATOMIC_PASS_WRITE__
 	atomic_add_and_fetch_float(buf, value);
 #else
-	*buf = (sample == 0)? value: *buf + value;
-#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
+	*buf += value;
+#endif
 }
 
-ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value)
+ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3 value)
 {
-#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+#ifdef __ATOMIC_PASS_WRITE__
 	ccl_global float *buf_x = buffer + 0;
 	ccl_global float *buf_y = buffer + 1;
 	ccl_global float *buf_z = buffer + 2;
@@ -38,13 +42,13 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sa
 	atomic_add_and_fetch_float(buf_z, value.z);
 #else
 	ccl_global float3 *buf = (ccl_global float3*)buffer;
-	*buf = (sample == 0)? value: *buf + value;
-#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
+	*buf += value;
+#endif
 }
 
-ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value)
+ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4 value)
 {
-#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+#ifdef __ATOMIC_PASS_WRITE__
 	ccl_global float *buf_x = buffer + 0;
 	ccl_global float *buf_y = buffer + 1;
 	ccl_global float *buf_z = buffer + 2;
@@ -56,12 +60,137 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa
 	atomic_add_and_fetch_float(buf_w, value.w);
 #else
 	ccl_global float4 *buf = (ccl_global float4*)buffer;
-	*buf = (sample == 0)? value: *buf + value;
-#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
+	*buf += value;
+#endif
+}
+
+#ifdef __DENOISING_FEATURES__
+ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, float value)
+{
+	kernel_write_pass_float(buffer, value);
+
+	/* The online one-pass variance update that's used for the megakernel can't easily be implemented
+	 * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */
+	kernel_write_pass_float(buffer+1, value*value);
+}
+
+#  ifdef __ATOMIC_PASS_WRITE__
+#    define kernel_write_pass_float3_unaligned kernel_write_pass_float3
+#  else
+ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, float3 value)
+{
+	buffer[0] += value.x;
+	buffer[1] += value.y;
+	buffer[2] += value.z;
+}
+#  endif
+
+ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, float3 value)
+{
+	kernel_write_pass_float3_unaligned(buffer, value);
+	kernel_write_pass_float3_unaligned(buffer+3, value*value);
+}
+
+ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer,
+	int sample, float path_total, float path_total_shaded)
+{
+	if(kernel_data.film.pass_denoising_data == 0)
+		return;
+
+	buffer += (sample & 1)? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A;
+
+	path_total = ensure_finite(path_total);
+	path_total_shaded = ensure_finite(path_total_shaded);
+
+	kernel_write_pass_float(buffer, path_total);
+	kernel_write_pass_float(buffer+1, path_total_shaded);
+
+	float value = path_total_shaded / max(path_total, 1e-7f);
+	kernel_write_pass_float(buffer+2, value*value);
+}
+#endif /* __DENOISING_FEATURES__ */
+
+ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
+                                                        ShaderData *sd,
+                                                        ccl_addr_space PathState *state,
+                                                        PathRadiance *L)
+{
+#ifdef __DENOISING_FEATURES__
+	if(state->denoising_feature_weight == 0.0f) {
+		return;
+	}
+
+	L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length);
+
+	/* Skip implicitly transparent surfaces. */
+	if(sd->flag & SD_HAS_ONLY_VOLUME) {
+		return;
+	}
+
+	float3 normal = make_float3(0.0f, 0.0f, 0.0f);
+	float3 albedo = make_float3(0.0f, 0.0f, 0.0f);
+	float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
+
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
+
+		if(!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
+			continue;
+
+		/* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
+		normal += sc->N * sc->sample_weight;
+		sum_weight += sc->sample_weight;
+		if(bsdf_get_specular_roughness_squared(sc) > sqr(0.075f)) {
+			albedo += sc->weight;
+			sum_nonspecular_weight += sc->sample_weight;
+		}
+	}
+
+	/* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. */
+	if((sum_weight == 0.0f) || (sum_nonspecular_weight*4.0f > sum_weight)) {
+		if(sum_weight != 0.0f) {
+			normal /= sum_weight;
+		}
+		L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal);
+		L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo);
+
+		state->denoising_feature_weight = 0.0f;
+	}
+#else
+	(void) kg;
+	(void) sd;
+	(void) state;
+	(void) L;
+#endif  /* __DENOISING_FEATURES__ */
+}
+
+#ifdef __KERNEL_DEBUG__
+ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
+                                                 ccl_global float *buffer,
+                                                 PathRadiance *L)
+{
+	int flag = kernel_data.film.pass_flag;
+	if(flag & PASSMASK(BVH_TRAVERSED_NODES)) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_nodes,
+		                        L->debug_data.num_bvh_traversed_nodes);
+	}
+	if(flag & PASSMASK(BVH_TRAVERSED_INSTANCES)) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances,
+		                        L->debug_data.num_bvh_traversed_instances);
+	}
+	if(flag & PASSMASK(BVH_INTERSECTIONS)) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_intersections,
+		                        L->debug_data.num_bvh_intersections);
+	}
+	if(flag & PASSMASK(RAY_BOUNCES)) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces,
+		                        L->debug_data.num_ray_bounces);
+	}
 }
+#endif /* __KERNEL_DEBUG__ */
 
 ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
-	ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
+	ShaderData *sd, ccl_addr_space PathState *state, float3 throughput)
 {
 #ifdef __PASSES__
 	int path_flag = state->flag;
@@ -70,64 +199,64 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 		return;
 
 	int flag = kernel_data.film.pass_flag;
+	int light_flag = kernel_data.film.light_pass_flag;
 
-	if(!(flag & PASS_ALL))
+	if(!((flag | light_flag) & PASS_ANY))
 		return;
 	
 	if(!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) {
-		if(!(ccl_fetch(sd, flag) & SD_TRANSPARENT) ||
+		if(!(sd->flag & SD_TRANSPARENT) ||
 		   kernel_data.film.pass_alpha_threshold == 0.0f ||
 		   average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold)
 		{
-
-			if(sample == 0) {
-				if(flag & PASS_DEPTH) {
-					float depth = camera_distance(kg, ccl_fetch(sd, P));
-					kernel_write_pass_float(buffer + kernel_data.film.pass_depth, sample, depth);
+			if(state->sample == 0) {
+				if(flag & PASSMASK(DEPTH)) {
+					float depth = camera_distance(kg, sd->P);
+					kernel_write_pass_float(buffer + kernel_data.film.pass_depth, depth);
 				}
-				if(flag & PASS_OBJECT_ID) {
-					float id = object_pass_id(kg, ccl_fetch(sd, object));
-					kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, sample, id);
+				if(flag & PASSMASK(OBJECT_ID)) {
+					float id = object_pass_id(kg, sd->object);
+					kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, id);
 				}
-				if(flag & PASS_MATERIAL_ID) {
+				if(flag & PASSMASK(MATERIAL_ID)) {
 					float id = shader_pass_id(kg, sd);
-					kernel_write_pass_float(buffer + kernel_data.film.pass_material_id, sample, id);
+					kernel_write_pass_float(buffer + kernel_data.film.pass_material_id, id);
 				}
 			}
 
-			if(flag & PASS_NORMAL) {
-				float3 normal = ccl_fetch(sd, N);
-				kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal);
+			if(flag & PASSMASK(NORMAL)) {
+				float3 normal = shader_bsdf_average_normal(kg, sd);
+				kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, normal);
 			}
-			if(flag & PASS_UV) {
+			if(flag & PASSMASK(UV)) {
 				float3 uv = primitive_uv(kg, sd);
-				kernel_write_pass_float3(buffer + kernel_data.film.pass_uv, sample, uv);
+				kernel_write_pass_float3(buffer + kernel_data.film.pass_uv, uv);
 			}
-			if(flag & PASS_MOTION) {
+			if(flag & PASSMASK(MOTION)) {
 				float4 speed = primitive_motion_vector(kg, sd);
-				kernel_write_pass_float4(buffer + kernel_data.film.pass_motion, sample, speed);
-				kernel_write_pass_float(buffer + kernel_data.film.pass_motion_weight, sample, 1.0f);
+				kernel_write_pass_float4(buffer + kernel_data.film.pass_motion, speed);
+				kernel_write_pass_float(buffer + kernel_data.film.pass_motion_weight, 1.0f);
 			}
 
 			state->flag |= PATH_RAY_SINGLE_PASS_DONE;
 		}
 	}
 
-	if(flag & (PASS_DIFFUSE_INDIRECT|PASS_DIFFUSE_COLOR|PASS_DIFFUSE_DIRECT))
+	if(light_flag & PASSMASK_COMPONENT(DIFFUSE))
 		L->color_diffuse += shader_bsdf_diffuse(kg, sd)*throughput;
-	if(flag & (PASS_GLOSSY_INDIRECT|PASS_GLOSSY_COLOR|PASS_GLOSSY_DIRECT))
+	if(light_flag & PASSMASK_COMPONENT(GLOSSY))
 		L->color_glossy += shader_bsdf_glossy(kg, sd)*throughput;
-	if(flag & (PASS_TRANSMISSION_INDIRECT|PASS_TRANSMISSION_COLOR|PASS_TRANSMISSION_DIRECT))
+	if(light_flag & PASSMASK_COMPONENT(TRANSMISSION))
 		L->color_transmission += shader_bsdf_transmission(kg, sd)*throughput;
-	if(flag & (PASS_SUBSURFACE_INDIRECT|PASS_SUBSURFACE_COLOR|PASS_SUBSURFACE_DIRECT))
+	if(light_flag & PASSMASK_COMPONENT(SUBSURFACE))
 		L->color_subsurface += shader_bsdf_subsurface(kg, sd)*throughput;
 
-	if(flag & PASS_MIST) {
+	if(light_flag & PASSMASK(MIST)) {
 		/* bring depth into 0..1 range */
 		float mist_start = kernel_data.film.mist_start;
 		float mist_inv_depth = kernel_data.film.mist_inv_depth;
 
-		float depth = camera_distance(kg, ccl_fetch(sd, P));
+		float depth = camera_distance(kg, sd->P);
 		float mist = saturate((depth - mist_start)*mist_inv_depth);
 
 		/* falloff */
@@ -149,53 +278,116 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 #endif
 }
 
-ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, int sample)
+ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L)
 {
 #ifdef __PASSES__
-	int flag = kernel_data.film.pass_flag;
+	int light_flag = kernel_data.film.light_pass_flag;
 
 	if(!kernel_data.film.use_light_pass)
 		return;
 	
-	if(flag & PASS_DIFFUSE_INDIRECT)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_indirect, sample, L->indirect_diffuse);
-	if(flag & PASS_GLOSSY_INDIRECT)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_indirect, sample, L->indirect_glossy);
-	if(flag & PASS_TRANSMISSION_INDIRECT)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_indirect, sample, L->indirect_transmission);
-	if(flag & PASS_SUBSURFACE_INDIRECT)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_indirect, sample, L->indirect_subsurface);
-	if(flag & PASS_DIFFUSE_DIRECT)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_direct, sample, L->direct_diffuse);
-	if(flag & PASS_GLOSSY_DIRECT)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_direct, sample, L->direct_glossy);
-	if(flag & PASS_TRANSMISSION_DIRECT)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_direct, sample, L->direct_transmission);
-	if(flag & PASS_SUBSURFACE_DIRECT)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_direct, sample, L->direct_subsurface);
-
-	if(flag & PASS_EMISSION)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_emission, sample, L->emission);
-	if(flag & PASS_BACKGROUND)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_background, sample, L->background);
-	if(flag & PASS_AO)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, sample, L->ao);
-
-	if(flag & PASS_DIFFUSE_COLOR)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color, sample, L->color_diffuse);
-	if(flag & PASS_GLOSSY_COLOR)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color, sample, L->color_glossy);
-	if(flag & PASS_TRANSMISSION_COLOR)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color, sample, L->color_transmission);
-	if(flag & PASS_SUBSURFACE_COLOR)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_color, sample, L->color_subsurface);
-	if(flag & PASS_SHADOW) {
+	if(light_flag & PASSMASK(DIFFUSE_INDIRECT))
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_indirect, L->indirect_diffuse);
+	if(light_flag & PASSMASK(GLOSSY_INDIRECT))
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_indirect, L->indirect_glossy);
+	if(light_flag & PASSMASK(TRANSMISSION_INDIRECT))
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_indirect, L->indirect_transmission);
+	if(light_flag & PASSMASK(SUBSURFACE_INDIRECT))
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_indirect, L->indirect_subsurface);
+	if(light_flag & PASSMASK(VOLUME_INDIRECT))
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_indirect, L->indirect_scatter);
+	if(light_flag & PASSMASK(DIFFUSE_DIRECT))
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_direct, L->direct_diffuse);
+	if(light_flag & PASSMASK(GLOSSY_DIRECT))
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_direct, L->direct_glossy);
+	if(light_flag & PASSMASK(TRANSMISSION_DIRECT))
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_direct, L->direct_transmission);
+	if(light_flag & PASSMASK(SUBSURFACE_DIRECT))
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_direct, L->direct_subsurface);
+	if(light_flag & PASSMASK(VOLUME_DIRECT))
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_direct, L->direct_scatter);
+
+	if(light_flag & PASSMASK(EMISSION))
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_emission, L->emission);
+	if(light_flag & PASSMASK(BACKGROUND))
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_background, L->background);
+	if(light_flag & PASSMASK(AO))
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, L->ao);
+
+	if(light_flag & PASSMASK(DIFFUSE_COLOR))
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color, L->color_diffuse);
+	if(light_flag & PASSMASK(GLOSSY_COLOR))
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color, L->color_glossy);
+	if(light_flag & PASSMASK(TRANSMISSION_COLOR))
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color, L->color_transmission);
+	if(light_flag & PASSMASK(SUBSURFACE_COLOR))
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_color, L->color_subsurface);
+	if(light_flag & PASSMASK(SHADOW)) {
 		float4 shadow = L->shadow;
 		shadow.w = kernel_data.film.pass_shadow_scale;
-		kernel_write_pass_float4(buffer + kernel_data.film.pass_shadow, sample, shadow);
+		kernel_write_pass_float4(buffer + kernel_data.film.pass_shadow, shadow);
 	}
-	if(flag & PASS_MIST)
-		kernel_write_pass_float(buffer + kernel_data.film.pass_mist, sample, 1.0f - L->mist);
+	if(light_flag & PASSMASK(MIST))
+		kernel_write_pass_float(buffer + kernel_data.film.pass_mist, 1.0f - L->mist);
+#endif
+}
+
+ccl_device_inline void kernel_write_result(KernelGlobals *kg,
+                                           ccl_global float *buffer,
+                                           int sample,
+                                           PathRadiance *L)
+{
+	float alpha;  /* Filled in through the out-parameter of path_radiance_clamp_and_sum() below. */
+	float3 L_sum = path_radiance_clamp_and_sum(kg, L, &alpha);
+
+	kernel_write_pass_float4(buffer, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));  /* Summed radiance + alpha go at the start of buffer. */
+
+	kernel_write_light_passes(kg, buffer, L);
+
+#ifdef __DENOISING_FEATURES__
+	if(kernel_data.film.pass_denoising_data) {  /* The offset doubles as the test for whether denoising data is written; sample is only used in this branch. */
+#  ifdef __SHADOW_TRICKS__
+		kernel_write_denoising_shadow(kg,
+		                              buffer + kernel_data.film.pass_denoising_data,
+		                              sample,
+		                              average(L->path_total),
+		                              average(L->path_total_shaded));
+#  else
+		kernel_write_denoising_shadow(kg,
+		                              buffer + kernel_data.film.pass_denoising_data,
+		                              sample,
+		                              0.0f, 0.0f);
+#  endif
+		if(kernel_data.film.pass_denoising_clean) {  /* Split radiance: noisy part to the denoising color slot, clean part to its own pass. */
+			float3 noisy, clean;
+			path_radiance_split_denoising(kg, L, &noisy, &clean);
+			kernel_write_pass_float3_variance(
+			        buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
+			        noisy);
+			kernel_write_pass_float3_unaligned(
+			        buffer + kernel_data.film.pass_denoising_clean,
+			        clean);
+		}
+		else {  /* No clean pass: the whole sum, passed through ensure_finite3(), is the denoising color. */
+			kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
+			                                    ensure_finite3(L_sum));
+		}
+
+		kernel_write_pass_float3_variance(
+		        buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
+		        L->denoising_normal);
+		kernel_write_pass_float3_variance(
+		        buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
+		        L->denoising_albedo);
+		kernel_write_pass_float_variance(
+		        buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
+		        L->denoising_depth);
+	}
+#endif  /* __DENOISING_FEATURES__ */
+
+
+#ifdef __KERNEL_DEBUG__
+	kernel_write_debug_passes(kg, buffer, L);
 #endif
 }
 
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index f90701a8260..b0f53aef2d5 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -15,57 +15,350 @@
  */
 
 #ifdef __OSL__
-#  include "osl_shader.h"
+#  include "kernel/osl/osl_shader.h"
 #endif
 
-#include "kernel_random.h"
-#include "kernel_projection.h"
-#include "kernel_montecarlo.h"
-#include "kernel_differential.h"
-#include "kernel_camera.h"
+#include "kernel/kernel_random.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_differential.h"
+#include "kernel/kernel_camera.h"
 
-#include "geom/geom.h"
-#include "bvh/bvh.h"
+#include "kernel/geom/geom.h"
+#include "kernel/bvh/bvh.h"
 
-#include "kernel_accumulate.h"
-#include "kernel_shader.h"
-#include "kernel_light.h"
-#include "kernel_passes.h"
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_shader.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_passes.h"
 
-#ifdef __SUBSURFACE__
-#  include "kernel_subsurface.h"
+#if defined(__VOLUME__) || defined(__SUBSURFACE__)
+#  include "kernel/kernel_volume.h"
 #endif
 
-#ifdef __VOLUME__
-#  include "kernel_volume.h"
+#ifdef __SUBSURFACE__
+#  include "kernel/kernel_subsurface.h"
 #endif
 
-#include "kernel_path_state.h"
-#include "kernel_shadow.h"
-#include "kernel_emission.h"
-#include "kernel_path_common.h"
-#include "kernel_path_surface.h"
-#include "kernel_path_volume.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_shadow.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_path_common.h"
+#include "kernel/kernel_path_surface.h"
+#include "kernel/kernel_path_volume.h"
+#include "kernel/kernel_path_subsurface.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_forceinline bool kernel_path_scene_intersect(
+	KernelGlobals *kg,
+	ccl_addr_space PathState *state,
+	Ray *ray,
+	Intersection *isect,
+	PathRadiance *L)
+{
+	uint visibility = path_state_ray_visibility(kg, state);  /* Visibility mask from the path state; overridden for AO bounces below. */
+
+	if(path_state_ao_bounce(kg, state)) {  /* AO bounce: trace with shadow visibility and limit the ray to the AO distance (modifies *ray). */
+		visibility = PATH_RAY_SHADOW;
+		ray->t = kernel_data.background.ao_distance;
+	}
+
+#ifdef __HAIR__
+	float difl = 0.0f, extmax = 0.0f;
+	uint lcg_state = 0;
+
+	if(kernel_data.bvh.have_curves) {  /* Curve parameters stay at their zero defaults unless the BVH has curves. */
+		if((kernel_data.cam.resolution == 1) && (state->flag & PATH_RAY_CAMERA)) {
+			float3 pixdiff = ray->dD.dx + ray->dD.dy;
+			/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
+			difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
+		}
+
+		extmax = kernel_data.curve.maximum_width;
+		lcg_state = lcg_state_init_addrspace(state, 0x51633e2d);
+	}
+
+	bool hit = scene_intersect(kg, *ray, visibility, isect, &lcg_state, difl, extmax);
+#else
+	bool hit = scene_intersect(kg, *ray, visibility, isect, NULL, 0.0f, 0.0f);
+#endif  /* __HAIR__ */
 
 #ifdef __KERNEL_DEBUG__
-#  include "kernel_debug.h"
-#endif
+	if(state->flag & PATH_RAY_CAMERA) {  /* BVH statistics are accumulated for camera rays only; the bounce counter below counts every call. */
+		L->debug_data.num_bvh_traversed_nodes += isect->num_traversed_nodes;
+		L->debug_data.num_bvh_traversed_instances += isect->num_traversed_instances;
+		L->debug_data.num_bvh_intersections += isect->num_intersections;
+	}
+	L->debug_data.num_ray_bounces++;
+#endif  /* __KERNEL_DEBUG__ */
 
-CCL_NAMESPACE_BEGIN
+	return hit;
+}
+
+ccl_device_forceinline void kernel_path_lamp_emission(
+	KernelGlobals *kg,
+	ccl_addr_space PathState *state,
+	Ray *ray,
+	float3 throughput,
+	ccl_addr_space Intersection *isect,
+	ShaderData *emission_sd,
+	PathRadiance *L)
+{
+#ifdef __LAMP_MIS__  /* Without __LAMP_MIS__ this function has an empty body. */
+	if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
+		/* Ray starting from the previous non-transparent bounce: step back along D by the accumulated state->ray_t, then extend by this segment's isect->t (updates state->ray_t). */
+		Ray light_ray;
+
+		light_ray.P = ray->P - state->ray_t*ray->D;
+		state->ray_t += isect->t;
+		light_ray.D = ray->D;
+		light_ray.t = state->ray_t;
+		light_ray.time = ray->time;
+		light_ray.dD = ray->dD;
+		light_ray.dP = ray->dP;
+
+		/* Intersect with lamp; on a hit, accumulate its emission into L. */
+		float3 emission;
+
+		if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission))
+			path_radiance_accum_emission(L, state, throughput, emission);
+	}
+#endif  /* __LAMP_MIS__ */
+}
+
+ccl_device_forceinline void kernel_path_background(
+	KernelGlobals *kg,
+	ccl_addr_space PathState *state,
+	ccl_addr_space Ray *ray,
+	float3 throughput,
+	ShaderData *sd,
+	PathRadiance *L)
+{
+	/* Eval background shader if nothing hit. For a transparent background only L->transparent is updated, unless the background light pass still needs the shader result. */
+	if(kernel_data.background.transparent && (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
+		L->transparent += average(throughput);
+
+#ifdef __PASSES__
+		if(!(kernel_data.film.light_pass_flag & PASSMASK(BACKGROUND)))
+#endif  /* __PASSES__ */
+			return;  /* Unconditional when __PASSES__ is not defined. */
+	}
+
+	/* When using the ao bounces approximation, adjust background
+	 * shader intensity with ao factor. */
+	if(path_state_ao_bounce(kg, state)) {
+		throughput *= kernel_data.background.ao_bounces_factor;
+	}
+
+#ifdef __BACKGROUND__
+	/* Sample background shader and accumulate the result into L. */
+	float3 L_background = indirect_background(kg, sd, state, ray);
+	path_radiance_accum_background(L, state, throughput, L_background);
+#endif  /* __BACKGROUND__ */
+}
+
+#ifndef __SPLIT_KERNEL__
+
+#ifdef __VOLUME__
+ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(
+	KernelGlobals *kg,
+	ShaderData *sd,
+	PathState *state,
+	Ray *ray,
+	float3 *throughput,
+	ccl_addr_space Intersection *isect,
+	bool hit,
+	ShaderData *emission_sd,
+	PathRadiance *L)
+{
+	/* Sanitize volume stack when the ray hit nothing. */
+	if(!hit) {
+		kernel_volume_clean_stack(kg, state->volume_stack);
+	}
+
+	if(state->volume_stack[0].shader == SHADER_NONE) {  /* First stack entry has no shader: nothing to integrate. */
+		return VOLUME_PATH_ATTENUATED;
+	}
+
+	/* Volume attenuation, emission, scatter: integrate up to the hit distance, or FLT_MAX on a miss. */
+	Ray volume_ray = *ray;
+	volume_ray.t = (hit)? isect->t: FLT_MAX;
+
+	bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+
+#  ifdef __VOLUME_DECOUPLED__
+	int sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
+	bool direct = (state->flag & PATH_RAY_CAMERA) != 0;  /* Camera rays count as "direct" here. */
+	bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, direct, sampling_method);
+
+	if(decoupled) {
+		/* cache steps along volume for repeated sampling */
+		VolumeSegment volume_segment;
+
+		shader_setup_from_volume(kg, sd, &volume_ray);
+		kernel_volume_decoupled_record(kg, state,
+			&volume_ray, sd, &volume_segment, heterogeneous);
+
+		volume_segment.sampling_method = sampling_method;
+
+		/* emission */
+		if(volume_segment.closure_flag & SD_EMISSION)
+			path_radiance_accum_emission(L, state, *throughput, volume_segment.accum_emission);
+
+		/* scattering */
+		VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
+
+		if(volume_segment.closure_flag & SD_SCATTER) {
+			int all = kernel_data.integrator.sample_all_lights_indirect;
+
+			/* direct light sampling */
+			kernel_branched_path_volume_connect_light(kg, sd,
+				emission_sd, *throughput, state, L, all,
+				&volume_ray, &volume_segment);
+
+			/* Indirect sample. If we used distance sampling and took just
+			 * one sample for direct and indirect light, we could share this
+			 * computation, but that would make the code a bit more complex. */
+			float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
+			float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
+
+			result = kernel_volume_decoupled_scatter(kg,
+				state, &volume_ray, sd, throughput,
+				rphase, rscatter, &volume_segment, NULL, true);
+		}
+
+		/* free cached steps */
+		kernel_volume_decoupled_free(kg, &volume_segment);
+
+		if(result == VOLUME_PATH_SCATTERED) {  /* SCATTERED if kernel_path_volume_bounce() succeeds, MISSED otherwise. */
+			if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
+				return VOLUME_PATH_SCATTERED;
+			else
+				return VOLUME_PATH_MISSED;
+		}
+		else {
+			*throughput *= volume_segment.accum_transmittance;
+		}
+	}
+	else
+#  endif  /* __VOLUME_DECOUPLED__ */
+	{
+		/* integrate along volume segment with distance sampling */
+		VolumeIntegrateResult result = kernel_volume_integrate(
+			kg, state, sd, &volume_ray, L, throughput, heterogeneous);
+
+#  ifdef __VOLUME_SCATTER__
+		if(result == VOLUME_PATH_SCATTERED) {
+			/* direct lighting */
+			kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
+
+			/* indirect light bounce */
+			if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
+				return VOLUME_PATH_SCATTERED;
+			else
+				return VOLUME_PATH_MISSED;
+		}
+#  endif  /* __VOLUME_SCATTER__ */
+	}
+
+	return VOLUME_PATH_ATTENUATED;
+}
+#endif  /* __VOLUME__ */
+
+#endif /* __SPLIT_KERNEL__ */
+
+ccl_device_forceinline bool kernel_path_shader_apply(
+	KernelGlobals *kg,
+	ShaderData *sd,
+	ccl_addr_space PathState *state,
+	ccl_addr_space Ray *ray,
+	float3 throughput,
+	ShaderData *emission_sd,
+	PathRadiance *L,
+	ccl_global float *buffer)
+{
+#ifdef __SHADOW_TRICKS__
+	if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
+		if(state->flag & PATH_RAY_TRANSPARENT_BACKGROUND) {
+			state->flag |= (PATH_RAY_SHADOW_CATCHER |
+						   PATH_RAY_STORE_SHADOW_INFO);
+
+			float3 bg = make_float3(0.0f, 0.0f, 0.0f);
+			if(!kernel_data.background.transparent) {
+				bg = indirect_background(kg, emission_sd, state, ray);
+			}
+			path_radiance_accum_shadowcatcher(L, throughput, bg);
+		}
+	}
+	else if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+		/* Only update transparency after shadow catcher bounce. */
+		L->shadow_transparency *=
+				average(shader_bsdf_transparency(kg, sd));
+	}
+#endif  /* __SHADOW_TRICKS__ */
+
+	/* Holdout: only handled while the path is still flagged PATH_RAY_TRANSPARENT_BACKGROUND. */
+#ifdef __HOLDOUT__
+	if(((sd->flag & SD_HOLDOUT) ||
+		(sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
+	   (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND))
+	{
+		if(kernel_data.background.transparent) {
+			float3 holdout_weight;
+			if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
+				holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
+			}
+			else {
+				holdout_weight = shader_holdout_eval(kg, sd);
+			}
+			/* any throughput is ok, should all be identical here */
+			L->transparent += average(holdout_weight*throughput);
+		}
+
+		if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
+			return false;  /* false = nothing further is applied for this hit. */
+		}
+	}
+#endif  /* __HOLDOUT__ */
+
+	/* Holdout mask objects that returned above do not write data passes. NOTE(review): kernel_path_indirect passes buffer == NULL; presumably kernel_write_data_passes returns before touching it - confirm. */
+	kernel_write_data_passes(kg, buffer, L, sd, state, throughput);
+
+	/* blurring of bsdf after bounces, for rays that have a small likelihood
+	 * of following this particular path (diffuse, rough glossy) */
+	if(kernel_data.integrator.filter_glossy != FLT_MAX) {
+		float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
+
+		if(blur_pdf < 1.0f) {
+			float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
+			shader_bsdf_blur(kg, sd, blur_roughness);
+		}
+	}
+
+#ifdef __EMISSION__
+	/* Emission from the surface itself, accumulated into L. */
+	if(sd->flag & SD_EMISSION) {
+		float3 emission = indirect_primitive_emission(kg, sd, sd->ray_length, state->flag, state->ray_pdf);
+		path_radiance_accum_emission(L, state, throughput, emission);
+	}
+#endif  /* __EMISSION__ */
+
+	return true;
+}
 
 ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
                                         ShaderData *sd,
                                         ShaderData *emission_sd,
                                         PathRadiance *L,
-                                        PathState *state,
-                                        RNG *rng,
+                                        ccl_addr_space PathState *state,
                                         float3 throughput,
                                         float3 ao_alpha)
 {
 	/* todo: solve correlation */
 	float bsdf_u, bsdf_v;
 
-	path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+	path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 
 	float ao_factor = kernel_data.background.ao_factor;
 	float3 ao_N;
@@ -75,278 +368,118 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
 
 	sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
 
-	if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+	if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
 		Ray light_ray;
 		float3 ao_shadow;
 
-		light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+		light_ray.P = ray_offset(sd->P, sd->Ng);
 		light_ray.D = ao_D;
 		light_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
-		light_ray.time = ccl_fetch(sd, time);
-#endif  /* __OBJECT_MOTION__ */
-		light_ray.dP = ccl_fetch(sd, dP);
+		light_ray.time = sd->time;
+		light_ray.dP = sd->dP;
 		light_ray.dD = differential3_zero();
 
-		if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
-			path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+		if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) {
+			path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
+		}
+		else {
+			path_radiance_accum_total_ao(L, state, throughput, ao_bsdf);
 		}
 	}
 }
 
+#ifndef __SPLIT_KERNEL__
+
+#if defined(__BRANCHED_PATH__) || defined(__BAKING__)
+
 ccl_device void kernel_path_indirect(KernelGlobals *kg,
                                      ShaderData *sd,
                                      ShaderData *emission_sd,
-                                     RNG *rng,
                                      Ray *ray,
                                      float3 throughput,
-                                     int num_samples,
                                      PathState *state,
                                      PathRadiance *L)
 {
+#ifdef __SUBSURFACE__
+	SubsurfaceIndirectRays ss_indirect;
+	kernel_path_subsurface_init_indirect(&ss_indirect);
+
+	for(;;) {
+#endif  /* __SUBSURFACE__ */
+
 	/* path iteration */
 	for(;;) {
-		/* intersect scene */
+		/* Find intersection with objects in scene. */
 		Intersection isect;
-		uint visibility = path_state_ray_visibility(kg, state);
-		if(state->bounce > kernel_data.integrator.ao_bounces) {
-			visibility = PATH_RAY_SHADOW;
-			ray->t = kernel_data.background.ao_distance;
-		}
-		bool hit = scene_intersect(kg,
-		                           *ray,
-		                           visibility,
-		                           &isect,
-		                           NULL,
-		                           0.0f, 0.0f);
+		bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L);
 
-#ifdef __LAMP_MIS__
-		if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
-			/* ray starting from previous non-transparent bounce */
-			Ray light_ray;
-
-			light_ray.P = ray->P - state->ray_t*ray->D;
-			state->ray_t += isect.t;
-			light_ray.D = ray->D;
-			light_ray.t = state->ray_t;
-			light_ray.time = ray->time;
-			light_ray.dD = ray->dD;
-			light_ray.dP = ray->dP;
-
-			/* intersect with lamp */
-			float3 emission;
-			if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission)) {
-				path_radiance_accum_emission(L,
-				                             throughput,
-				                             emission,
-				                             state->bounce);
-			}
-		}
-#endif  /* __LAMP_MIS__ */
+		/* Find intersection with lamps and compute emission for MIS. */
+		kernel_path_lamp_emission(kg, state, ray, throughput, &isect, sd, L);
 
 #ifdef __VOLUME__
-		/* Sanitize volume stack. */
-		if(!hit) {
-			kernel_volume_clean_stack(kg, state->volume_stack);
-		}
-		/* volume attenuation, emission, scatter */
-		if(state->volume_stack[0].shader != SHADER_NONE) {
-			Ray volume_ray = *ray;
-			volume_ray.t = (hit)? isect.t: FLT_MAX;
-
-			bool heterogeneous =
-			        volume_stack_is_heterogeneous(kg,
-			                                      state->volume_stack);
-
-#  ifdef __VOLUME_DECOUPLED__
-			int sampling_method =
-			        volume_stack_sampling_method(kg,
-			                                     state->volume_stack);
-			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false, sampling_method);
-
-			if(decoupled) {
-				/* cache steps along volume for repeated sampling */
-				VolumeSegment volume_segment;
-
-				shader_setup_from_volume(kg,
-				                         sd,
-				                         &volume_ray);
-				kernel_volume_decoupled_record(kg,
-				                               state,
-				                               &volume_ray,
-				                               sd,
-				                               &volume_segment,
-				                               heterogeneous);
-
-				volume_segment.sampling_method = sampling_method;
-
-				/* emission */
-				if(volume_segment.closure_flag & SD_EMISSION) {
-					path_radiance_accum_emission(L,
-					                             throughput,
-					                             volume_segment.accum_emission,
-					                             state->bounce);
-				}
-
-				/* scattering */
-				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
-
-				if(volume_segment.closure_flag & SD_SCATTER) {
-					int all = kernel_data.integrator.sample_all_lights_indirect;
-
-					/* direct light sampling */
-					kernel_branched_path_volume_connect_light(kg,
-					                                          rng,
-					                                          sd,
-					                                          emission_sd,
-					                                          throughput,
-					                                          state,
-					                                          L,
-					                                          all,
-					                                          &volume_ray,
-					                                          &volume_segment);
-
-					/* indirect sample. if we use distance sampling and take just
-					 * one sample for direct and indirect light, we could share
-					 * this computation, but makes code a bit complex */
-					float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
-					float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
-
-					result = kernel_volume_decoupled_scatter(kg,
-					                                         state,
-					                                         &volume_ray,
-					                                         sd,
-					                                         &throughput,
-					                                         rphase,
-					                                         rscatter,
-					                                         &volume_segment,
-					                                         NULL,
-					                                         true);
-				}
-
-				/* free cached steps */
-				kernel_volume_decoupled_free(kg, &volume_segment);
-
-				if(result == VOLUME_PATH_SCATTERED) {
-					if(kernel_path_volume_bounce(kg,
-					                             rng,
-					                             sd,
-					                             &throughput,
-					                             state,
-					                             L,
-					                             ray))
-					{
-						continue;
-					}
-					else {
-						break;
-					}
-				}
-				else {
-					throughput *= volume_segment.accum_transmittance;
-				}
-			}
-			else
-#  endif  /* __VOLUME_DECOUPLED__ */
-			{
-				/* integrate along volume segment with distance sampling */
-				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, state, sd, &volume_ray, L, &throughput, rng, heterogeneous);
-
-#  ifdef __VOLUME_SCATTER__
-				if(result == VOLUME_PATH_SCATTERED) {
-					/* direct lighting */
-					kernel_path_volume_connect_light(kg,
-					                                 rng,
-					                                 sd,
-					                                 emission_sd,
-					                                 throughput,
-					                                 state,
-					                                 L);
-
-					/* indirect light bounce */
-					if(kernel_path_volume_bounce(kg,
-					                             rng,
-					                             sd,
-					                             &throughput,
-					                             state,
-					                             L,
-					                             ray))
-					{
-						continue;
-					}
-					else {
-						break;
-					}
-				}
-#  endif  /* __VOLUME_SCATTER__ */
-			}
+		/* Volume integration. */
+		VolumeIntegrateResult result = kernel_path_volume(kg,
+		                                                   sd,
+		                                                   state,
+		                                                   ray,
+		                                                   &throughput,
+		                                                   &isect,
+		                                                   hit,
+		                                                   emission_sd,
+		                                                   L);
+
+		if(result == VOLUME_PATH_SCATTERED) {
+			continue;
+		}
+		else if(result == VOLUME_PATH_MISSED) {
+			break;
 		}
-#endif  /* __VOLUME__ */
+#endif /* __VOLUME__*/
 
+		/* Shade background. */
 		if(!hit) {
-#ifdef __BACKGROUND__
-			/* sample background shader */
-			float3 L_background = indirect_background(kg, emission_sd, state, ray);
-			path_radiance_accum_background(L,
-			                               throughput,
-			                               L_background,
-			                               state->bounce);
-#endif  /* __BACKGROUND__ */
-
+			kernel_path_background(kg, state, ray, throughput, sd, L);
 			break;
 		}
-		else if(state->bounce > kernel_data.integrator.ao_bounces) {
+		else if(path_state_ao_bounce(kg, state)) {
 			break;
 		}
 
-		/* setup shading */
-		shader_setup_from_ray(kg,
-		                      sd,
-		                      &isect,
-		                      ray);
-		float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF);
-		shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT);
-#ifdef __BRANCHED_PATH__
-		shader_merge_closures(sd);
-#endif  /* __BRANCHED_PATH__ */
-
-		/* blurring of bsdf after bounces, for rays that have a small likelihood
-		 * of following this particular path (diffuse, rough glossy) */
-		if(kernel_data.integrator.filter_glossy != FLT_MAX) {
-			float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
-
-			if(blur_pdf < 1.0f) {
-				float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
-				shader_bsdf_blur(kg, sd, blur_roughness);
-			}
-		}
+		/* Setup shader data. */
+		shader_setup_from_ray(kg, sd, &isect, ray);
 
-#ifdef __EMISSION__
-		/* emission */
-		if(sd->flag & SD_EMISSION) {
-			float3 emission = indirect_primitive_emission(kg,
-			                                              sd,
-			                                              isect.t,
-			                                              state->flag,
-			                                              state->ray_pdf);
-			path_radiance_accum_emission(L, throughput, emission, state->bounce);
+		/* Skip most work for volume bounding surface. */
+#ifdef __VOLUME__
+		if(!(sd->flag & SD_HAS_ONLY_VOLUME)) {
+#endif
+
+		/* Evaluate shader. */
+		shader_eval_surface(kg, sd, state, state->flag);
+		shader_prepare_closures(sd, state);
+
+		/* Apply shadow catcher, holdout, emission. */
+		if(!kernel_path_shader_apply(kg,
+		                             sd,
+		                             state,
+		                             ray,
+		                             throughput,
+		                             emission_sd,
+		                             L,
+		                             NULL))
+		{
+			break;
 		}
-#endif  /* __EMISSION__ */
 
 		/* path termination. this is a strange place to put the termination, it's
 		 * mainly due to the mixed in MIS that we use. gives too many unneeded
 		 * shader evaluations, only need emission if we are going to terminate */
-		float probability =
-		        path_state_terminate_probability(kg,
-		                                         state,
-		                                         throughput*num_samples);
+		float probability = path_state_continuation_probability(kg, state, throughput);
 
 		if(probability == 0.0f) {
 			break;
 		}
 		else if(probability != 1.0f) {
-			float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
+			float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
 
 			if(terminate >= probability)
 				break;
@@ -354,50 +487,39 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 			throughput /= probability;
 		}
 
+		kernel_update_denoising_features(kg, sd, state, L);
+
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
-			kernel_path_ao(kg, sd, emission_sd, L, state, rng, throughput, make_float3(0.0f, 0.0f, 0.0f));
+			kernel_path_ao(kg, sd, emission_sd, L, state, throughput, make_float3(0.0f, 0.0f, 0.0f));
 		}
 #endif  /* __AO__ */
 
+
 #ifdef __SUBSURFACE__
 		/* bssrdf scatter to a different location on the same object, replacing
 		 * the closures with a diffuse BSDF */
 		if(sd->flag & SD_BSSRDF) {
-			float bssrdf_probability;
-			ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
-
-			/* modify throughput for picking bssrdf or bsdf */
-			throughput *= bssrdf_probability;
-
-			/* do bssrdf scatter step if we picked a bssrdf closure */
-			if(sc) {
-				uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
-
-				float bssrdf_u, bssrdf_v;
-				path_state_rng_2D(kg,
-				                  rng,
-				                  state,
-				                  PRNG_BSDF_U,
-				                  &bssrdf_u, &bssrdf_v);
-				subsurface_scatter_step(kg,
-				                        sd,
-				                        state,
-				                        state->flag,
-				                        sc,
-				                        &lcg_state,
-				                        bssrdf_u, bssrdf_v,
-				                        false);
+			if(kernel_path_subsurface_scatter(kg,
+			                                  sd,
+			                                  emission_sd,
+			                                  L,
+			                                  state,
+			                                  ray,
+			                                  &throughput,
+			                                  &ss_indirect))
+			{
+				break;
 			}
 		}
 #endif  /* __SUBSURFACE__ */
 
-#if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
+#if defined(__EMISSION__)
 		if(kernel_data.integrator.use_direct_light) {
-			int all = kernel_data.integrator.sample_all_lights_indirect;
+			int all = (kernel_data.integrator.sample_all_lights_indirect) ||
+			          (state->flag & PATH_RAY_SHADOW_CATCHER);
 			kernel_branched_path_surface_connect_light(kg,
-			                                           rng,
 			                                           sd,
 			                                           emission_sd,
 			                                           state,
@@ -406,205 +528,48 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 			                                           L,
 			                                           all);
 		}
-#endif  /* defined(__EMISSION__) && defined(__BRANCHED_PATH__) */
+#endif  /* defined(__EMISSION__) */
 
-		if(!kernel_path_surface_bounce(kg, rng, sd, &throughput, state, L, ray))
+#ifdef __VOLUME__
+		}
+#endif
+
+		if(!kernel_path_surface_bounce(kg, sd, &throughput, state, &L->state, ray))
 			break;
 	}
-}
 
 #ifdef __SUBSURFACE__
-#  ifndef __KERNEL_CUDA__
-ccl_device
-#  else
-ccl_device_inline
-#  endif
-bool kernel_path_subsurface_scatter(
-        KernelGlobals *kg,
-        ShaderData *sd,
-        ShaderData *emission_sd,
-        PathRadiance *L,
-        PathState *state,
-        RNG *rng,
-        Ray *ray,
-        float3 *throughput,
-        SubsurfaceIndirectRays *ss_indirect)
-{
-	float bssrdf_probability;
-	ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
-
-	/* modify throughput for picking bssrdf or bsdf */
-	*throughput *= bssrdf_probability;
-
-	/* do bssrdf scatter step if we picked a bssrdf closure */
-	if(sc) {
-		/* We should never have two consecutive BSSRDF bounces,
-		 * the second one should be converted to a diffuse BSDF to
-		 * avoid this.
+		/* Trace indirect subsurface rays by restarting the loop. this uses less
+		 * stack memory than invoking kernel_path_indirect.
 		 */
-		kernel_assert(!ss_indirect->tracing);
-
-		uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
-
-		SubsurfaceIntersection ss_isect;
-		float bssrdf_u, bssrdf_v;
-		path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-		int num_hits = subsurface_scatter_multi_intersect(kg,
-		                                                  &ss_isect,
-		                                                  sd,
-		                                                  sc,
-		                                                  &lcg_state,
-		                                                  bssrdf_u, bssrdf_v,
-		                                                  false);
-#  ifdef __VOLUME__
-		ss_indirect->need_update_volume_stack =
-		        kernel_data.integrator.use_volumes &&
-		        ccl_fetch(sd, object_flag) & SD_OBJECT_INTERSECTS_VOLUME;
-#  endif  /* __VOLUME__ */
-
-		/* compute lighting with the BSDF closure */
-		for(int hit = 0; hit < num_hits; hit++) {
-			/* NOTE: We reuse the existing ShaderData, we assume the path
-			 * integration loop stops when this function returns true.
-			 */
-			subsurface_scatter_multi_setup(kg,
-			                               &ss_isect,
-			                               hit,
-			                               sd,
-			                               state,
-			                               state->flag,
-			                               sc,
-			                               false);
-
-			PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays];
-			Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays];
-			float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays];
-			PathRadiance *hit_L = &ss_indirect->L[ss_indirect->num_rays];
-
-			*hit_state = *state;
-			*hit_ray = *ray;
-			*hit_tp = *throughput;
-
-			hit_state->rng_offset += PRNG_BOUNCE_NUM;
-
-			path_radiance_init(hit_L, kernel_data.film.use_light_pass);
-			hit_L->direct_throughput = L->direct_throughput;
-			path_radiance_copy_indirect(hit_L, L);
-
-			kernel_path_surface_connect_light(kg, rng, sd, emission_sd, *hit_tp, state, hit_L);
-
-			if(kernel_path_surface_bounce(kg,
-			                              rng,
-			                              sd,
-			                              hit_tp,
-			                              hit_state,
-			                              hit_L,
-			                              hit_ray))
-			{
-#  ifdef __LAMP_MIS__
-				hit_state->ray_t = 0.0f;
-#  endif  /* __LAMP_MIS__ */
-
-#  ifdef __VOLUME__
-				if(ss_indirect->need_update_volume_stack) {
-					Ray volume_ray = *ray;
-					/* Setup ray from previous surface point to the new one. */
-					volume_ray.D = normalize_len(hit_ray->P - volume_ray.P,
-					                             &volume_ray.t);
-
-					kernel_volume_stack_update_for_subsurface(
-					    kg,
-					    emission_sd,
-					    &volume_ray,
-					    hit_state->volume_stack);
-				}
-#  endif  /* __VOLUME__ */
-				path_radiance_reset_indirect(L);
-				ss_indirect->num_rays++;
-			}
-			else {
-				path_radiance_accum_sample(L, hit_L, 1);
-			}
+		if(ss_indirect.num_rays) {
+			kernel_path_subsurface_setup_indirect(kg,
+			                                      &ss_indirect,
+			                                      state,
+			                                      ray,
+			                                      L,
+			                                      &throughput);
 		}
-		return true;
-	}
-	return false;
-}
-
-ccl_device_inline void kernel_path_subsurface_init_indirect(
-        SubsurfaceIndirectRays *ss_indirect)
-{
-	ss_indirect->tracing = false;
-	ss_indirect->num_rays = 0;
-}
-
-ccl_device void kernel_path_subsurface_accum_indirect(
-        SubsurfaceIndirectRays *ss_indirect,
-        PathRadiance *L)
-{
-	if(ss_indirect->tracing) {
-		path_radiance_sum_indirect(L);
-		path_radiance_accum_sample(&ss_indirect->direct_L, L, 1);
-		if(ss_indirect->num_rays == 0) {
-			*L = ss_indirect->direct_L;
+		else {
+			break;
 		}
 	}
+#endif  /* __SUBSURFACE__ */
 }
 
-ccl_device void kernel_path_subsurface_setup_indirect(
-        KernelGlobals *kg,
-        SubsurfaceIndirectRays *ss_indirect,
-        PathState *state,
-        Ray *ray,
-        PathRadiance *L,
-        float3 *throughput)
-{
-	if(!ss_indirect->tracing) {
-		ss_indirect->direct_L = *L;
-	}
-	ss_indirect->tracing = true;
-
-	/* Setup state, ray and throughput for indirect SSS rays. */
-	ss_indirect->num_rays--;
-
-	Ray *indirect_ray = &ss_indirect->rays[ss_indirect->num_rays];
-	PathRadiance *indirect_L = &ss_indirect->L[ss_indirect->num_rays];
-
-	*state = ss_indirect->state[ss_indirect->num_rays];
-	*ray = *indirect_ray;
-	*L = *indirect_L;
-	*throughput = ss_indirect->throughputs[ss_indirect->num_rays];
-
-	state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM;
-}
-
-#endif  /* __SUBSURFACE__ */
+#endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */
 
-ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
-                                               RNG *rng,
-                                               int sample,
-                                               Ray ray,
-                                               ccl_global float *buffer)
+ccl_device_forceinline void kernel_path_integrate(
+	KernelGlobals *kg,
+	PathState *state,
+	float3 throughput,
+	Ray *ray,
+	PathRadiance *L,
+	ccl_global float *buffer,
+	ShaderData *emission_sd)
 {
-	/* initialize */
-	PathRadiance L;
-	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-	float L_transparent = 0.0f;
-
-	path_radiance_init(&L, kernel_data.film.use_light_pass);
-
-	/* shader data memory used for both volumes and surfaces, saves stack space */
+	/* Shader data memory used for both volumes and surfaces, saves stack space. */
 	ShaderData sd;
-	/* shader data used by emission, shadows, volume stacks */
-	ShaderData emission_sd;
-
-	PathState state;
-	path_state_init(kg, &emission_sd, &state, rng, sample, &ray);
-
-#ifdef __KERNEL_DEBUG__
-	DebugData debug_data;
-	debug_data_init(&debug_data);
-#endif  /* __KERNEL_DEBUG__ */
 
 #ifdef __SUBSURFACE__
 	SubsurfaceIndirectRays ss_indirect;
@@ -615,248 +580,89 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 
 	/* path iteration */
 	for(;;) {
-		/* intersect scene */
+		/* Find intersection with objects in scene. */
 		Intersection isect;
-		uint visibility = path_state_ray_visibility(kg, &state);
-
-#ifdef __HAIR__
-		float difl = 0.0f, extmax = 0.0f;
-		uint lcg_state = 0;
-
-		if(kernel_data.bvh.have_curves) {
-			if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {	
-				float3 pixdiff = ray.dD.dx + ray.dD.dy;
-				/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
-				difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
-			}
+		bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L);
 
-			extmax = kernel_data.curve.maximum_width;
-			lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
-		}
-
-		if(state.bounce > kernel_data.integrator.ao_bounces) {
-			visibility = PATH_RAY_SHADOW;
-			ray.t = kernel_data.background.ao_distance;
-		}
-
-		bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
-#else
-		bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#endif  /* __HAIR__ */
-
-#ifdef __KERNEL_DEBUG__
-		if(state.flag & PATH_RAY_CAMERA) {
-			debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes;
-			debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
-			debug_data.num_bvh_intersections += isect.num_intersections;
-		}
-		debug_data.num_ray_bounces++;
-#endif  /* __KERNEL_DEBUG__ */
-
-#ifdef __LAMP_MIS__
-		if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
-			/* ray starting from previous non-transparent bounce */
-			Ray light_ray;
-
-			light_ray.P = ray.P - state.ray_t*ray.D;
-			state.ray_t += isect.t;
-			light_ray.D = ray.D;
-			light_ray.t = state.ray_t;
-			light_ray.time = ray.time;
-			light_ray.dD = ray.dD;
-			light_ray.dP = ray.dP;
-
-			/* intersect with lamp */
-			float3 emission;
-
-			if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
-				path_radiance_accum_emission(&L, throughput, emission, state.bounce);
-		}
-#endif  /* __LAMP_MIS__ */
+		/* Find intersection with lamps and compute emission for MIS. */
+		kernel_path_lamp_emission(kg, state, ray, throughput, &isect, &sd, L);
 
 #ifdef __VOLUME__
-		/* Sanitize volume stack. */
-		if(!hit) {
-			kernel_volume_clean_stack(kg, state.volume_stack);
-		}
-		/* volume attenuation, emission, scatter */
-		if(state.volume_stack[0].shader != SHADER_NONE) {
-			Ray volume_ray = ray;
-			volume_ray.t = (hit)? isect.t: FLT_MAX;
-
-			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
-
-#  ifdef __VOLUME_DECOUPLED__
-			int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
-			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method);
-
-			if(decoupled) {
-				/* cache steps along volume for repeated sampling */
-				VolumeSegment volume_segment;
-
-				shader_setup_from_volume(kg, &sd, &volume_ray);
-				kernel_volume_decoupled_record(kg, &state,
-					&volume_ray, &sd, &volume_segment, heterogeneous);
-
-				volume_segment.sampling_method = sampling_method;
-
-				/* emission */
-				if(volume_segment.closure_flag & SD_EMISSION)
-					path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
-
-				/* scattering */
-				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
-
-				if(volume_segment.closure_flag & SD_SCATTER) {
-					int all = false;
-
-					/* direct light sampling */
-					kernel_branched_path_volume_connect_light(kg, rng, &sd,
-						&emission_sd, throughput, &state, &L, all,
-						&volume_ray, &volume_segment);
-
-					/* indirect sample. if we use distance sampling and take just
-					 * one sample for direct and indirect light, we could share
-					 * this computation, but makes code a bit complex */
-					float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
-					float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
-
-					result = kernel_volume_decoupled_scatter(kg,
-						&state, &volume_ray, &sd, &throughput,
-						rphase, rscatter, &volume_segment, NULL, true);
-				}
-
-				/* free cached steps */
-				kernel_volume_decoupled_free(kg, &volume_segment);
-
-				if(result == VOLUME_PATH_SCATTERED) {
-					if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
-						continue;
-					else
-						break;
-				}
-				else {
-					throughput *= volume_segment.accum_transmittance;
-				}
-			}
-			else
-#  endif  /* __VOLUME_DECOUPLED__ */
-			{
-				/* integrate along volume segment with distance sampling */
-				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous);
-
-#  ifdef __VOLUME_SCATTER__
-				if(result == VOLUME_PATH_SCATTERED) {
-					/* direct lighting */
-					kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
-
-					/* indirect light bounce */
-					if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
-						continue;
-					else
-						break;
-				}
-#  endif  /* __VOLUME_SCATTER__ */
-			}
+		/* Volume integration. */
+		VolumeIntegrateResult result = kernel_path_volume(kg,
+		                                                   &sd,
+		                                                   state,
+		                                                   ray,
+		                                                   &throughput,
+		                                                   &isect,
+		                                                   hit,
+		                                                   emission_sd,
+		                                                   L);
+
+		if(result == VOLUME_PATH_SCATTERED) {
+			continue;
+		}
+		else if(result == VOLUME_PATH_MISSED) {
+			break;
 		}
-#endif  /* __VOLUME__ */
+#endif /* __VOLUME__ */
 
+		/* Shade background. */
 		if(!hit) {
-			/* eval background shader if nothing hit */
-			if(kernel_data.background.transparent && (state.flag & PATH_RAY_CAMERA)) {
-				L_transparent += average(throughput);
-
-#ifdef __PASSES__
-				if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif  /* __PASSES__ */
-					break;
-			}
-
-#ifdef __BACKGROUND__
-			/* sample background shader */
-			float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
-			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
-#endif  /* __BACKGROUND__ */
-
+			kernel_path_background(kg, state, ray, throughput, &sd, L);
 			break;
 		}
-		else if(state.bounce > kernel_data.integrator.ao_bounces) {
+		else if(path_state_ao_bounce(kg, state)) {
 			break;
 		}
 
-		/* setup shading */
-		shader_setup_from_ray(kg, &sd, &isect, &ray);
-		float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF);
-		shader_eval_surface(kg, &sd, rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
-
-		/* holdout */
-#ifdef __HOLDOUT__
-		if(((sd.flag & SD_HOLDOUT) ||
-		    (sd.object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
-		   (state.flag & PATH_RAY_CAMERA))
-		{
-			if(kernel_data.background.transparent) {
-				float3 holdout_weight;
-				if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) {
-					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
-				}
-				else {
-					holdout_weight = shader_holdout_eval(kg, &sd);
-				}
-				/* any throughput is ok, should all be identical here */
-				L_transparent += average(holdout_weight*throughput);
-			}
+		/* Setup shader data. */
+		shader_setup_from_ray(kg, &sd, &isect, ray);
 
-			if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) {
-				break;
-			}
-		}
-#endif  /* __HOLDOUT__ */
-
-		/* holdout mask objects do not write data passes */
-		kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
-
-		/* blurring of bsdf after bounces, for rays that have a small likelihood
-		 * of following this particular path (diffuse, rough glossy) */
-		if(kernel_data.integrator.filter_glossy != FLT_MAX) {
-			float blur_pdf = kernel_data.integrator.filter_glossy*state.min_ray_pdf;
-
-			if(blur_pdf < 1.0f) {
-				float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
-				shader_bsdf_blur(kg, &sd, blur_roughness);
-			}
-		}
+		/* Skip most work for volume bounding surface. */
+#ifdef __VOLUME__
+		if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
+#endif
 
-#ifdef __EMISSION__
-		/* emission */
-		if(sd.flag & SD_EMISSION) {
-			/* todo: is isect.t wrong here for transparent surfaces? */
-			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
-			path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+		/* Evaluate shader. */
+		shader_eval_surface(kg, &sd, state, state->flag);
+		shader_prepare_closures(&sd, state);
+
+		/* Apply shadow catcher, holdout, emission. */
+		if(!kernel_path_shader_apply(kg,
+		                             &sd,
+		                             state,
+		                             ray,
+		                             throughput,
+		                             emission_sd,
+		                             L,
+		                             buffer))
+		{
+			break;
 		}
-#endif  /* __EMISSION__ */
 
 		/* path termination. this is a strange place to put the termination, it's
 		 * mainly due to the mixed in MIS that we use. gives too many unneeded
 		 * shader evaluations, only need emission if we are going to terminate */
-		float probability = path_state_terminate_probability(kg, &state, throughput);
+		float probability = path_state_continuation_probability(kg, state, throughput);
 
 		if(probability == 0.0f) {
 			break;
 		}
 		else if(probability != 1.0f) {
-			float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
+			float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
 			if(terminate >= probability)
 				break;
 
 			throughput /= probability;
 		}
 
+		kernel_update_denoising_features(kg, &sd, state, L);
+
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
-			kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
+			kernel_path_ao(kg, &sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, &sd));
 		}
 #endif  /* __AO__ */
 
@@ -866,11 +672,10 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 		if(sd.flag & SD_BSSRDF) {
 			if(kernel_path_subsurface_scatter(kg,
 			                                  &sd,
-			                                  &emission_sd,
-			                                  &L,
-			                                  &state,
-			                                  rng,
-			                                  &ray,
+			                                  emission_sd,
+			                                  L,
+			                                  state,
+			                                  ray,
 			                                  &throughput,
 			                                  &ss_indirect))
 			{
@@ -880,25 +685,27 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 #endif  /* __SUBSURFACE__ */
 
 		/* direct lighting */
-		kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
+		kernel_path_surface_connect_light(kg, &sd, emission_sd, throughput, state, L);
+
+#ifdef __VOLUME__
+		}
+#endif
 
 		/* compute direct lighting and next bounce */
-		if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
+		if(!kernel_path_surface_bounce(kg, &sd, &throughput, state, &L->state, ray))
 			break;
 	}
 
 #ifdef __SUBSURFACE__
-		kernel_path_subsurface_accum_indirect(&ss_indirect, &L);
-
 		/* Trace indirect subsurface rays by restarting the loop. this uses less
 		 * stack memory than invoking kernel_path_indirect.
 		 */
 		if(ss_indirect.num_rays) {
 			kernel_path_subsurface_setup_indirect(kg,
 			                                      &ss_indirect,
-			                                      &state,
-			                                      &ray,
-			                                      &L,
+			                                      state,
+			                                      ray,
+			                                      L,
 			                                      &throughput);
 		}
 		else {
@@ -906,48 +713,53 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 		}
 	}
 #endif  /* __SUBSURFACE__ */
-
-	float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
-
-	kernel_write_light_passes(kg, buffer, &L, sample);
-
-#ifdef __KERNEL_DEBUG__
-	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
-#endif  /* __KERNEL_DEBUG__ */
-
-	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
 }
 
 ccl_device void kernel_path_trace(KernelGlobals *kg,
-	ccl_global float *buffer, ccl_global uint *rng_state,
+	ccl_global float *buffer,
 	int sample, int x, int y, int offset, int stride)
 {
 	/* buffer offset */
 	int index = offset + x + y*stride;
 	int pass_stride = kernel_data.film.pass_stride;
 
-	rng_state += index;
 	buffer += index*pass_stride;
 
-	/* initialize random numbers and ray */
-	RNG rng;
+	/* Initialize random numbers and sample ray. */
+	uint rng_hash;
 	Ray ray;
 
-	kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
+	kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray);
 
-	/* integrate */
-	float4 L;
+	if(ray.t == 0.0f) {
+		return;
+	}
 
-	if(ray.t != 0.0f)
-		L = kernel_path_integrate(kg, &rng, sample, ray, buffer);
-	else
-		L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+	/* Initialize state. */
+	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
 
-	/* accumulate result in output buffer */
-	kernel_write_pass_float4(buffer, sample, L);
+	PathRadiance L;
+	path_radiance_init(&L, kernel_data.film.use_light_pass);
+
+	ShaderDataTinyStorage emission_sd_storage;
+	ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
 
-	path_rng_end(kg, rng_state, rng);
+	PathState state;
+	path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray);
+
+	/* Integrate. */
+	kernel_path_integrate(kg,
+	                      &state,
+	                      throughput,
+	                      &ray,
+	                      &L,
+	                      buffer,
+	                      emission_sd);
+
+	kernel_write_result(kg, buffer, sample, &L);
 }
 
+#endif  /* __SPLIT_KERNEL__ */
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index ff2b828795d..66f67c3e2c4 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -22,8 +22,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
                                                ShaderData *sd,
                                                ShaderData *emission_sd,
                                                PathRadiance *L,
-                                               PathState *state,
-                                               RNG *rng,
+                                               ccl_addr_space PathState *state,
                                                float3 throughput)
 {
 	int num_samples = kernel_data.integrator.ao_samples;
@@ -35,46 +34,225 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
 
 	for(int j = 0; j < num_samples; j++) {
 		float bsdf_u, bsdf_v;
-		path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+		path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 
 		float3 ao_D;
 		float ao_pdf;
 
 		sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
 
-		if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+		if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
 			Ray light_ray;
 			float3 ao_shadow;
 
-			light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+			light_ray.P = ray_offset(sd->P, sd->Ng);
 			light_ray.D = ao_D;
 			light_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
-			light_ray.time = ccl_fetch(sd, time);
-#endif  /* __OBJECT_MOTION__ */
-			light_ray.dP = ccl_fetch(sd, dP);
+			light_ray.time = sd->time;
+			light_ray.dP = sd->dP;
 			light_ray.dD = differential3_zero();
 
-			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow))
-				path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+			if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) {
+				path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
+			}
+			else {
+				path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf);
+			}
 		}
 	}
 }
 
+#ifndef __SPLIT_KERNEL__
+
+#ifdef __VOLUME__
+ccl_device_forceinline void kernel_branched_path_volume(
+	KernelGlobals *kg,
+	ShaderData *sd,
+	PathState *state,
+	Ray *ray,
+	float3 *throughput,
+	ccl_addr_space Intersection *isect,
+	bool hit,
+	ShaderData *indirect_sd,
+	ShaderData *emission_sd,
+	PathRadiance *L)
+{
+	/* Sanitize volume stack. */
+	if(!hit) {
+		kernel_volume_clean_stack(kg, state->volume_stack);
+	}
+
+	if(state->volume_stack[0].shader == SHADER_NONE) {
+		return;
+	}
+
+	/* volume attenuation, emission, scatter */
+	Ray volume_ray = *ray;
+	volume_ray.t = (hit)? isect->t: FLT_MAX;
+
+	bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+
+#  ifdef __VOLUME_DECOUPLED__
+	/* decoupled ray marching only supported on CPU */
+	if(kernel_data.integrator.volume_decoupled) {
+		/* cache steps along volume for repeated sampling */
+		VolumeSegment volume_segment;
+
+		shader_setup_from_volume(kg, sd, &volume_ray);
+		kernel_volume_decoupled_record(kg, state,
+			&volume_ray, sd, &volume_segment, heterogeneous);
+
+		/* direct light sampling */
+		if(volume_segment.closure_flag & SD_SCATTER) {
+			volume_segment.sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
+
+			int all = kernel_data.integrator.sample_all_lights_direct;
+
+			kernel_branched_path_volume_connect_light(kg, sd,
+				emission_sd, *throughput, state, L, all,
+				&volume_ray, &volume_segment);
+
+			/* indirect light sampling */
+			int num_samples = kernel_data.integrator.volume_samples;
+			float num_samples_inv = 1.0f/num_samples;
+
+			for(int j = 0; j < num_samples; j++) {
+				PathState ps = *state;
+				Ray pray = *ray;
+				float3 tp = *throughput;
+
+				/* branch RNG state */
+				path_state_branch(&ps, j, num_samples);
+
+				/* scatter sample. if we use distance sampling and take just one
+				 * sample for direct and indirect light, we could share this
+				 * computation, but that would make the code a bit complex */
+				float rphase = path_state_rng_1D(kg, &ps, PRNG_PHASE_CHANNEL);
+				float rscatter = path_state_rng_1D(kg, &ps, PRNG_SCATTER_DISTANCE);
+
+				VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+					&ps, &pray, sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
+
+				if(result == VOLUME_PATH_SCATTERED &&
+				   kernel_path_volume_bounce(kg,
+				                             sd,
+				                             &tp,
+				                             &ps,
+				                             &L->state,
+				                             &pray))
+				{
+					kernel_path_indirect(kg,
+					                     indirect_sd,
+					                     emission_sd,
+					                     &pray,
+					                     tp*num_samples_inv,
+					                     &ps,
+					                     L);
+
+					/* for render passes, sum and reset indirect light pass variables
+					 * for the next samples */
+					path_radiance_sum_indirect(L);
+					path_radiance_reset_indirect(L);
+				}
+			}
+		}
+
+		/* emission and transmittance */
+		if(volume_segment.closure_flag & SD_EMISSION)
+			path_radiance_accum_emission(L, state, *throughput, volume_segment.accum_emission);
+		*throughput *= volume_segment.accum_transmittance;
+
+		/* free cached steps */
+		kernel_volume_decoupled_free(kg, &volume_segment);
+	}
+	else
+#  endif  /* __VOLUME_DECOUPLED__ */
+	{
+		/* GPU: no decoupled ray marching, scatter probabilistically */
+		int num_samples = kernel_data.integrator.volume_samples;
+		float num_samples_inv = 1.0f/num_samples;
+
+		/* todo: we should cache the shader evaluations from stepping
+		 * through the volume, for now we redo them multiple times */
+
+		for(int j = 0; j < num_samples; j++) {
+			PathState ps = *state;
+			Ray pray = *ray;
+			float3 tp = (*throughput) * num_samples_inv;
+
+			/* branch RNG state */
+			path_state_branch(&ps, j, num_samples);
+
+			VolumeIntegrateResult result = kernel_volume_integrate(
+				kg, &ps, sd, &volume_ray, L, &tp, heterogeneous);
+
+#  ifdef __VOLUME_SCATTER__
+			if(result == VOLUME_PATH_SCATTERED) {
+				/* todo: support equiangular, MIS and all light sampling.
+				 * alternatively get decoupled ray marching working on the GPU */
+				kernel_path_volume_connect_light(kg, sd, emission_sd, tp, state, L);
+
+				if(kernel_path_volume_bounce(kg,
+				                             sd,
+				                             &tp,
+				                             &ps,
+				                             &L->state,
+				                             &pray))
+				{
+					kernel_path_indirect(kg,
+					                     indirect_sd,
+					                     emission_sd,
+					                     &pray,
+					                     tp,
+					                     &ps,
+					                     L);
+
+					/* for render passes, sum and reset indirect light pass variables
+					 * for the next samples */
+					path_radiance_sum_indirect(L);
+					path_radiance_reset_indirect(L);
+				}
+			}
+# endif  /* __VOLUME_SCATTER__ */
+		}
+
+		/* todo: avoid this calculation using decoupled ray marching */
+		kernel_volume_shadow(kg, emission_sd, state, &volume_ray, throughput);
+	}
+}
+#endif  /* __VOLUME__ */
 
 /* bounce off surface and integrate indirect light */
 ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
-	RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
+	ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
 	float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
 {
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		const ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
+	float sum_sample_weight = 0.0f;
+#ifdef __DENOISING_FEATURES__
+	if(state->denoising_feature_weight > 0.0f) {
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
+
+			/* transparency is not handled here, but in outer loop */
+			if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+				continue;
+			}
+
+			sum_sample_weight += sc->sample_weight;
+		}
+	}
+	else {
+		sum_sample_weight = 1.0f;
+	}
+#endif  /* __DENOISING_FEATURES__ */
+
+	for(int i = 0; i < sd->num_closure; i++) {
+		const ShaderClosure *sc = &sd->closure[i];
 
-		if(!CLOSURE_IS_BSDF(sc->type))
-			continue;
 		/* transparency is not handled here, but in outer loop */
-		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
+		if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
 			continue;
+		}
 
 		int num_samples;
 
@@ -90,34 +268,38 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
 		num_samples = ceil_to_int(num_samples_adjust*num_samples);
 
 		float num_samples_inv = num_samples_adjust/num_samples;
-		RNG bsdf_rng = cmj_hash(*rng, i);
 
 		for(int j = 0; j < num_samples; j++) {
 			PathState ps = *state;
 			float3 tp = throughput;
 			Ray bsdf_ray;
+#ifdef __SHADOW_TRICKS__
+			float shadow_transparency = L->shadow_transparency;
+#endif
+
+			ps.rng_hash = cmj_hash(state->rng_hash, i);
 
 			if(!kernel_branched_path_surface_bounce(kg,
-			                                        &bsdf_rng,
 			                                        sd,
 			                                        sc,
 			                                        j,
 			                                        num_samples,
 			                                        &tp,
 			                                        &ps,
-			                                        L,
-			                                        &bsdf_ray))
+			                                        &L->state,
+			                                        &bsdf_ray,
+			                                        sum_sample_weight))
 			{
 				continue;
 			}
 
+			ps.rng_hash = state->rng_hash;
+
 			kernel_path_indirect(kg,
 			                     indirect_sd,
 			                     emission_sd,
-			                     rng,
 			                     &bsdf_ray,
 			                     tp*num_samples_inv,
-			                     num_samples,
 			                     &ps,
 			                     L);
 
@@ -125,6 +307,10 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
 			 * for the next samples */
 			path_radiance_sum_indirect(L);
 			path_radiance_reset_indirect(L);
+
+#ifdef __SHADOW_TRICKS__
+			L->shadow_transparency = shadow_transparency;
+#endif
 		}
 	}
 }
@@ -136,40 +322,47 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
                                                         ShaderData *emission_sd,
                                                         PathRadiance *L,
                                                         PathState *state,
-                                                        RNG *rng,
                                                         Ray *ray,
                                                         float3 throughput)
 {
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(!CLOSURE_IS_BSSRDF(sc->type))
 			continue;
 
 		/* set up random number generator */
-		uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
-		int num_samples = kernel_data.integrator.subsurface_samples;
+		uint lcg_state = lcg_state_init(state, 0x68bc21eb);
+		int num_samples = kernel_data.integrator.subsurface_samples * 3;
 		float num_samples_inv = 1.0f/num_samples;
-		RNG bssrdf_rng = cmj_hash(*rng, i);
+		uint bssrdf_rng_hash = cmj_hash(state->rng_hash, i);
 
 		/* do subsurface scatter step with copy of shader data, this will
 		 * replace the BSSRDF with a diffuse BSDF closure */
 		for(int j = 0; j < num_samples; j++) {
-			SubsurfaceIntersection ss_isect;
+			PathState hit_state = *state;
+			path_state_branch(&hit_state, j, num_samples);
+			hit_state.rng_hash = bssrdf_rng_hash;
+
+			LocalIntersection ss_isect;
 			float bssrdf_u, bssrdf_v;
-			path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+			path_state_rng_2D(kg, &hit_state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
 			int num_hits = subsurface_scatter_multi_intersect(kg,
 			                                                  &ss_isect,
 			                                                  sd,
+			                                                  &hit_state,
 			                                                  sc,
 			                                                  &lcg_state,
 			                                                  bssrdf_u, bssrdf_v,
 			                                                  true);
+
+			hit_state.rng_offset += PRNG_BOUNCE_NUM;
+
 #ifdef __VOLUME__
 			Ray volume_ray = *ray;
 			bool need_update_volume_stack =
 			        kernel_data.integrator.use_volumes &&
-			        ccl_fetch(sd, object_flag) & SD_OBJECT_INTERSECTS_VOLUME;
+			        sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
 #endif  /* __VOLUME__ */
 
 			/* compute lighting with the BSDF closure */
@@ -179,14 +372,8 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 				                               &ss_isect,
 				                               hit,
 				                               &bssrdf_sd,
-				                               state,
-				                               state->flag,
-				                               sc,
-				                               true);
-
-				PathState hit_state = *state;
-
-				path_state_branch(&hit_state, j, num_samples);
+				                               &hit_state,
+				                               sc);
 
 #ifdef __VOLUME__
 				if(need_update_volume_stack) {
@@ -195,6 +382,10 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 					volume_ray.D = normalize_len(P - volume_ray.P,
 					                             &volume_ray.t);
 
+					for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+						hit_state.volume_stack[k] = state->volume_stack[k];
+					}
+
 					kernel_volume_stack_update_for_subsurface(
 					    kg,
 					    emission_sd,
@@ -206,10 +397,10 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 #ifdef __EMISSION__
 				/* direct light */
 				if(kernel_data.integrator.use_direct_light) {
-					int all = kernel_data.integrator.sample_all_lights_direct;
+					int all = (kernel_data.integrator.sample_all_lights_direct) ||
+					          (hit_state.flag & PATH_RAY_SHADOW_CATCHER);
 					kernel_branched_path_surface_connect_light(
 					        kg,
-					        rng,
 					        &bssrdf_sd,
 					        emission_sd,
 					        &hit_state,
@@ -223,7 +414,6 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 				/* indirect light */
 				kernel_branched_path_surface_indirect_light(
 				        kg,
-				        rng,
 				        &bssrdf_sd,
 				        indirect_sd,
 				        emission_sd,
@@ -237,284 +427,93 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 }
 #endif  /* __SUBSURFACE__ */
 
-ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
+ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
+                                               uint rng_hash,
+                                               int sample,
+                                               Ray ray,
+                                               ccl_global float *buffer,
+                                               PathRadiance *L)
 {
 	/* initialize */
-	PathRadiance L;
 	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-	float L_transparent = 0.0f;
 
-	path_radiance_init(&L, kernel_data.film.use_light_pass);
+	path_radiance_init(L, kernel_data.film.use_light_pass);
 
 	/* shader data memory used for both volumes and surfaces, saves stack space */
 	ShaderData sd;
 	/* shader data used by emission, shadows, volume stacks, indirect path */
-	ShaderData emission_sd, indirect_sd;
+	ShaderDataTinyStorage emission_sd_storage;
+	ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+	ShaderData indirect_sd;
 
 	PathState state;
-	path_state_init(kg, &emission_sd, &state, rng, sample, &ray);
-
-#ifdef __KERNEL_DEBUG__
-	DebugData debug_data;
-	debug_data_init(&debug_data);
-#endif  /* __KERNEL_DEBUG__ */
+	path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray);
 
 	/* Main Loop
 	 * Here we only handle transparency intersections from the camera ray.
 	 * Indirect bounces are handled in kernel_branched_path_surface_indirect_light().
 	 */
 	for(;;) {
-		/* intersect scene */
+		/* Find intersection with objects in scene. */
 		Intersection isect;
-		uint visibility = path_state_ray_visibility(kg, &state);
-
-#ifdef __HAIR__
-		float difl = 0.0f, extmax = 0.0f;
-		uint lcg_state = 0;
-
-		if(kernel_data.bvh.have_curves) {
-			if(kernel_data.cam.resolution == 1) {
-				float3 pixdiff = ray.dD.dx + ray.dD.dy;
-				/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
-				difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
-			}
-
-			extmax = kernel_data.curve.maximum_width;
-			lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
-		}
-
-		bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
-#else
-		bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#endif  /* __HAIR__ */
-
-#ifdef __KERNEL_DEBUG__
-		debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes;
-		debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
-		debug_data.num_bvh_intersections += isect.num_intersections;
-		debug_data.num_ray_bounces++;
-#endif  /* __KERNEL_DEBUG__ */
+		bool hit = kernel_path_scene_intersect(kg, &state, &ray, &isect, L);
 
 #ifdef __VOLUME__
-		/* Sanitize volume stack. */
-		if(!hit) {
-			kernel_volume_clean_stack(kg, state.volume_stack);
-		}
-		/* volume attenuation, emission, scatter */
-		if(state.volume_stack[0].shader != SHADER_NONE) {
-			Ray volume_ray = ray;
-			volume_ray.t = (hit)? isect.t: FLT_MAX;
-			
-			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
-
-#ifdef __VOLUME_DECOUPLED__
-			/* decoupled ray marching only supported on CPU */
-
-			/* cache steps along volume for repeated sampling */
-			VolumeSegment volume_segment;
-
-			shader_setup_from_volume(kg, &sd, &volume_ray);
-			kernel_volume_decoupled_record(kg, &state,
-				&volume_ray, &sd, &volume_segment, heterogeneous);
-
-			/* direct light sampling */
-			if(volume_segment.closure_flag & SD_SCATTER) {
-				volume_segment.sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
-
-				int all = kernel_data.integrator.sample_all_lights_direct;
-
-				kernel_branched_path_volume_connect_light(kg, rng, &sd,
-					&emission_sd, throughput, &state, &L, all,
-					&volume_ray, &volume_segment);
-
-				/* indirect light sampling */
-				int num_samples = kernel_data.integrator.volume_samples;
-				float num_samples_inv = 1.0f/num_samples;
-
-				for(int j = 0; j < num_samples; j++) {
-					/* workaround to fix correlation bug in T38710, can find better solution
-					 * in random number generator later, for now this is done here to not impact
-					 * performance of rendering without volumes */
-					RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
-
-					PathState ps = state;
-					Ray pray = ray;
-					float3 tp = throughput;
-
-					/* branch RNG state */
-					path_state_branch(&ps, j, num_samples);
-
-					/* scatter sample. if we use distance sampling and take just one
-					 * sample for direct and indirect light, we could share this
-					 * computation, but makes code a bit complex */
-					float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE);
-					float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE);
-
-					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
-						&ps, &pray, &sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
-
-					(void)result;
-					kernel_assert(result == VOLUME_PATH_SCATTERED);
-
-					if(kernel_path_volume_bounce(kg,
-					                             rng,
-					                             &sd,
-					                             &tp,
-					                             &ps,
-					                             &L,
-					                             &pray))
-					{
-						kernel_path_indirect(kg,
-						                     &indirect_sd,
-						                     &emission_sd,
-						                     rng,
-						                     &pray,
-						                     tp*num_samples_inv,
-						                     num_samples,
-						                     &ps,
-						                     &L);
-
-						/* for render passes, sum and reset indirect light pass variables
-						 * for the next samples */
-						path_radiance_sum_indirect(&L);
-						path_radiance_reset_indirect(&L);
-					}
-				}
-			}
-
-			/* emission and transmittance */
-			if(volume_segment.closure_flag & SD_EMISSION)
-				path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
-			throughput *= volume_segment.accum_transmittance;
-
-			/* free cached steps */
-			kernel_volume_decoupled_free(kg, &volume_segment);
-#else
-			/* GPU: no decoupled ray marching, scatter probalistically */
-			int num_samples = kernel_data.integrator.volume_samples;
-			float num_samples_inv = 1.0f/num_samples;
-
-			/* todo: we should cache the shader evaluations from stepping
-			 * through the volume, for now we redo them multiple times */
-
-			for(int j = 0; j < num_samples; j++) {
-				PathState ps = state;
-				Ray pray = ray;
-				float3 tp = throughput * num_samples_inv;
-
-				/* branch RNG state */
-				path_state_branch(&ps, j, num_samples);
-
-				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous);
-
-#ifdef __VOLUME_SCATTER__
-				if(result == VOLUME_PATH_SCATTERED) {
-					/* todo: support equiangular, MIS and all light sampling.
-					 * alternatively get decoupled ray marching working on the GPU */
-					kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L);
-
-					if(kernel_path_volume_bounce(kg,
-					                             rng,
-					                             &sd,
-					                             &tp,
-					                             &ps,
-					                             &L,
-					                             &pray))
-					{
-						kernel_path_indirect(kg,
-						                     &indirect_sd,
-						                     &emission_sd,
-						                     rng,
-						                     &pray,
-						                     tp,
-						                     num_samples,
-						                     &ps,
-						                     &L);
-
-						/* for render passes, sum and reset indirect light pass variables
-						 * for the next samples */
-						path_radiance_sum_indirect(&L);
-						path_radiance_reset_indirect(&L);
-					}
-				}
-#endif  /* __VOLUME_SCATTER__ */
-			}
-
-			/* todo: avoid this calculation using decoupled ray marching */
-			kernel_volume_shadow(kg, &emission_sd, &state, &volume_ray, &throughput);
-#endif  /* __VOLUME_DECOUPLED__ */
-		}
+		/* Volume integration. */
+		kernel_branched_path_volume(kg,
+		                            &sd,
+		                            &state,
+		                            &ray,
+		                            &throughput,
+		                            &isect,
+		                            hit,
+		                            &indirect_sd,
+		                            emission_sd,
+		                            L);
 #endif  /* __VOLUME__ */
 
+		/* Shade background. */
 		if(!hit) {
-			/* eval background shader if nothing hit */
-			if(kernel_data.background.transparent) {
-				L_transparent += average(throughput);
-
-#ifdef __PASSES__
-				if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif  /* __PASSES__ */
-					break;
-			}
-
-#ifdef __BACKGROUND__
-			/* sample background shader */
-			float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
-			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
-#endif  /* __BACKGROUND__ */
-
+			kernel_path_background(kg, &state, &ray, throughput, &sd, L);
 			break;
 		}
 
-		/* setup shading */
+		/* Setup and evaluate shader. */
 		shader_setup_from_ray(kg, &sd, &isect, &ray);
-		shader_eval_surface(kg, &sd, rng, &state, 0.0f, state.flag, SHADER_CONTEXT_MAIN);
-		shader_merge_closures(&sd);
 
-		/* holdout */
-#ifdef __HOLDOUT__
-		if((sd.flag & SD_HOLDOUT) || (sd.object_flag & SD_OBJECT_HOLDOUT_MASK)) {
-			if(kernel_data.background.transparent) {
-				float3 holdout_weight;
-				if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) {
-					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
-				}
-				else {
-					holdout_weight = shader_holdout_eval(kg, &sd);
-				}
-				/* any throughput is ok, should all be identical here */
-				L_transparent += average(holdout_weight*throughput);
-			}
-			if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) {
-				break;
-			}
-		}
-#endif  /* __HOLDOUT__ */
+		/* Skip most work for volume bounding surface. */
+#ifdef __VOLUME__
+		if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
+#endif
 
-		/* holdout mask objects do not write data passes */
-		kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
+		shader_eval_surface(kg, &sd, &state, state.flag);
+		shader_merge_closures(&sd);
 
-#ifdef __EMISSION__
-		/* emission */
-		if(sd.flag & SD_EMISSION) {
-			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
-			path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+		/* Apply shadow catcher, holdout, emission. */
+		if(!kernel_path_shader_apply(kg,
+		                             &sd,
+		                             &state,
+		                             &ray,
+		                             throughput,
+		                             emission_sd,
+		                             L,
+		                             buffer))
+		{
+			break;
 		}
-#endif  /* __EMISSION__ */
 
 		/* transparency termination */
 		if(state.flag & PATH_RAY_TRANSPARENT) {
 			/* path termination. this is a strange place to put the termination, it's
 			 * mainly due to the mixed in MIS that we use. gives too many unneeded
 			 * shader evaluations, only need emission if we are going to terminate */
-			float probability = path_state_terminate_probability(kg, &state, throughput);
+			float probability = path_state_continuation_probability(kg, &state, throughput);
 
 			if(probability == 0.0f) {
 				break;
 			}
 			else if(probability != 1.0f) {
-				float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
+				float terminate = path_state_rng_1D(kg, &state, PRNG_TERMINATE);
 
 				if(terminate >= probability)
 					break;
@@ -523,52 +522,60 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			}
 		}
 
+		kernel_update_denoising_features(kg, &sd, &state, L);
+
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
-			kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
+			kernel_branched_path_ao(kg, &sd, emission_sd, L, &state, throughput);
 		}
 #endif  /* __AO__ */
 
 #ifdef __SUBSURFACE__
 		/* bssrdf scatter to a different location on the same object */
 		if(sd.flag & SD_BSSRDF) {
-			kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
-			                                        &L, &state, rng, &ray, throughput);
+			kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, emission_sd,
+			                                        L, &state, &ray, throughput);
 		}
 #endif  /* __SUBSURFACE__ */
 
-		if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
-			PathState hit_state = state;
+		PathState hit_state = state;
 
 #ifdef __EMISSION__
-			/* direct light */
-			if(kernel_data.integrator.use_direct_light) {
-				int all = kernel_data.integrator.sample_all_lights_direct;
-				kernel_branched_path_surface_connect_light(kg, rng,
-					&sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all);
-			}
+		/* direct light */
+		if(kernel_data.integrator.use_direct_light) {
+			int all = (kernel_data.integrator.sample_all_lights_direct) ||
+					  (state.flag & PATH_RAY_SHADOW_CATCHER);
+			kernel_branched_path_surface_connect_light(kg,
+				&sd, emission_sd, &hit_state, throughput, 1.0f, L, all);
+		}
 #endif  /* __EMISSION__ */
 
-			/* indirect light */
-			kernel_branched_path_surface_indirect_light(kg, rng,
-				&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L);
+		/* indirect light */
+		kernel_branched_path_surface_indirect_light(kg,
+			&sd, &indirect_sd, emission_sd, throughput, 1.0f, &hit_state, L);
 
-			/* continue in case of transparency */
-			throughput *= shader_bsdf_transparency(kg, &sd);
+		/* continue in case of transparency */
+		throughput *= shader_bsdf_transparency(kg, &sd);
 
-			if(is_zero(throughput))
-				break;
-		}
+		if(is_zero(throughput))
+			break;
 
 		/* Update Path State */
-		state.flag |= PATH_RAY_TRANSPARENT;
-		state.transparent_bounce++;
+		path_state_next(kg, &state, LABEL_TRANSPARENT);
+
+#ifdef __VOLUME__
+		}
+		else {
+			if(!path_state_volume_next(kg, &state)) {
+				break;
+			}
+		}
+#endif
 
 		ray.P = ray_offset(sd.P, -sd.Ng);
 		ray.t -= sd.ray_length; /* clipping works through transparent */
 
-
 #ifdef __RAY_DIFFERENTIALS__
 		ray.dP = sd.dP;
 		ray.dD.dx = -sd.dI.dx;
@@ -580,50 +587,35 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
 #endif  /* __VOLUME__ */
 	}
-
-	float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
-
-	kernel_write_light_passes(kg, buffer, &L, sample);
-
-#ifdef __KERNEL_DEBUG__
-	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
-#endif  /* __KERNEL_DEBUG__ */
-
-	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
 }
 
 ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
-	ccl_global float *buffer, ccl_global uint *rng_state,
+	ccl_global float *buffer,
 	int sample, int x, int y, int offset, int stride)
 {
 	/* buffer offset */
 	int index = offset + x + y*stride;
 	int pass_stride = kernel_data.film.pass_stride;
 
-	rng_state += index;
 	buffer += index*pass_stride;
 
 	/* initialize random numbers and ray */
-	RNG rng;
+	uint rng_hash;  /* output of kernel_path_trace_setup() below */
 	Ray ray;
 
-	kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
+	kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray);
 
 	/* integrate */
-	float4 L;
-
-	if(ray.t != 0.0f)
-		L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer);
-	else
-		L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
-	/* accumulate result in output buffer */
-	kernel_write_pass_float4(buffer, sample, L);
+	PathRadiance L;  /* initialized by kernel_branched_path_integrate() via path_radiance_init() */
 
-	path_rng_end(kg, rng_state, rng);
+	if(ray.t != 0.0f) {  /* rays with t == 0 are skipped: nothing is integrated or written */
+		kernel_branched_path_integrate(kg, rng_hash, sample, ray, buffer, &L);
+		kernel_write_result(kg, buffer, sample, &L);
+	}
 }
 
+#endif  /* __SPLIT_KERNEL__ */
+
 #endif  /* __BRANCHED_PATH__ */
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h
index 7b903556bf9..d83fd474cde 100644
--- a/intern/cycles/kernel/kernel_path_common.h
+++ b/intern/cycles/kernel/kernel_path_common.h
@@ -14,15 +14,14 @@
  * limitations under the License.
  */
 
-#include "util_hash.h"
+#include "util/util_hash.h"
 
 CCL_NAMESPACE_BEGIN
 
 ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg,
-                                               ccl_global uint *rng_state,
                                                int sample,
                                                int x, int y,
-                                               ccl_addr_space RNG *rng,
+                                               uint *rng_hash,
                                                ccl_addr_space Ray *ray)
 {
 	float filter_u;
@@ -30,24 +29,20 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg,
 
 	int num_samples = kernel_data.integrator.aa_samples;
 
-	if(sample == kernel_data.integrator.start_sample) {
-		*rng_state = hash_int_2d(x, y);
-	}
-
-	path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v);
+	path_rng_init(kg, sample, num_samples, rng_hash, x, y, &filter_u, &filter_v);
 
 	/* sample camera ray */
 
 	float lens_u = 0.0f, lens_v = 0.0f;
 
 	if(kernel_data.cam.aperturesize > 0.0f)
-		path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
+		path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
 
 	float time = 0.0f;
 
 #ifdef __CAMERA_MOTION__
 	if(kernel_data.cam.shuttertime != -1.0f)
-		time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME);
+		time = path_rng_1D(kg, *rng_hash, sample, num_samples, PRNG_TIME);
 #endif
 
 	camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index 661dc52fb31..8a358e51f94 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -19,15 +19,17 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void path_state_init(KernelGlobals *kg,
                                        ShaderData *stack_sd,
                                        ccl_addr_space PathState *state,
-                                       ccl_addr_space RNG *rng,
+                                       uint rng_hash,
                                        int sample,
                                        ccl_addr_space Ray *ray)
 {
-	state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP;
+	state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP|PATH_RAY_TRANSPARENT_BACKGROUND;
 
+	state->rng_hash = rng_hash;
 	state->rng_offset = PRNG_BASE_NUM;
 	state->sample = sample;
 	state->num_samples = kernel_data.integrator.aa_samples;
+	state->branch_factor = 1.0f;
 
 	state->bounce = 0;
 	state->diffuse_bounce = 0;
@@ -35,6 +37,16 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
 	state->transmission_bounce = 0;
 	state->transparent_bounce = 0;
 
+#ifdef __DENOISING_FEATURES__
+	if(kernel_data.film.pass_denoising_data) {
+		state->flag |= PATH_RAY_STORE_SHADOW_INFO;
+		state->denoising_feature_weight = 1.0f;
+	}
+	else {
+		state->denoising_feature_weight = 0.0f;
+	}
+#endif  /* __DENOISING_FEATURES__ */
+
 	state->min_ray_pdf = FLT_MAX;
 	state->ray_pdf = 0.0f;
 #ifdef __LAMP_MIS__
@@ -43,12 +55,11 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
 
 #ifdef __VOLUME__
 	state->volume_bounce = 0;
+	state->volume_bounds_bounce = 0;
 
 	if(kernel_data.integrator.use_volumes) {
 		/* Initialize volume stack with volume we are inside of. */
 		kernel_volume_stack_init(kg, stack_sd, state, ray, state->volume_stack);
-		/* Seed RNG for cases where we can't use stratified samples .*/
-		state->rng_congruential = lcg_init(*rng + sample*0x51633e2d);
 	}
 	else {
 		state->volume_stack[0].shader = SHADER_NONE;
@@ -63,25 +74,36 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
 	if(label & LABEL_TRANSPARENT) {
 		state->flag |= PATH_RAY_TRANSPARENT;
 		state->transparent_bounce++;
-
-		/* don't increase random number generator offset here, to avoid some
-		 * unwanted patterns, see path_state_rng_1D_for_decision */
+		if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
+			state->flag |= PATH_RAY_TERMINATE_IMMEDIATE;
+		}
 
 		if(!kernel_data.integrator.transparent_shadows)
 			state->flag |= PATH_RAY_MIS_SKIP;
 
+		/* random number generator next bounce */
+		state->rng_offset += PRNG_BOUNCE_NUM;
+
 		return;
 	}
 
 	state->bounce++;
+	if(state->bounce >= kernel_data.integrator.max_bounce) {
+		state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+	}
+
+	state->flag &= ~(PATH_RAY_ALL_VISIBILITY|PATH_RAY_MIS_SKIP);
 
 #ifdef __VOLUME__
 	if(label & LABEL_VOLUME_SCATTER) {
 		/* volume scatter */
 		state->flag |= PATH_RAY_VOLUME_SCATTER;
-		state->flag &= ~(PATH_RAY_REFLECT|PATH_RAY_TRANSMIT|PATH_RAY_CAMERA|PATH_RAY_TRANSPARENT|PATH_RAY_DIFFUSE|PATH_RAY_GLOSSY|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP);
+		state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
 
 		state->volume_bounce++;
+		if(state->volume_bounce >= kernel_data.integrator.max_volume_bounce) {
+			state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+		}
 	}
 	else
 #endif
@@ -89,44 +111,79 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
 		/* surface reflection/transmission */
 		if(label & LABEL_REFLECT) {
 			state->flag |= PATH_RAY_REFLECT;
-			state->flag &= ~(PATH_RAY_TRANSMIT|PATH_RAY_VOLUME_SCATTER|PATH_RAY_CAMERA|PATH_RAY_TRANSPARENT);
+			state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
 
-			if(label & LABEL_DIFFUSE)
+			if(label & LABEL_DIFFUSE) {
 				state->diffuse_bounce++;
-			else
+				if(state->diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) {
+					state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+				}
+			}
+			else {
 				state->glossy_bounce++;
+				if(state->glossy_bounce >= kernel_data.integrator.max_glossy_bounce) {
+					state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+				}
+			}
 		}
 		else {
 			kernel_assert(label & LABEL_TRANSMIT);
 
 			state->flag |= PATH_RAY_TRANSMIT;
-			state->flag &= ~(PATH_RAY_REFLECT|PATH_RAY_VOLUME_SCATTER|PATH_RAY_CAMERA|PATH_RAY_TRANSPARENT);
+
+			if(!(label & LABEL_TRANSMIT_TRANSPARENT)) {
+				state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+			}
 
 			state->transmission_bounce++;
+			if(state->transmission_bounce >= kernel_data.integrator.max_transmission_bounce) {
+				state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+			}
 		}
 
 		/* diffuse/glossy/singular */
 		if(label & LABEL_DIFFUSE) {
 			state->flag |= PATH_RAY_DIFFUSE|PATH_RAY_DIFFUSE_ANCESTOR;
-			state->flag &= ~(PATH_RAY_GLOSSY|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP);
 		}
 		else if(label & LABEL_GLOSSY) {
 			state->flag |= PATH_RAY_GLOSSY;
-			state->flag &= ~(PATH_RAY_DIFFUSE|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP);
 		}
 		else {
 			kernel_assert(label & LABEL_SINGULAR);
-
 			state->flag |= PATH_RAY_GLOSSY|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP;
-			state->flag &= ~PATH_RAY_DIFFUSE;
 		}
 	}
 
 	/* random number generator next bounce */
 	state->rng_offset += PRNG_BOUNCE_NUM;
+
+#ifdef __DENOISING_FEATURES__
+	if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) {
+		state->flag &= ~PATH_RAY_STORE_SHADOW_INFO;
+	}
+#endif
 }
 
-ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state)
+#ifdef __VOLUME__
+ccl_device_inline bool path_state_volume_next(KernelGlobals *kg, ccl_addr_space PathState *state)
+{
+	/* Advance the path state after passing through a volume bounding mesh. This does not count
+	 * as a transparent bounce; the counter is only a sanity check in case self intersection gets us stuck. */
+	state->volume_bounds_bounce++;
+	if (state->volume_bounds_bounce > VOLUME_BOUNDS_MAX) {
+		return false;  /* Limit exceeded, tell the caller to stop following this path. */
+	}
+
+	/* Random number generator next bounce, skipped on the first call (volume_bounds_bounce == 1). */
+	if(state->volume_bounds_bounce > 1) {
+		state->rng_offset += PRNG_BOUNCE_NUM;
+	}
+
+	return true;
+}
+#endif
+
+ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, ccl_addr_space PathState *state)
 {
 	uint flag = state->flag & PATH_RAY_ALL_VISIBILITY;
 
@@ -140,34 +197,42 @@ ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *s
 	return flag;
 }
 
-ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_addr_space PathState *state, const float3 throughput)
+ccl_device_inline float path_state_continuation_probability(KernelGlobals *kg,
+                                                            ccl_addr_space PathState *state,
+                                                            const float3 throughput)
 {
-	if(state->flag & PATH_RAY_TRANSPARENT) {
-		/* transparent rays treated separately */
-		if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce)
-			return 0.0f;
-		else if(state->transparent_bounce <= kernel_data.integrator.transparent_min_bounce)
+	if(state->flag & PATH_RAY_TERMINATE_IMMEDIATE) {
+		/* Ray is to be terminated immediately: continuation probability is zero. */
+		return 0.0f;
+	}
+	else if(state->flag & PATH_RAY_TRANSPARENT) {
+		/* Do at least one bounce without Russian roulette (RR). */
+		if(state->transparent_bounce <= 1) {
+			return 1.0f;
+		}
+#ifdef __SHADOW_TRICKS__
+		/* Exception: shadow catcher does not work correctly with RR, so skip RR while transparent_bounce <= 8. */
+		else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) {
 			return 1.0f;
+		}
+#endif
 	}
 	else {
-		/* other rays */
-		if((state->bounce >= kernel_data.integrator.max_bounce) ||
-		   (state->diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) ||
-		   (state->glossy_bounce >= kernel_data.integrator.max_glossy_bounce) ||
-#ifdef __VOLUME__
-		   (state->volume_bounce >= kernel_data.integrator.max_volume_bounce) ||
-#endif
-		   (state->transmission_bounce >= kernel_data.integrator.max_transmission_bounce))
-		{
-			return 0.0f;
+		/* Do at least one bounce without Russian roulette (RR). */
+		if(state->bounce <= 1) {
+			return 1.0f;
 		}
-		else if(state->bounce <= kernel_data.integrator.min_bounce) {
+#ifdef __SHADOW_TRICKS__
+		/* Exception: shadow catcher does not work correctly with RR, so skip RR while bounce <= 3. */
+		else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) {
 			return 1.0f;
 		}
+#endif
 	}
 
-	/* probalistic termination */
-	return average(throughput); /* todo: try using max here */
+	/* Probabilistic termination: use sqrt() to roughly match typical view
+	 * transform and do path termination a bit later on average; result is clamped to 1. */
+	return min(sqrtf(max3(fabs(throughput)) * state->branch_factor), 1.0f);
 }
 
 /* TODO(DingTo): Find more meaningful name for this */
@@ -180,5 +245,28 @@ ccl_device_inline void path_state_modify_bounce(ccl_addr_space PathState *state,
 		state->bounce -= 1;
 }
 
+ccl_device_inline bool path_state_ao_bounce(KernelGlobals *kg, ccl_addr_space PathState *state)
+{
+	if(state->bounce <= kernel_data.integrator.ao_bounces) {
+		return false;
+	}
+	/* Transmission bounces, and one glossy bounce if any occurred, do not count. */
+	int bounce = state->bounce - state->transmission_bounce - (state->glossy_bounce > 0);
+	return (bounce > kernel_data.integrator.ao_bounces);
+}
+
+ccl_device_inline void path_state_branch(ccl_addr_space PathState *state,
+                                         int branch,
+                                         int num_branches)
+{
+	if(num_branches <= 1) {
+		return;
+	}
+	/* Splitting: remap the sample index so each branch stays unique in the sequence. */
+	state->sample = state->sample*num_branches + branch;
+	state->num_samples *= num_branches;
+	state->branch_factor *= num_branches;
+}
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h
new file mode 100644
index 00000000000..71aea9e3b27
--- /dev/null
+++ b/intern/cycles/kernel/kernel_path_subsurface.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright 2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __SUBSURFACE__
+#  ifndef __KERNEL_CUDA__
+ccl_device
+#  else
+ccl_device_inline
+#  endif
+bool kernel_path_subsurface_scatter(
+        KernelGlobals *kg,
+        ShaderData *sd,
+        ShaderData *emission_sd,
+        PathRadiance *L,
+        ccl_addr_space PathState *state,
+        ccl_addr_space Ray *ray,
+        ccl_addr_space float3 *throughput,
+        ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
+{
+	float bssrdf_u, bssrdf_v;
+	path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+	/* Try to pick a BSSRDF closure; when none is picked this function returns false. */
+	const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u);
+
+	/* Do the BSSRDF scatter step if a BSSRDF closure was picked. */
+	if(sc) {
+		/* We should never have two consecutive BSSRDF bounces,
+		 * the second one should be converted to a diffuse BSDF to
+		 * avoid this.
+		 */
+		kernel_assert(!(state->flag & PATH_RAY_DIFFUSE_ANCESTOR));
+
+		uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb);
+
+		LocalIntersection ss_isect;
+		int num_hits = subsurface_scatter_multi_intersect(kg,
+		                                                  &ss_isect,
+		                                                  sd,
+		                                                  state,
+		                                                  sc,
+		                                                  &lcg_state,
+		                                                  bssrdf_u, bssrdf_v,
+		                                                  false);
+#  ifdef __VOLUME__
+		bool need_update_volume_stack =
+		        kernel_data.integrator.use_volumes &&
+		        sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;  /* '&' binds tighter than '&&' */
+#  endif  /* __VOLUME__ */
+
+		/* Compute direct lighting and try to queue an indirect ray for every hit. */
+		for(int hit = 0; hit < num_hits; hit++) {
+			/* NOTE: We reuse the existing ShaderData, we assume the path
+			 * integration loop stops when this function returns true.
+			 */
+			subsurface_scatter_multi_setup(kg,
+			                               &ss_isect,
+			                               hit,
+			                               sd,
+			                               state,
+			                               sc);
+
+			kernel_path_surface_connect_light(kg, sd, emission_sd, *throughput, state, L);
+			/* Next free slot in the indirect ray queue (index num_rays). */
+			ccl_addr_space PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays];
+			ccl_addr_space Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays];
+			ccl_addr_space float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays];
+			PathRadianceState *hit_L_state = &ss_indirect->L_state[ss_indirect->num_rays];
+			/* Seed the slot with a copy of the current path state, ray and throughput. */
+			*hit_state = *state;
+			*hit_ray = *ray;
+			*hit_tp = *throughput;
+			*hit_L_state = L->state;
+
+			hit_state->rng_offset += PRNG_BOUNCE_NUM;
+			/* The slot is only committed (num_rays++) when the bounce succeeds. */
+			if(kernel_path_surface_bounce(kg,
+			                              sd,
+			                              hit_tp,
+			                              hit_state,
+			                              hit_L_state,
+			                              hit_ray))
+			{
+#  ifdef __LAMP_MIS__
+				hit_state->ray_t = 0.0f;
+#  endif  /* __LAMP_MIS__ */
+
+#  ifdef __VOLUME__
+				if(need_update_volume_stack) {
+					Ray volume_ray = *ray;
+					/* Setup ray from previous surface point to the new one. */
+					volume_ray.D = normalize_len(hit_ray->P - volume_ray.P,
+					                             &volume_ray.t);
+
+					kernel_volume_stack_update_for_subsurface(
+					    kg,
+					    emission_sd,
+					    &volume_ray,
+					    hit_state->volume_stack);
+				}
+#  endif  /* __VOLUME__ */
+				ss_indirect->num_rays++;
+			}
+		}
+		return true;  /* A BSSRDF closure was picked and handled. */
+	}
+	return false;  /* No BSSRDF closure was picked. */
+}
+
+ccl_device_inline void kernel_path_subsurface_init_indirect(
+        ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
+{
+	ss_indirect->num_rays = 0;  /* Start with no indirect SSS rays queued. */
+}
+
+ccl_device void kernel_path_subsurface_setup_indirect(
+        KernelGlobals *kg,
+        ccl_addr_space SubsurfaceIndirectRays *ss_indirect,
+        ccl_addr_space PathState *state,
+        ccl_addr_space Ray *ray,
+        PathRadiance *L,
+        ccl_addr_space float3 *throughput)
+{
+	/* Pop the most recently queued indirect SSS ray; slot is its index. */
+	const int slot = --ss_indirect->num_rays;
+
+	path_radiance_sum_indirect(L);
+	path_radiance_reset_indirect(L);
+
+	*state = ss_indirect->state[slot];
+	*ray = ss_indirect->rays[slot];
+	L->state = ss_indirect->L_state[slot];
+	*throughput = ss_indirect->throughputs[slot];
+
+	state->rng_offset += slot * PRNG_BOUNCE_NUM;
+}
+
+#endif  /* __SUBSURFACE__ */
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index fea503d06e5..27be90d5059 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -16,16 +16,21 @@
 
 CCL_NAMESPACE_BEGIN
 
-#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__)
-
+#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) || defined(__BAKING__)
 /* branched path tracing: connect path directly to position on one or more lights and add it to L */
-ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, ShaderData *emission_sd, PathState *state, float3 throughput,
-	float num_samples_adjust, PathRadiance *L, int sample_all_lights)
+ccl_device_noinline void kernel_branched_path_surface_connect_light(
+        KernelGlobals *kg,
+        ShaderData *sd,
+        ShaderData *emission_sd,
+        ccl_addr_space PathState *state,
+        float3 throughput,
+        float num_samples_adjust,
+        PathRadiance *L,
+        int sample_all_lights)
 {
 #ifdef __EMISSION__
 	/* sample illumination from lights to find path contribution */
-	if(!(ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))
+	if(!(sd->flag & SD_BSDF_HAS_EVAL))
 		return;
 
 	Ray light_ray;
@@ -33,7 +38,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 	bool is_lamp;
 
 #  ifdef __OBJECT_MOTION__
-	light_ray.time = ccl_fetch(sd, time);
+	light_ray.time = sd->time;
 #  endif
 
 	if(sample_all_lights) {
@@ -44,15 +49,15 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 
 			int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
 			float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
-			RNG lamp_rng = cmj_hash(*rng, i);
+			uint lamp_rng_hash = cmj_hash(state->rng_hash, i);
 
 			for(int j = 0; j < num_samples; j++) {
 				float light_u, light_v;
-				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
-				float terminate = path_branched_rng_light_termination(kg, &lamp_rng, state, j, num_samples);
+				path_branched_rng_2D(kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+				float terminate = path_branched_rng_light_termination(kg, lamp_rng_hash, state, j, num_samples);
 
 				LightSample ls;
-				if(lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls)) {
+				if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) {
 					/* The sampling probability returned by lamp_light_sample assumes that all lights were sampled.
 					 * However, this code only samples lamps, so if the scene also had mesh lights, the real probability is twice as high. */
 					if(kernel_data.integrator.pdf_triangles != 0.0f)
@@ -62,9 +67,12 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 						/* trace shadow ray */
 						float3 shadow;
 
-						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+						if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 							/* accumulate */
-							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+							path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
+						}
+						else {
+							path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
 						}
 					}
 				}
@@ -77,17 +85,16 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 			float num_samples_inv = num_samples_adjust/num_samples;
 
 			for(int j = 0; j < num_samples; j++) {
-				float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT);
 				float light_u, light_v;
-				path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
-				float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples);
+				path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+				float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples);
 
 				/* only sample triangle lights */
 				if(kernel_data.integrator.num_all_lights)
-					light_t = 0.5f*light_t;
+					light_u = 0.5f*light_u;
 
 				LightSample ls;
-				if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
+				if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
 					/* Same as above, probability needs to be corrected since the sampling was forced to select a mesh light. */
 					if(kernel_data.integrator.num_all_lights)
 						ls.pdf *= 2.0f;
@@ -96,9 +103,12 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 						/* trace shadow ray */
 						float3 shadow;
 
-						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+						if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 							/* accumulate */
-							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+							path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
+						}
+						else {
+							path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
 						}
 					}
 				}
@@ -107,21 +117,23 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 	}
 	else {
 		/* sample one light at random */
-		float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 		float light_u, light_v;
-		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-		float terminate = path_state_rng_light_termination(kg, rng, state);
+		path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
+		float terminate = path_state_rng_light_termination(kg, state);
 
 		LightSample ls;
-		if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
+		if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
 			/* sample random light */
 			if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 				/* trace shadow ray */
 				float3 shadow;
 
-				if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+				if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 					/* accumulate */
-					path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
+					path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp);
+				}
+				else {
+					path_radiance_accum_total_light(L, state, throughput*num_samples_adjust, &L_light);
 				}
 			}
 		}
@@ -130,9 +142,17 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 }
 
 /* branched path tracing: bounce off or through surface to with new direction stored in ray */
-ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, const ShaderClosure *sc, int sample, int num_samples,
-	float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
+ccl_device bool kernel_branched_path_surface_bounce(
+        KernelGlobals *kg,
+        ShaderData *sd,
+        const ShaderClosure *sc,
+        int sample,
+        int num_samples,
+        ccl_addr_space float3 *throughput,
+        ccl_addr_space PathState *state,
+        PathRadianceState *L_state,
+        ccl_addr_space Ray *ray,
+        float sum_sample_weight)
 {
 	/* sample BSDF */
 	float bsdf_pdf;
@@ -140,7 +160,7 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 	float3 bsdf_omega_in;
 	differential3 bsdf_domega_in;
 	float bsdf_u, bsdf_v;
-	path_branched_rng_2D(kg, rng, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+	path_branched_rng_2D(kg, state->rng_hash, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 	int label;
 
 	label = shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval,
@@ -150,21 +170,25 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 		return false;
 
 	/* modify throughput */
-	path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
+	path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
+
+#ifdef __DENOISING_FEATURES__
+	state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples);
+#endif
 
 	/* modify path state */
 	path_state_next(kg, state, label);
 
 	/* setup ray */
-	ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
+	ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
 	ray->D = normalize(bsdf_omega_in);
 	ray->t = FLT_MAX;
 #ifdef __RAY_DIFFERENTIALS__
-	ray->dP = ccl_fetch(sd, dP);
+	ray->dP = sd->dP;
 	ray->dD = bsdf_domega_in;
 #endif
 #ifdef __OBJECT_MOTION__
-	ray->time = ccl_fetch(sd, time);
+	ray->time = sd->time;
 #endif
 
 #ifdef __VOLUME__
@@ -188,64 +212,77 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 
 #endif
 
-#ifndef __SPLIT_KERNEL__
 /* path tracing: connect path directly to position on a light and add it to L */
-ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_addr_space RNG *rng,
+ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg,
 	ShaderData *sd, ShaderData *emission_sd, float3 throughput, ccl_addr_space PathState *state,
 	PathRadiance *L)
 {
 #ifdef __EMISSION__
-	if(!(kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)))
+	if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)))
 		return;
 
+#ifdef __SHADOW_TRICKS__
+	if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+		kernel_branched_path_surface_connect_light(kg,
+		                                           sd,
+		                                           emission_sd,
+		                                           state,
+		                                           throughput,
+		                                           1.0f,
+		                                           L,
+		                                           1);
+		return;
+	}
+#endif
+
 	/* sample illumination from lights to find path contribution */
-	float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 	float light_u, light_v;
-	path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+	path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
 
 	Ray light_ray;
 	BsdfEval L_light;
 	bool is_lamp;
 
 #ifdef __OBJECT_MOTION__
-	light_ray.time = ccl_fetch(sd, time);
+	light_ray.time = sd->time;
 #endif
 
 	LightSample ls;
-	if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
-		float terminate = path_state_rng_light_termination(kg, rng, state);
+	if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
+		float terminate = path_state_rng_light_termination(kg, state);
 		if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 			/* trace shadow ray */
 			float3 shadow;
 
-			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+			if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 				/* accumulate */
-				path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+				path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
+			}
+			else {
+				path_radiance_accum_total_light(L, state, throughput, &L_light);
 			}
 		}
 	}
 #endif
 }
-#endif
 
 /* path tracing: bounce off or through surface to with new direction stored in ray */
 ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
-                                           ccl_addr_space RNG *rng,
                                            ShaderData *sd,
                                            ccl_addr_space float3 *throughput,
                                            ccl_addr_space PathState *state,
-                                           PathRadiance *L,
+                                           PathRadianceState *L_state,
                                            ccl_addr_space Ray *ray)
 {
 	/* no BSDF? we can stop here */
-	if(ccl_fetch(sd, flag) & SD_BSDF) {
+	if(sd->flag & SD_BSDF) {
 		/* sample BSDF */
 		float bsdf_pdf;
 		BsdfEval bsdf_eval;
 		float3 bsdf_omega_in;
 		differential3 bsdf_domega_in;
 		float bsdf_u, bsdf_v;
-		path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+		path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 		int label;
 
 		label = shader_bsdf_sample(kg, sd, bsdf_u, bsdf_v, &bsdf_eval,
@@ -255,7 +292,7 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
 			return false;
 
 		/* modify throughput */
-		path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
+		path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
 
 		/* set labels */
 		if(!(label & LABEL_TRANSPARENT)) {
@@ -270,16 +307,16 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
 		path_state_next(kg, state, label);
 
 		/* setup ray */
-		ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
+		ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
 		ray->D = normalize(bsdf_omega_in);
 
 		if(state->bounce == 0)
-			ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
+			ray->t -= sd->ray_length; /* clipping works through transparent */
 		else
 			ray->t = FLT_MAX;
 
 #ifdef __RAY_DIFFERENTIALS__
-		ray->dP = ccl_fetch(sd, dP);
+		ray->dP = sd->dP;
 		ray->dD = bsdf_domega_in;
 #endif
 
@@ -291,21 +328,20 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
 		return true;
 	}
 #ifdef __VOLUME__
-	else if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) {
-		/* no surface shader but have a volume shader? act transparent */
-
-		/* update path state, count as transparent */
-		path_state_next(kg, state, LABEL_TRANSPARENT);
+	else if(sd->flag & SD_HAS_ONLY_VOLUME) {
+		if(!path_state_volume_next(kg, state)) {
+			return false;
+		}
 
 		if(state->bounce == 0)
-			ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
+			ray->t -= sd->ray_length; /* clipping works through transparent */
 		else
 			ray->t = FLT_MAX;
 
 		/* setup ray position, direction stays unchanged */
-		ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng));
+		ray->P = ray_offset(sd->P, -sd->Ng);
 #ifdef __RAY_DIFFERENTIALS__
-		ray->dP = ccl_fetch(sd, dP);
+		ray->dP = sd->dP;
 #endif
 
 		/* enter/exit volume */
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index 3d3b7385d8b..6275d0d6562 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -20,11 +20,10 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline void kernel_path_volume_connect_light(
         KernelGlobals *kg,
-        RNG *rng,
         ShaderData *sd,
         ShaderData *emission_sd,
         float3 throughput,
-        PathState *state,
+        ccl_addr_space PathState *state,
         PathRadiance *L)
 {
 #ifdef __EMISSION__
@@ -32,9 +31,8 @@ ccl_device_inline void kernel_path_volume_connect_light(
 		return;
 
 	/* sample illumination from lights to find path contribution */
-	float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 	float light_u, light_v;
-	path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+	path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
 
 	Ray light_ray;
 	BsdfEval L_light;
@@ -42,24 +40,22 @@ ccl_device_inline void kernel_path_volume_connect_light(
 	bool is_lamp;
 
 	/* connect to light from given point where shader has been evaluated */
-#  ifdef __OBJECT_MOTION__
 	light_ray.time = sd->time;
-#  endif
 
-	if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls))
+	if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls))
 	{
-		float terminate = path_state_rng_light_termination(kg, rng, state);
+		float terminate = path_state_rng_light_termination(kg, state);
 		if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 			/* trace shadow ray */
 			float3 shadow;
 
-			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+			if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 				/* accumulate */
-				path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+				path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
 			}
 		}
 	}
-#endif
+#endif /* __EMISSION__ */
 }
 
 #ifdef __KERNEL_GPU__
@@ -67,8 +63,13 @@ ccl_device_noinline
 #else
 ccl_device
 #endif
-bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
+bool kernel_path_volume_bounce(
+    KernelGlobals *kg,
+    ShaderData *sd,
+    ccl_addr_space float3 *throughput,
+    ccl_addr_space PathState *state,
+    PathRadianceState *L_state,
+    ccl_addr_space Ray *ray)
 {
 	/* sample phase function */
 	float phase_pdf;
@@ -76,7 +77,7 @@ bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
 	float3 phase_omega_in;
 	differential3 phase_domega_in;
 	float phase_u, phase_v;
-	path_state_rng_2D(kg, rng, state, PRNG_PHASE_U, &phase_u, &phase_v);
+	path_state_rng_2D(kg, state, PRNG_BSDF_U, &phase_u, &phase_v);
 	int label;
 
 	label = shader_volume_phase_sample(kg, sd, phase_u, phase_v, &phase_eval,
@@ -86,7 +87,7 @@ bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
 		return false;
 	
 	/* modify throughput */
-	path_radiance_bsdf_bounce(L, throughput, &phase_eval, phase_pdf, state->bounce, label);
+	path_radiance_bsdf_bounce(kg, L_state, throughput, &phase_eval, phase_pdf, state->bounce, label);
 
 	/* set labels */
 	state->ray_pdf = phase_pdf;
@@ -98,6 +99,23 @@ bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
 	/* update path state */
 	path_state_next(kg, state, label);
 
+	/* Russian roulette termination of volume ray scattering. */
+	float probability = path_state_continuation_probability(kg, state, *throughput);
+
+	if(probability == 0.0f) {
+		return false;
+	}
+	else if(probability != 1.0f) {
+		/* Use dimension from the previous bounce, has not been used yet. */
+		float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE - PRNG_BOUNCE_NUM);
+
+		if(terminate >= probability) {
+			return false;
+		}
+
+		*throughput /= probability;
+	}
+
 	/* setup ray */
 	ray->P = sd->P;
 	ray->D = phase_omega_in;
@@ -111,9 +129,17 @@ bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
 	return true;
 }
 
-ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, ShaderData *emission_sd, float3 throughput, PathState *state, PathRadiance *L,
-	bool sample_all_lights, Ray *ray, const VolumeSegment *segment)
+#ifndef __SPLIT_KERNEL__
+ccl_device void kernel_branched_path_volume_connect_light(
+        KernelGlobals *kg,
+        ShaderData *sd,
+        ShaderData *emission_sd,
+        float3 throughput,
+        ccl_addr_space PathState *state,
+        PathRadiance *L,
+        bool sample_all_lights,
+        Ray *ray,
+        const VolumeSegment *segment)
 {
 #ifdef __EMISSION__
 	if(!kernel_data.integrator.use_direct_light)
@@ -123,9 +149,7 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 	BsdfEval L_light;
 	bool is_lamp;
 
-#  ifdef __OBJECT_MOTION__
 	light_ray.time = sd->time;
-#  endif
 
 	if(sample_all_lights) {
 		/* lamp sampling */
@@ -135,12 +159,12 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 
 			int num_samples = light_select_num_samples(kg, i);
 			float num_samples_inv = 1.0f/(num_samples*kernel_data.integrator.num_all_lights);
-			RNG lamp_rng = cmj_hash(*rng, i);
+			uint lamp_rng_hash = cmj_hash(state->rng_hash, i);
 
 			for(int j = 0; j < num_samples; j++) {
 				/* sample random position on given light */
 				float light_u, light_v;
-				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+				path_branched_rng_2D(kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
 
 				LightSample ls;
 				lamp_light_sample(kg, i, light_u, light_v, ray->P, &ls);
@@ -148,28 +172,26 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 				float3 tp = throughput;
 
 				/* sample position on volume segment */
-				float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
-				float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
+				float rphase = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL);
+				float rscatter = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE);
 
 				VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
 					state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
 
-				(void)result;
-				kernel_assert(result == VOLUME_PATH_SCATTERED);
-
 				/* todo: split up light_sample so we don't have to call it again with new position */
-				if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) {
+				if(result == VOLUME_PATH_SCATTERED &&
+				   lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) {
 					if(kernel_data.integrator.pdf_triangles != 0.0f)
 						ls.pdf *= 2.0f;
 
-					float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples);
+					float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples);
 					if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 						/* trace shadow ray */
 						float3 shadow;
 
-						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+						if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 							/* accumulate */
-							path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+							path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
 						}
 					}
 				}
@@ -183,42 +205,39 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 
 			for(int j = 0; j < num_samples; j++) {
 				/* sample random position on random triangle */
-				float light_t = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_LIGHT);
 				float light_u, light_v;
-				path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+				path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
 
 				/* only sample triangle lights */
 				if(kernel_data.integrator.num_all_lights)
-					light_t = 0.5f*light_t;
+					light_u = 0.5f*light_u;
 
 				LightSample ls;
-				light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
+				light_sample(kg, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
 
 				float3 tp = throughput;
 
 				/* sample position on volume segment */
-				float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
-				float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
+				float rphase = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL);
+				float rscatter = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE);
 
 				VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
 					state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
 					
-				(void)result;
-				kernel_assert(result == VOLUME_PATH_SCATTERED);
-
 				/* todo: split up light_sample so we don't have to call it again with new position */
-				if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
+				if(result == VOLUME_PATH_SCATTERED &&
+				   light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
 					if(kernel_data.integrator.num_all_lights)
 						ls.pdf *= 2.0f;
 
-					float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples);
+					float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples);
 					if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 						/* trace shadow ray */
 						float3 shadow;
 
-						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+						if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 							/* accumulate */
-							path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+							path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
 						}
 					}
 				}
@@ -227,44 +246,42 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 	}
 	else {
 		/* sample random position on random light */
-		float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 		float light_u, light_v;
-		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+		path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
 
 		LightSample ls;
-		light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
+		light_sample(kg, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
 
 		float3 tp = throughput;
 
 		/* sample position on volume segment */
-		float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
-		float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+		float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
+		float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
 
 		VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
 			state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
 			
-		(void)result;
-		kernel_assert(result == VOLUME_PATH_SCATTERED);
-
 		/* todo: split up light_sample so we don't have to call it again with new position */
-		if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
+		if(result == VOLUME_PATH_SCATTERED &&
+		   light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
 			/* sample random light */
-			float terminate = path_state_rng_light_termination(kg, rng, state);
+			float terminate = path_state_rng_light_termination(kg, state);
 			if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 				/* trace shadow ray */
 				float3 shadow;
 
-				if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+				if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 					/* accumulate */
-					path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+					path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp);
 				}
 			}
 		}
 	}
-#endif
+#endif /* __EMISSION__ */
 }
+#endif /* __SPLIT_KERNEL__ */
 
-#endif
+#endif /* __VOLUME_SCATTER__ */
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h
index 9a2b0884a7e..4540d733af4 100644
--- a/intern/cycles/kernel/kernel_projection.h
+++ b/intern/cycles/kernel/kernel_projection.h
@@ -57,6 +57,9 @@ ccl_device float3 spherical_to_direction(float theta, float phi)
 
 ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range)
 {
+	if(is_zero(dir))
+		return make_float2(0.0f, 0.0f);
+
 	float u = (atan2f(dir.y, dir.x) - range.y) / range.x;
 	float v = (acosf(dir.z / len(dir)) - range.w) / range.z;
 
@@ -192,49 +195,49 @@ ccl_device float2 direction_to_mirrorball(float3 dir)
 	return make_float2(u, v);
 }
 
-ccl_device_inline float3 panorama_to_direction(KernelGlobals *kg, float u, float v)
+ccl_device_inline float3 panorama_to_direction(ccl_constant KernelCamera *cam, float u, float v)
 {
-	switch(kernel_data.cam.panorama_type) {
+	switch(cam->panorama_type) {
 		case PANORAMA_EQUIRECTANGULAR:
-			return equirectangular_range_to_direction(u, v, kernel_data.cam.equirectangular_range);
+			return equirectangular_range_to_direction(u, v, cam->equirectangular_range);
 		case PANORAMA_MIRRORBALL:
 			return mirrorball_to_direction(u, v);
 		case PANORAMA_FISHEYE_EQUIDISTANT:
-			return fisheye_to_direction(u, v, kernel_data.cam.fisheye_fov);
+			return fisheye_to_direction(u, v, cam->fisheye_fov);
 		case PANORAMA_FISHEYE_EQUISOLID:
 		default:
-			return fisheye_equisolid_to_direction(u, v, kernel_data.cam.fisheye_lens,
-				kernel_data.cam.fisheye_fov, kernel_data.cam.sensorwidth, kernel_data.cam.sensorheight);
+			return fisheye_equisolid_to_direction(u, v, cam->fisheye_lens,
+				cam->fisheye_fov, cam->sensorwidth, cam->sensorheight);
 	}
 }
 
-ccl_device_inline float2 direction_to_panorama(KernelGlobals *kg, float3 dir)
+ccl_device_inline float2 direction_to_panorama(ccl_constant KernelCamera *cam, float3 dir)
 {
-	switch(kernel_data.cam.panorama_type) {
+	switch(cam->panorama_type) {
 		case PANORAMA_EQUIRECTANGULAR:
-			return direction_to_equirectangular_range(dir, kernel_data.cam.equirectangular_range);
+			return direction_to_equirectangular_range(dir, cam->equirectangular_range);
 		case PANORAMA_MIRRORBALL:
 			return direction_to_mirrorball(dir);
 		case PANORAMA_FISHEYE_EQUIDISTANT:
-			return direction_to_fisheye(dir, kernel_data.cam.fisheye_fov);
+			return direction_to_fisheye(dir, cam->fisheye_fov);
 		case PANORAMA_FISHEYE_EQUISOLID:
 		default:
-			return direction_to_fisheye_equisolid(dir, kernel_data.cam.fisheye_lens,
-				kernel_data.cam.sensorwidth, kernel_data.cam.sensorheight);
+			return direction_to_fisheye_equisolid(dir, cam->fisheye_lens,
+				cam->sensorwidth, cam->sensorheight);
 	}
 }
 
-ccl_device_inline void spherical_stereo_transform(KernelGlobals *kg, float3 *P, float3 *D)
+ccl_device_inline void spherical_stereo_transform(ccl_constant KernelCamera *cam, float3 *P, float3 *D)
 {
-	float interocular_offset = kernel_data.cam.interocular_offset;
+	float interocular_offset = cam->interocular_offset;
 
 	/* Interocular offset of zero means either non stereo, or stereo without
 	 * spherical stereo. */
 	kernel_assert(interocular_offset != 0.0f);
 
-	if(kernel_data.cam.pole_merge_angle_to > 0.0f) {
-		const float pole_merge_angle_from = kernel_data.cam.pole_merge_angle_from,
-		            pole_merge_angle_to = kernel_data.cam.pole_merge_angle_to;
+	if(cam->pole_merge_angle_to > 0.0f) {
+		const float pole_merge_angle_from = cam->pole_merge_angle_from,
+		            pole_merge_angle_to = cam->pole_merge_angle_to;
 		float altitude = fabsf(safe_asinf((*D).z));
 		if(altitude > pole_merge_angle_to) {
 			interocular_offset = 0.0f;
@@ -254,7 +257,7 @@ ccl_device_inline void spherical_stereo_transform(KernelGlobals *kg, float3 *P,
 
 	/* Convergence distance is FLT_MAX in the case of parallel convergence mode,
 	 * no need to modify direction in this case either. */
-	const float convergence_distance = kernel_data.cam.convergence_distance;
+	const float convergence_distance = cam->convergence_distance;
 
 	if(convergence_distance != FLT_MAX)
 	{
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
index cf5614b8a86..e32d4bbbc1b 100644
--- a/intern/cycles/kernel/kernel_queues.h
+++ b/intern/cycles/kernel/kernel_queues.h
@@ -17,12 +17,15 @@
 #ifndef __KERNEL_QUEUE_H__
 #define __KERNEL_QUEUE_H__
 
+CCL_NAMESPACE_BEGIN
+
 /*
  * Queue utility functions for split kernel
  */
-
+#ifdef __KERNEL_OPENCL__
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#endif
 
 /*
  * Enqueue ray index into the queue
@@ -35,7 +38,8 @@ ccl_device void enqueue_ray_index(
         ccl_global int *queue_index) /* Array of size num_queues; Used for atomic increment. */
 {
 	/* This thread's queue index. */
-	int my_queue_index = atomic_inc(&queue_index[queue_number]) + (queue_number * queue_size);
+	int my_queue_index = atomic_fetch_and_inc_uint32((ccl_global uint*)&queue_index[queue_number])
+	                   + (queue_number * queue_size);
 	queues[my_queue_index] = ray_index;
 }
 
@@ -47,6 +51,7 @@ ccl_device void enqueue_ray_index(
  * is no more ray to allocate to other threads.
  */
 ccl_device int get_ray_index(
+        KernelGlobals *kg,
         int thread_index,       /* Global thread index. */
         int queue_number,       /* Queue to operate on. */
         ccl_global int *queues, /* Buffer of all queues. */
@@ -68,24 +73,25 @@ ccl_device void enqueue_ray_index_local(
         int queue_number,                            /* Queue in which to enqueue ray index. */
         char enqueue_flag,                           /* True for threads whose ray index has to be enqueued. */
         int queuesize,                               /* queue size. */
-        ccl_local unsigned int *local_queue_atomics,   /* To to local queue atomics. */
+        ccl_local_param unsigned int *local_queue_atomics,   /* To do local queue atomics. */
         ccl_global int *Queue_data,                  /* Queues. */
         ccl_global int *Queue_index)                 /* To do global queue atomics. */
 {
-	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+	int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
 
 	/* Get local queue id .*/
 	unsigned int lqidx;
 	if(enqueue_flag) {
-		lqidx = atomic_inc(local_queue_atomics);
+		lqidx = atomic_fetch_and_inc_uint32(local_queue_atomics);
 	}
-	barrier(CLK_LOCAL_MEM_FENCE);
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
 	/* Get global queue offset. */
 	if(lidx == 0) {
-		*local_queue_atomics = atomic_add(&Queue_index[queue_number], *local_queue_atomics);
+		*local_queue_atomics = atomic_fetch_and_add_uint32((ccl_global uint*)&Queue_index[queue_number],
+		                                                   *local_queue_atomics);
 	}
-	barrier(CLK_LOCAL_MEM_FENCE);
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
 	/* Get global queue index and enqueue ray. */
 	if(enqueue_flag) {
@@ -96,19 +102,19 @@ ccl_device void enqueue_ray_index_local(
 
 ccl_device unsigned int get_local_queue_index(
         int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */
-        ccl_local unsigned int *local_queue_atomics)
+        ccl_local_param unsigned int *local_queue_atomics)
 {
-	int my_lqidx = atomic_inc(&local_queue_atomics[queue_number]);
+	int my_lqidx = atomic_fetch_and_inc_uint32(&local_queue_atomics[queue_number]);
 	return my_lqidx;
 }
 
 ccl_device unsigned int get_global_per_queue_offset(
         int queue_number,
-        ccl_local unsigned int *local_queue_atomics,
+        ccl_local_param unsigned int *local_queue_atomics,
         ccl_global int* global_queue_atomics)
 {
-	unsigned int queue_offset = atomic_add(&global_queue_atomics[queue_number],
-	                                       local_queue_atomics[queue_number]);
+	unsigned int queue_offset = atomic_fetch_and_add_uint32((ccl_global uint*)&global_queue_atomics[queue_number],
+	                                                        local_queue_atomics[queue_number]);
 	return queue_offset;
 }
 
@@ -116,10 +122,27 @@ ccl_device unsigned int get_global_queue_index(
     int queue_number,
     int queuesize,
     unsigned int lqidx,
-    ccl_local unsigned int * global_per_queue_offset)
+    ccl_local_param unsigned int * global_per_queue_offset)
 {
 	int my_gqidx = queuesize * queue_number + lqidx + global_per_queue_offset[queue_number];
 	return my_gqidx;
 }
 
+ccl_device int dequeue_ray_index(
+        int queue_number,            /* Queue to take a ray index from. */
+        ccl_global int *queues,      /* Buffer of all queues. */
+        int queue_size,              /* Number of entries reserved per queue in the buffer. */
+        ccl_global int *queue_index) /* Per-queue counters; the entry for queue_number is atomically decremented. */
+{
+	int index = atomic_fetch_and_dec_uint32((ccl_global uint*)&queue_index[queue_number])-1; /* Slot to pop, assuming the fetch yields the pre-decrement count (TODO confirm). */
+
+	if(index < 0) { /* Negative index is reported as an empty slot. NOTE(review): the counter has already been decremented and is not restored here. */
+		return QUEUE_EMPTY_SLOT;
+	}
+
+	return queues[index + queue_number * queue_size]; /* Queue queue_number starts at offset queue_number * queue_size. */
+}
+
+CCL_NAMESPACE_END
+
 #endif // __KERNEL_QUEUE_H__
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index e773753396f..93152e9ff1c 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -14,107 +14,68 @@
  * limitations under the License.
  */
 
-#include "kernel_jitter.h"
+#include "kernel/kernel_jitter.h"
+#include "util/util_hash.h"
 
 CCL_NAMESPACE_BEGIN
 
-#ifdef __SOBOL__
-
-/* skip initial numbers that are not as well distributed, especially the
- * first sequence is just 0 everywhere, which can be problematic for e.g.
- * path termination */
-#define SOBOL_SKIP 64
+/* Pseudo random numbers, uncomment this for debugging correlations. Only run
+ * this single threaded on a CPU for repeatable results. */
+//#define __DEBUG_CORRELATION__
 
-/* High Dimensional Sobol */
-
-/* van der corput radical inverse */
-ccl_device uint van_der_corput(uint bits)
-{
-	bits = (bits << 16) | (bits >> 16);
-	bits = ((bits & 0x00ff00ff) << 8) | ((bits & 0xff00ff00) >> 8);
-	bits = ((bits & 0x0f0f0f0f) << 4) | ((bits & 0xf0f0f0f0) >> 4);
-	bits = ((bits & 0x33333333) << 2) | ((bits & 0xcccccccc) >> 2);
-	bits = ((bits & 0x55555555) << 1) | ((bits & 0xaaaaaaaa) >> 1);
-	return bits;
-}
-
-/* sobol radical inverse */
-ccl_device uint sobol(uint i)
-{
-	uint r = 0;
-
-	for(uint v = 1U << 31; i; i >>= 1, v ^= v >> 1)
-		if(i & 1)
-			r ^= v;
-
-	return r;
-}
 
-/* inverse of sobol radical inverse */
-ccl_device uint sobol_inverse(uint i)
-{
-	const uint msb = 1U << 31;
-	uint r = 0;
+/* High Dimensional Sobol.
+ *
+ * Multidimensional sobol with generator matrices. Dimension 0 and 1 are equal
+ * to classic Van der Corput and Sobol sequences. */
 
-	for(uint v = 1; i; i <<= 1, v ^= v << 1)
-		if(i & msb)
-			r ^= v;
+#ifdef __SOBOL__
 
-	return r;
-}
+/* Skip initial numbers that for some dimensions have clear patterns that
+ * don't cover the entire sample space. Ideally we would have a better
+ * progressive pattern that doesn't suffer from this problem, because even
+ * with this offset some dimensions are quite poor.
+ */
+#define SOBOL_SKIP 64
 
-/* multidimensional sobol with generator matrices
- * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively */
 ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
 {
 	uint result = 0;
-	uint i = index;
-
-	for(uint j = 0; i; i >>= 1, j++)
-		if(i & 1)
+	uint i = index + SOBOL_SKIP;
+	for(uint j = 0; i; i >>= 1, j++) {
+		if(i & 1) {
 			result ^= kernel_tex_fetch(__sobol_directions, 32*dimension + j);
-	
+		}
+	}
 	return result;
 }
 
-/* lookup index and x/y coordinate, assumes m is a power of two */
-ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, const uint ey, uint *x, uint *y)
-{
-	/* shift is constant per frame */
-	const uint shift = frame << (m << 1);
-	const uint sobol_shift = sobol(shift);
-	/* van der Corput is its own inverse */
-	const uint lower = van_der_corput(ex << (32 - m));
-	/* need to compensate for ey difference and shift */
-	const uint sobol_lower = sobol(lower);
-	const uint mask = ~-(1 << m) << (32 - m); /* only m upper bits */
-	const uint delta = ((ey << (32 - m)) ^ sobol_lower ^ sobol_shift) & mask;
-	/* only use m upper bits for the index (m is a power of two) */
-	const uint sobol_result = delta | (delta >> m);
-	const uint upper = sobol_inverse(sobol_result);
-	const uint index = shift | upper | lower;
-	*x = van_der_corput(index);
-	*y = sobol_shift ^ sobol_result ^ sobol_lower;
-	return index;
-}
+#endif /* __SOBOL__ */
 
-ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
+
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg,
+                                         uint rng_hash,
+                                         int sample, int num_samples,
+                                         int dimension)
 {
+#ifdef __DEBUG_CORRELATION__
+	return (float)drand48();
+#endif
+
 #ifdef __CMJ__
-	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
-		/* correlated multi-jittered */
-		int p = *rng + dimension;
+#  ifdef __SOBOL__
+	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
+#  endif
+	{
+		/* Correlated multi-jitter. */
+		int p = rng_hash + dimension;
 		return cmj_sample_1D(sample, num_samples, p);
 	}
 #endif
 
-#ifdef __SOBOL_FULL_SCREEN__
-	uint result = sobol_dimension(kg, *rng, dimension);
-	float r = (float)result * (1.0f/(float)0xFFFFFFFF);
-	return r;
-#else
-	/* compute sobol sequence value using direction vectors */
-	uint result = sobol_dimension(kg, sample + SOBOL_SKIP, dimension);
+#ifdef __SOBOL__
+	/* Sobol sequence value using direction vectors. */
+	uint result = sobol_dimension(kg, sample, dimension);
 	float r = (float)result * (1.0f/(float)0xFFFFFFFF);
 
 	/* Cranly-Patterson rotation using rng seed */
@@ -123,111 +84,67 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *
 	/* Hash rng with dimension to solve correlation issues.
 	 * See T38710, T50116.
 	 */
-	RNG tmp_rng = cmj_hash_simple(dimension, *rng);
+	uint tmp_rng = cmj_hash_simple(dimension, rng_hash);
 	shift = tmp_rng * (1.0f/(float)0xFFFFFFFF);
 
 	return r + shift - floorf(r + shift);
 #endif
 }
 
-ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_forceinline void path_rng_2D(KernelGlobals *kg,
+                                        uint rng_hash,
+                                        int sample, int num_samples,
+                                        int dimension,
+                                        float *fx, float *fy)
 {
+#ifdef __DEBUG_CORRELATION__
+	*fx = (float)drand48();
+	*fy = (float)drand48();
+	return;
+#endif
+
 #ifdef __CMJ__
-	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
-		/* correlated multi-jittered */
-		int p = *rng + dimension;
+#  ifdef __SOBOL__
+	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
+#  endif
+	{
+		/* Correlated multi-jitter. */
+		int p = rng_hash + dimension;
 		cmj_sample_2D(sample, num_samples, p, fx, fy);
+		return;
 	}
-	else
 #endif
-	{
-		/* sobol */
-		*fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
-		*fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
-	}
-}
-
-ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, ccl_addr_space RNG *rng, int x, int y, float *fx, float *fy)
-{
-#ifdef __SOBOL_FULL_SCREEN__
-	uint px, py;
-	uint bits = 16; /* limits us to 65536x65536 and 65536 samples */
-	uint size = 1 << bits;
-	uint frame = sample;
-
-	*rng = sobol_lookup(bits, frame, x, y, &px, &py);
-
-	*rng ^= kernel_data.integrator.seed;
-
-	if(sample == 0) {
-		*fx = 0.5f;
-		*fy = 0.5f;
-	}
-	else {
-		*fx = size * (float)px * (1.0f/(float)0xFFFFFFFF) - x;
-		*fy = size * (float)py * (1.0f/(float)0xFFFFFFFF) - y;
-	}
-#else
-	*rng = *rng_state;
 
-	*rng ^= kernel_data.integrator.seed;
-
-	if(sample == 0) {
-		*fx = 0.5f;
-		*fy = 0.5f;
-	}
-	else {
-		path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy);
-	}
+#ifdef __SOBOL__
+	/* Sobol. */
+	*fx = path_rng_1D(kg, rng_hash, sample, num_samples, dimension);
+	*fy = path_rng_1D(kg, rng_hash, sample, num_samples, dimension + 1);
 #endif
 }
 
-ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng)
-{
-	/* nothing to do */
-}
-
-#else
-
-/* Linear Congruential Generator */
-
-ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
-{
-	/* implicit mod 2^32 */
-	rng = (1103515245*(rng) + 12345);
-	return (float)rng * (1.0f/(float)0xFFFFFFFF);
-}
-
-ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension, float *fx, float *fy)
-{
-	*fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
-	*fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
-}
-
-ccl_device void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
+ccl_device_inline void path_rng_init(KernelGlobals *kg,
+                                     int sample, int num_samples,
+                                     uint *rng_hash,
+                                     int x, int y,
+                                     float *fx, float *fy)
 {
 	/* load state */
-	*rng = *rng_state;
+	*rng_hash = hash_int_2d(x, y);
+	*rng_hash ^= kernel_data.integrator.seed;
 
-	*rng ^= kernel_data.integrator.seed;
+#ifdef __DEBUG_CORRELATION__
+	srand48(*rng_hash + sample);
+#endif
 
 	if(sample == 0) {
 		*fx = 0.5f;
 		*fy = 0.5f;
 	}
 	else {
-		path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy);
+		path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_FILTER_U, fx, fy);
 	}
 }
 
-ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng)
-{
-	/* store state for next sample */
-	*rng_state = rng;
-}
-
-#endif
-
 /* Linear Congruential Generator */
 
 ccl_device uint lcg_step_uint(uint *rng)
@@ -257,90 +174,123 @@ ccl_device uint lcg_init(uint seed)
  * dimension to avoid using the same sequence twice.
  *
  * For branches in the path we must be careful not to reuse the same number
- * in a sequence and offset accordingly. */
-
-ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
-{
-	return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension);
-}
+ * in a sequence and offset accordingly.
+ */
 
-ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D(KernelGlobals *kg,
+                                          const ccl_addr_space PathState *state,
+                                          int dimension)
 {
-	/* the rng_offset is not increased for transparent bounces. if we do then
-	 * fully transparent objects can become subtly visible by the different
-	 * sampling patterns used where the transparent object is.
-	 *
-	 * however for some random numbers that will determine if we next bounce
-	 * is transparent we do need to increase the offset to avoid always making
-	 * the same decision */
-	int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
-	return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension);
+	return path_rng_1D(kg,
+	                   state->rng_hash,
+	                   state->sample, state->num_samples,
+	                   state->rng_offset + dimension);
 }
 
-ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy)
+ccl_device_inline void path_state_rng_2D(KernelGlobals *kg,
+                                         const ccl_addr_space PathState *state,
+                                         int dimension,
+                                         float *fx, float *fy)
 {
-	path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy);
+	path_rng_2D(kg,
+	            state->rng_hash,
+	            state->sample, state->num_samples,
+	            state->rng_offset + dimension,
+	            fx, fy);
 }
 
-ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_state_rng_1D_hash(KernelGlobals *kg,
+                                          const ccl_addr_space PathState *state,
+                                          uint hash)
 {
-	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension);
+	/* Use a hash instead of dimension, this is not great but avoids adding
+	 * more dimensions to each bounce which reduces quality of dimensions we
+	 * are already using. */
+	return path_rng_1D(kg,
+	                   cmj_hash_simple(state->rng_hash, hash),
+	                   state->sample, state->num_samples,
+	                   state->rng_offset);
 }
 
-ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D(
+        KernelGlobals *kg,
+        uint rng_hash,
+        const ccl_addr_space PathState *state,
+        int branch,
+        int num_branches,
+        int dimension)
 {
-	int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
-	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension);
+	return path_rng_1D(kg,
+	                   rng_hash,
+	                   state->sample * num_branches + branch,
+	                   state->num_samples * num_branches,
+	                   state->rng_offset + dimension);
 }
 
-ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
+ccl_device_inline void path_branched_rng_2D(
+        KernelGlobals *kg,
+        uint rng_hash,
+        const ccl_addr_space PathState *state,
+        int branch,
+        int num_branches,
+        int dimension,
+        float *fx, float *fy)
 {
-	path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy);
+	path_rng_2D(kg,
+	            rng_hash,
+	            state->sample * num_branches + branch,
+	            state->num_samples * num_branches,
+	            state->rng_offset + dimension,
+	            fx, fy);
 }
 
-/* Utitility functions to get light termination value, since it might not be needed in many cases. */
-ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state)
+/* Utility functions to get light termination value,
+ * since it might not be needed in many cases.
+ */
+ccl_device_inline float path_state_rng_light_termination(
+        KernelGlobals *kg,
+        const ccl_addr_space PathState *state)
 {
 	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
-		return path_state_rng_1D_for_decision(kg, rng, state, PRNG_LIGHT_TERMINATE);
+		return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE);
 	}
 	return 0.0f;
 }
 
-ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches)
+ccl_device_inline float path_branched_rng_light_termination(
+        KernelGlobals *kg,
+        uint rng_hash,
+        const ccl_addr_space PathState *state,
+        int branch,
+        int num_branches)
 {
 	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
-		return path_branched_rng_1D_for_decision(kg, rng, state, branch, num_branches, PRNG_LIGHT_TERMINATE);
+		return path_branched_rng_1D(kg,
+		                            rng_hash,
+		                            state,
+		                            branch,
+		                            num_branches,
+		                            PRNG_LIGHT_TERMINATE);
 	}
 	return 0.0f;
 }
 
-ccl_device_inline void path_state_branch(PathState *state, int branch, int num_branches)
-{
-	/* path is splitting into a branch, adjust so that each branch
-	 * still gets a unique sample from the same sequence */
-	state->rng_offset += PRNG_BOUNCE_NUM;
-	state->sample = state->sample*num_branches + branch;
-	state->num_samples = state->num_samples*num_branches;
-}
-
-ccl_device_inline uint lcg_state_init(RNG *rng, const PathState *state, uint scramble)
+ccl_device_inline uint lcg_state_init(PathState *state,
+                                      uint scramble)
 {
-	return lcg_init(*rng + state->rng_offset + state->sample*scramble);
+	return lcg_init(state->rng_hash + state->rng_offset + state->sample*scramble);
 }
 
-/* TODO(sergey): For until we can use generic address space from OpenCL 2.0. */
-
-ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space RNG *rng,
-                                                const ccl_addr_space PathState *state,
+ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space PathState *state,
                                                 uint scramble)
 {
-	return lcg_init(*rng + state->rng_offset + state->sample*scramble);
+	return lcg_init(state->rng_hash + state->rng_offset + state->sample*scramble);
 }
 
+
 ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng)
 {
-	/* implicit mod 2^32 */
+	/* Implicit mod 2^32 */
 	*rng = (1103515245*(*rng) + 12345);
 	return (float)*rng * (1.0f/(float)0xFFFFFFFF);
 }
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index d0826e5e879..937a50cba8b 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -24,12 +24,12 @@
  *
  */
 
-#include "closure/alloc.h"
-#include "closure/bsdf_util.h"
-#include "closure/bsdf.h"
-#include "closure/emissive.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_util.h"
+#include "kernel/closure/bsdf.h"
+#include "kernel/closure/emissive.h"
 
-#include "svm/svm.h"
+#include "kernel/svm/svm.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -38,13 +38,13 @@ CCL_NAMESPACE_BEGIN
 #ifdef __OBJECT_MOTION__
 ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
 {
-	if(ccl_fetch(sd, object_flag) & SD_OBJECT_MOTION) {
-		ccl_fetch(sd, ob_tfm) = object_fetch_transform_motion(kg, ccl_fetch(sd, object), time);
-		ccl_fetch(sd, ob_itfm) = transform_quick_inverse(ccl_fetch(sd, ob_tfm));
+	if(sd->object_flag & SD_OBJECT_MOTION) {
+		sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time);
+		sd->ob_itfm = transform_quick_inverse(sd->ob_tfm);
 	}
 	else {
-		ccl_fetch(sd, ob_tfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
-		ccl_fetch(sd, ob_itfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+		sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+		sd->ob_itfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
 	}
 }
 #endif
@@ -55,55 +55,56 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
                                                const Ray *ray)
 {
 #ifdef __INSTANCING__
-	ccl_fetch(sd, object) = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
+	sd->object = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
 #endif
+	sd->lamp = LAMP_NONE;
 
-	ccl_fetch(sd, type) = isect->type;
-	ccl_fetch(sd, flag) = 0;
-	ccl_fetch(sd, object_flag) = kernel_tex_fetch(__object_flag,
-	                                              ccl_fetch(sd, object));
+	sd->type = isect->type;
+	sd->flag = 0;
+	sd->object_flag = kernel_tex_fetch(__object_flag,
+	                                              sd->object);
 
 	/* matrices and time */
 #ifdef __OBJECT_MOTION__
 	shader_setup_object_transforms(kg, sd, ray->time);
-	ccl_fetch(sd, time) = ray->time;
 #endif
+	sd->time = ray->time;
 
-	ccl_fetch(sd, prim) = kernel_tex_fetch(__prim_index, isect->prim);
-	ccl_fetch(sd, ray_length) = isect->t;
+	sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
+	sd->ray_length = isect->t;
 
 #ifdef __UV__
-	ccl_fetch(sd, u) = isect->u;
-	ccl_fetch(sd, v) = isect->v;
+	sd->u = isect->u;
+	sd->v = isect->v;
 #endif
 
 #ifdef __HAIR__
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
 		/* curve */
-		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
 
-		ccl_fetch(sd, shader) = __float_as_int(curvedata.z);
-		ccl_fetch(sd, P) = bvh_curve_refine(kg, sd, isect, ray);
+		sd->shader = __float_as_int(curvedata.z);
+		sd->P = curve_refine(kg, sd, isect, ray);
 	}
 	else
 #endif
-	if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
+	if(sd->type & PRIMITIVE_TRIANGLE) {
 		/* static triangle */
 		float3 Ng = triangle_normal(kg, sd);
-		ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
+		sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
 
 		/* vectors */
-		ccl_fetch(sd, P) = triangle_refine(kg, sd, isect, ray);
-		ccl_fetch(sd, Ng) = Ng;
-		ccl_fetch(sd, N) = Ng;
-		
+		sd->P = triangle_refine(kg, sd, isect, ray);
+		sd->Ng = Ng;
+		sd->N = Ng;
+
 		/* smooth normal */
-		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL)
-			ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));
+		if(sd->shader & SHADER_SMOOTH_NORMAL)
+			sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
 
 #ifdef __DPDU__
 		/* dPdu/dPdv */
-		triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));
+		triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
 #endif
 	}
 	else {
@@ -111,40 +112,40 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
 		motion_triangle_shader_setup(kg, sd, isect, ray, false);
 	}
 
-	ccl_fetch(sd, I) = -ray->D;
+	sd->I = -ray->D;
 
-	ccl_fetch(sd, flag) |= kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE);
+	sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
 
 #ifdef __INSTANCING__
 	if(isect->object != OBJECT_NONE) {
 		/* instance transform */
-		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
-		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng));
+		object_normal_transform_auto(kg, sd, &sd->N);
+		object_normal_transform_auto(kg, sd, &sd->Ng);
 #  ifdef __DPDU__
-		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
-		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
+		object_dir_transform_auto(kg, sd, &sd->dPdu);
+		object_dir_transform_auto(kg, sd, &sd->dPdv);
 #  endif
 	}
 #endif
 
 	/* backfacing test */
-	bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);
+	bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
 
 	if(backfacing) {
-		ccl_fetch(sd, flag) |= SD_BACKFACING;
-		ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
-		ccl_fetch(sd, N) = -ccl_fetch(sd, N);
+		sd->flag |= SD_BACKFACING;
+		sd->Ng = -sd->Ng;
+		sd->N = -sd->N;
 #ifdef __DPDU__
-		ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
-		ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
+		sd->dPdu = -sd->dPdu;
+		sd->dPdv = -sd->dPdv;
 #endif
 	}
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* differentials */
-	differential_transfer(&ccl_fetch(sd, dP), ray->dP, ray->D, ray->dD, ccl_fetch(sd, Ng), isect->t);
-	differential_incoming(&ccl_fetch(sd, dI), ray->dD);
-	differential_dudv(&ccl_fetch(sd, du), &ccl_fetch(sd, dv), ccl_fetch(sd, dPdu), ccl_fetch(sd, dPdv), ccl_fetch(sd, dP), ccl_fetch(sd, Ng));
+	differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t);
+	differential_incoming(&sd->dI, ray->dD);
+	differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
 #endif
 }
 
@@ -181,12 +182,12 @@ void shader_setup_from_subsurface(
 		sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
 
 		/* static triangle */
-		sd->P = triangle_refine_subsurface(kg, sd, isect, ray);
+		sd->P = triangle_refine_local(kg, sd, isect, ray);
 		sd->Ng = Ng;
 		sd->N = Ng;
 
 		if(sd->shader & SHADER_SMOOTH_NORMAL)
-			sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+			sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
 
 #  ifdef __DPDU__
 		/* dPdu/dPdv */
@@ -198,16 +199,16 @@ void shader_setup_from_subsurface(
 		motion_triangle_shader_setup(kg, sd, isect, ray, true);
 	}
 
-	sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
+	sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
 
 #  ifdef __INSTANCING__
 	if(isect->object != OBJECT_NONE) {
 		/* instance transform */
-		object_normal_transform(kg, sd, &sd->N);
-		object_normal_transform(kg, sd, &sd->Ng);
+		object_normal_transform_auto(kg, sd, &sd->N);
+		object_normal_transform_auto(kg, sd, &sd->Ng);
 #    ifdef __DPDU__
-		object_dir_transform(kg, sd, &sd->dPdu);
-		object_dir_transform(kg, sd, &sd->dPdv);
+		object_dir_transform_auto(kg, sd, &sd->dPdu);
+		object_dir_transform_auto(kg, sd, &sd->dPdv);
 #    endif
 	}
 #  endif
@@ -249,106 +250,108 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
                                                 int lamp)
 {
 	/* vectors */
-	ccl_fetch(sd, P) = P;
-	ccl_fetch(sd, N) = Ng;
-	ccl_fetch(sd, Ng) = Ng;
-	ccl_fetch(sd, I) = I;
-	ccl_fetch(sd, shader) = shader;
+	sd->P = P;
+	sd->N = Ng;
+	sd->Ng = Ng;
+	sd->I = I;
+	sd->shader = shader;
 	if(prim != PRIM_NONE)
-		ccl_fetch(sd, type) = PRIMITIVE_TRIANGLE;
+		sd->type = PRIMITIVE_TRIANGLE;
 	else if(lamp != LAMP_NONE)
-		ccl_fetch(sd, type) = PRIMITIVE_LAMP;
+		sd->type = PRIMITIVE_LAMP;
 	else
-		ccl_fetch(sd, type) = PRIMITIVE_NONE;
+		sd->type = PRIMITIVE_NONE;
 
 	/* primitive */
 #ifdef __INSTANCING__
-	ccl_fetch(sd, object) = object;
+	sd->object = object;
 #endif
+	sd->lamp = LAMP_NONE;
 	/* currently no access to bvh prim index for strand sd->prim*/
-	ccl_fetch(sd, prim) = prim;
+	sd->prim = prim;
 #ifdef __UV__
-	ccl_fetch(sd, u) = u;
-	ccl_fetch(sd, v) = v;
+	sd->u = u;
+	sd->v = v;
 #endif
-	ccl_fetch(sd, ray_length) = t;
+	sd->time = time;
+	sd->ray_length = t;
 
-	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE);
-	ccl_fetch(sd, object_flag) = 0;
-	if(ccl_fetch(sd, object) != OBJECT_NONE) {
-		ccl_fetch(sd, object_flag) |= kernel_tex_fetch(__object_flag,
-		                                               ccl_fetch(sd, object));
+	sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+	sd->object_flag = 0;
+	if(sd->object != OBJECT_NONE) {
+		sd->object_flag |= kernel_tex_fetch(__object_flag,
+		                                    sd->object);
 
 #ifdef __OBJECT_MOTION__
 		shader_setup_object_transforms(kg, sd, time);
-		ccl_fetch(sd, time) = time;
 	}
 	else if(lamp != LAMP_NONE) {
-		ccl_fetch(sd, ob_tfm)  = lamp_fetch_transform(kg, lamp, false);
-		ccl_fetch(sd, ob_itfm) = lamp_fetch_transform(kg, lamp, true);
+		sd->ob_tfm  = lamp_fetch_transform(kg, lamp, false);
+		sd->ob_itfm = lamp_fetch_transform(kg, lamp, true);
+		sd->lamp = lamp;
 #endif
 	}
 
 	/* transform into world space */
 	if(object_space) {
-		object_position_transform_auto(kg, sd, &ccl_fetch(sd, P));
-		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng));
-		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
-		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, I));
+		object_position_transform_auto(kg, sd, &sd->P);
+		object_normal_transform_auto(kg, sd, &sd->Ng);
+		sd->N = sd->Ng;
+		object_dir_transform_auto(kg, sd, &sd->I);
 	}
 
-	if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
+	if(sd->type & PRIMITIVE_TRIANGLE) {
 		/* smooth normal */
-		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
-			ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));
+		if(sd->shader & SHADER_SMOOTH_NORMAL) {
+			sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
 
 #ifdef __INSTANCING__
-			if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) {
-				object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
+			if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+				object_normal_transform_auto(kg, sd, &sd->N);
 			}
 #endif
 		}
 
 		/* dPdu/dPdv */
 #ifdef __DPDU__
-		triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));
+		triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
 
 #  ifdef __INSTANCING__
-		if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) {
-			object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
-			object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
+		if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+			object_dir_transform_auto(kg, sd, &sd->dPdu);
+			object_dir_transform_auto(kg, sd, &sd->dPdv);
 		}
 #  endif
 #endif
 	}
 	else {
 #ifdef __DPDU__
-		ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
-		ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
+		sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
+		sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 	}
 
 	/* backfacing test */
-	if(ccl_fetch(sd, prim) != PRIM_NONE) {
-		bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);
+	if(sd->prim != PRIM_NONE) {
+		bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
 
 		if(backfacing) {
-			ccl_fetch(sd, flag) |= SD_BACKFACING;
-			ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
-			ccl_fetch(sd, N) = -ccl_fetch(sd, N);
+			sd->flag |= SD_BACKFACING;
+			sd->Ng = -sd->Ng;
+			sd->N = -sd->N;
 #ifdef __DPDU__
-			ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
-			ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
+			sd->dPdu = -sd->dPdu;
+			sd->dPdv = -sd->dPdv;
 #endif
 		}
 	}
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* no ray differentials here yet */
-	ccl_fetch(sd, dP) = differential3_zero();
-	ccl_fetch(sd, dI) = differential3_zero();
-	ccl_fetch(sd, du) = differential_zero();
-	ccl_fetch(sd, dv) = differential_zero();
+	sd->dP = differential3_zero();
+	sd->dI = differential3_zero();
+	sd->du = differential_zero();
+	sd->dv = differential_zero();
 #endif
 }
 
@@ -378,39 +381,38 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
 ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray)
 {
 	/* vectors */
-	ccl_fetch(sd, P) = ray->D;
-	ccl_fetch(sd, N) = -ray->D;
-	ccl_fetch(sd, Ng) = -ray->D;
-	ccl_fetch(sd, I) = -ray->D;
-	ccl_fetch(sd, shader) = kernel_data.background.surface_shader;
-	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE);
-	ccl_fetch(sd, object_flag) = 0;
-#ifdef __OBJECT_MOTION__
-	ccl_fetch(sd, time) = ray->time;
-#endif
-	ccl_fetch(sd, ray_length) = 0.0f;
+	sd->P = ray->D;
+	sd->N = -ray->D;
+	sd->Ng = -ray->D;
+	sd->I = -ray->D;
+	sd->shader = kernel_data.background.surface_shader;
+	sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+	sd->object_flag = 0;
+	sd->time = ray->time;
+	sd->ray_length = 0.0f;
 
 #ifdef __INSTANCING__
-	ccl_fetch(sd, object) = PRIM_NONE;
+	sd->object = PRIM_NONE;
 #endif
-	ccl_fetch(sd, prim) = PRIM_NONE;
+	sd->lamp = LAMP_NONE;
+	sd->prim = PRIM_NONE;
 #ifdef __UV__
-	ccl_fetch(sd, u) = 0.0f;
-	ccl_fetch(sd, v) = 0.0f;
+	sd->u = 0.0f;
+	sd->v = 0.0f;
 #endif
 
 #ifdef __DPDU__
 	/* dPdu/dPdv */
-	ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
-	ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
+	sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
+	sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* differentials */
-	ccl_fetch(sd, dP) = ray->dD;
-	differential_incoming(&ccl_fetch(sd, dI), ccl_fetch(sd, dP));
-	ccl_fetch(sd, du) = differential_zero();
-	ccl_fetch(sd, dv) = differential_zero();
+	sd->dP = ray->dD;
+	differential_incoming(&sd->dI, sd->dP);
+	sd->du = differential_zero();
+	sd->dv = differential_zero();
 #endif
 }
 
@@ -421,47 +423,46 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s
 {
 	/* vectors */
 	sd->P = ray->P;
-	sd->N = -ray->D;  
+	sd->N = -ray->D;
 	sd->Ng = -ray->D;
 	sd->I = -ray->D;
 	sd->shader = SHADER_NONE;
 	sd->flag = 0;
 	sd->object_flag = 0;
-#ifdef __OBJECT_MOTION__
 	sd->time = ray->time;
-#endif
 	sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */
 
-#ifdef __INSTANCING__
+#  ifdef __INSTANCING__
 	sd->object = PRIM_NONE; /* todo: fill this for texture coordinates */
-#endif
+#  endif
+	sd->lamp = LAMP_NONE;
 	sd->prim = PRIM_NONE;
 	sd->type = PRIMITIVE_NONE;
 
-#ifdef __UV__
+#  ifdef __UV__
 	sd->u = 0.0f;
 	sd->v = 0.0f;
-#endif
+#  endif
 
-#ifdef __DPDU__
+#  ifdef __DPDU__
 	/* dPdu/dPdv */
 	sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
 	sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
-#endif
+#  endif
 
-#ifdef __RAY_DIFFERENTIALS__
+#  ifdef __RAY_DIFFERENTIALS__
 	/* differentials */
 	sd->dP = ray->dD;
 	differential_incoming(&sd->dI, sd->dP);
 	sd->du = differential_zero();
 	sd->dv = differential_zero();
-#endif
+#  endif
 
 	/* for NDC coordinates */
 	sd->ray_P = ray->P;
 	sd->ray_dP = ray->dP;
 }
-#endif
+#endif  /* __VOLUME__ */
 
 /* Merging */
 
@@ -496,27 +497,52 @@ ccl_device_inline void shader_merge_closures(ShaderData *sd)
 		}
 	}
 }
-#endif
+#endif  /* __BRANCHED_PATH__ || __VOLUME__ */
+
+/* Defensive sampling. */
+
+ccl_device_inline void shader_prepare_closures(ShaderData *sd,
+                                               ccl_addr_space PathState *state)
+{
+	/* We can likely also do defensive sampling at deeper bounces, particularly
+	 * for cases like a perfect mirror but possibly also others. This will need
+	 * a good heuristic. */
+	if(state->bounce + state->transparent_bounce == 0 && sd->num_closure > 1) {
+		float sum = 0.0f;
+
+		for(int i = 0; i < sd->num_closure; i++) {
+			ShaderClosure *sc = &sd->closure[i];
+			if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+				sum += sc->sample_weight;
+			}
+		}
+
+		for(int i = 0; i < sd->num_closure; i++) {
+			ShaderClosure *sc = &sd->closure[i];
+			if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+				sc->sample_weight = max(sc->sample_weight, 0.125f * sum);
+			}
+		}
+	}
+}
+
 
 /* BSDF */
 
 ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, ShaderData *sd, const float3 omega_in, float *pdf,
-	int skip_bsdf, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight)
+	const ShaderClosure *skip_sc, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight)
 {
 	/* this is the veach one-sample model with balance heuristic, some pdf
 	 * factors drop out when using balance heuristic weighting */
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		if(i == skip_bsdf)
-			continue;
-
-		const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		const ShaderClosure *sc = &sd->closure[i];
 
-		if(CLOSURE_IS_BSDF(sc->type)) {
+		if(sc != skip_sc && CLOSURE_IS_BSDF(sc->type)) {
 			float bsdf_pdf = 0.0f;
 			float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
 
 			if(bsdf_pdf != 0.0f) {
-				bsdf_eval_accum(result_eval, sc->type, eval*sc->weight);
+				bsdf_eval_accum(result_eval, sc->type, eval*sc->weight, 1.0f);
 				sum_pdf += bsdf_pdf*sc->sample_weight;
 			}
 
@@ -535,8 +561,8 @@ ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg,
                                                         float light_pdf,
                                                         bool use_mis)
 {
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		const ShaderClosure *sc = &sd->closure[i];
 		if(CLOSURE_IS_BSDF(sc->type)) {
 			float bsdf_pdf = 0.0f;
 			float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
@@ -544,12 +570,13 @@ ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg,
 				float mis_weight = use_mis? power_heuristic(light_pdf, bsdf_pdf): 1.0f;
 				bsdf_eval_accum(result_eval,
 				                sc->type,
-				                eval * sc->weight * mis_weight);
+				                eval * sc->weight,
+				                mis_weight);
 			}
 		}
 	}
 }
-#endif
+#endif  /* __BRANCHED_PATH__ */
 
 
 #ifndef __KERNEL_CUDA__
@@ -573,56 +600,134 @@ void shader_bsdf_eval(KernelGlobals *kg,
 #endif
 	{
 		float pdf;
-		_shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, -1, eval, 0.0f, 0.0f);
+		_shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, NULL, eval, 0.0f, 0.0f);
 		if(use_mis) {
 			float weight = power_heuristic(light_pdf, pdf);
-			bsdf_eval_mul(eval, weight);
+			bsdf_eval_mis(eval, weight);
 		}
 	}
 }
 
-ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
-                                         ShaderData *sd,
-                                         float randu, float randv,
-                                         BsdfEval *bsdf_eval,
-                                         float3 *omega_in,
-                                         differential3 *domega_in,
-                                         float *pdf)
+ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd,
+                                                        float *randu)
 {
+	/* Note the sampling here must match shader_bssrdf_pick,
+	 * since we reuse the same random number. */
 	int sampled = 0;
 
-	if(ccl_fetch(sd, num_closure) > 1) {
-		/* pick a BSDF closure based on sample weights */
+	if(sd->num_closure > 1) {
+		/* Pick a BSDF or BSSRDF based on sample weights. */
 		float sum = 0.0f;
 
-		for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
-			const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
-			
-			if(CLOSURE_IS_BSDF(sc->type))
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
+
+			if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
 				sum += sc->sample_weight;
+			}
 		}
 
-		float r = ccl_fetch(sd, randb_closure)*sum;
-		sum = 0.0f;
+		float r = (*randu)*sum;
+		float partial_sum = 0.0f;
 
-		for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
-			const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
-			
-			if(CLOSURE_IS_BSDF(sc->type)) {
-				sum += sc->sample_weight;
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
 
-				if(r <= sum)
+			if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+				float next_sum = partial_sum + sc->sample_weight;
+
+				if(r < next_sum) {
+					sampled = i;
+
+					/* Rescale to reuse for direction sample, to better
+					 * preserve stratification. */
+					*randu = (r - partial_sum) / sc->sample_weight;
 					break;
+				}
+
+				partial_sum = next_sum;
 			}
 		}
+	}
 
-		if(sampled == ccl_fetch(sd, num_closure)) {
-			*pdf = 0.0f;
-			return LABEL_NONE;
+	const ShaderClosure *sc = &sd->closure[sampled];
+	return CLOSURE_IS_BSDF(sc->type)? sc: NULL;
+}
+
+ccl_device_inline const ShaderClosure *shader_bssrdf_pick(ShaderData *sd,
+                                                          ccl_addr_space float3 *throughput,
+                                                          float *randu)
+{
+	/* Note the sampling here must match shader_bsdf_pick,
+	 * since we reuse the same random number. */
+	int sampled = 0;
+
+	if(sd->num_closure > 1) {
+		/* Pick a BSDF or BSSRDF based on sample weights. */
+		float sum_bsdf = 0.0f;
+		float sum_bssrdf = 0.0f;
+
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
+
+			if(CLOSURE_IS_BSDF(sc->type)) {
+				sum_bsdf += sc->sample_weight;
+			}
+			else if(CLOSURE_IS_BSSRDF(sc->type)) {
+				sum_bssrdf += sc->sample_weight;
+			}
+		}
+
+		float r = (*randu)*(sum_bsdf + sum_bssrdf);
+		float partial_sum = 0.0f;
+
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
+
+			if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+				float next_sum = partial_sum + sc->sample_weight;
+
+				if(r < next_sum) {
+					if(CLOSURE_IS_BSDF(sc->type)) {
+						*throughput *= (sum_bsdf + sum_bssrdf) / sum_bsdf;
+						return NULL;
+					}
+					else {
+						*throughput *= (sum_bsdf + sum_bssrdf) / sum_bssrdf;
+						sampled = i;
+
+						/* Rescale to reuse for direction sample, to better
+						 * preserve stratification. */
+						*randu = (r - partial_sum) / sc->sample_weight;
+						break;
+					}
+				}
+
+				partial_sum = next_sum;
+			}
 		}
 	}
 
-	const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
+	const ShaderClosure *sc = &sd->closure[sampled];
+	return CLOSURE_IS_BSSRDF(sc->type)? sc: NULL;
+}
+
+ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
+                                         ShaderData *sd,
+                                         float randu, float randv,
+                                         BsdfEval *bsdf_eval,
+                                         float3 *omega_in,
+                                         differential3 *domega_in,
+                                         float *pdf)
+{
+	const ShaderClosure *sc = shader_bsdf_pick(sd, &randu);
+	if(sc == NULL) {
+		*pdf = 0.0f;
+		return LABEL_NONE;
+	}
+
+	/* BSSRDF should already have been handled elsewhere. */
+	kernel_assert(CLOSURE_IS_BSDF(sc->type));
 
 	int label;
 	float3 eval;
@@ -633,9 +738,9 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
 	if(*pdf != 0.0f) {
 		bsdf_eval_init(bsdf_eval, sc->type, eval*sc->weight, kernel_data.film.use_light_pass);
 
-		if(ccl_fetch(sd, num_closure) > 1) {
+		if(sd->num_closure > 1) {
 			float sweight = sc->sample_weight;
-			_shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight);
+			_shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sc, bsdf_eval, *pdf*sweight, sweight);
 		}
 	}
 
@@ -658,31 +763,63 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, ShaderData *sd,
 	return label;
 }
 
+ccl_device float shader_bsdf_average_roughness(ShaderData *sd)
+{
+	float roughness = 0.0f;
+	float sum_weight = 0.0f;
+
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
+
+		if(CLOSURE_IS_BSDF(sc->type)) {
+			/* sqrt once to undo the squaring from multiplying roughness on the
+			 * two axes, and once for the squared roughness convention. */
+			float weight = fabsf(average(sc->weight));
+			roughness += weight * sqrtf(safe_sqrtf(bsdf_get_roughness_squared(sc)));
+			sum_weight += weight;
+		}
+	}
+
+	return (sum_weight > 0.0f) ? roughness / sum_weight : 0.0f;
+}
+
 ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness)
 {
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF(sc->type))
 			bsdf_blur(kg, sc, roughness);
 	}
 }
 
-ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd)
 {
-	if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME)
+	if(sd->flag & SD_HAS_ONLY_VOLUME) {
 		return make_float3(1.0f, 1.0f, 1.0f);
+	}
+	else if(sd->flag & SD_TRANSPARENT) {
+		return sd->closure_transparent_extinction;
+	}
+	else {
+		return make_float3(0.0f, 0.0f, 0.0f);
+	}
+}
 
-	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
+ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *sd)
+{
+	if(sd->flag & SD_TRANSPARENT) {
+		for(int i = 0; i < sd->num_closure; i++) {
+			ShaderClosure *sc = &sd->closure[i];
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+			if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) {
+				sc->sample_weight = 0.0f;
+				sc->weight = make_float3(0.0f, 0.0f, 0.0f);
+			}
+		}
 
-		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl
-			eval += sc->weight;
+		sd->flag &= ~SD_TRANSPARENT;
 	}
-
-	return eval;
 }
 
 ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd)
@@ -691,7 +828,7 @@ ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd)
 
 	alpha = max(alpha, make_float3(0.0f, 0.0f, 0.0f));
 	alpha = min(alpha, make_float3(1.0f, 1.0f, 1.0f));
-	
+
 	return alpha;
 }
 
@@ -699,8 +836,8 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
 			eval += sc->weight;
@@ -713,8 +850,8 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
 			eval += sc->weight;
@@ -727,8 +864,8 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
 			eval += sc->weight;
@@ -741,8 +878,8 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type))
 			eval += sc->weight;
@@ -751,31 +888,39 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
 	return eval;
 }
 
+ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd)
+{
+	float3 N = make_float3(0.0f, 0.0f, 0.0f);
+
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
+		if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
+			N += sc->N*fabsf(average(sc->weight));
+	}
+
+	return (is_zero(N))? sd->N : normalize(N);
+}
+
 ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_factor, float3 *N_)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 	float3 N = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
 			const DiffuseBsdf *bsdf = (const DiffuseBsdf*)sc;
 			eval += sc->weight*ao_factor;
-			N += bsdf->N*average(sc->weight);
+			N += bsdf->N*fabsf(average(sc->weight));
 		}
 		else if(CLOSURE_IS_AMBIENT_OCCLUSION(sc->type)) {
 			eval += sc->weight;
-			N += ccl_fetch(sd, N)*average(sc->weight);
+			N += sd->N*fabsf(average(sc->weight));
 		}
 	}
 
-	if(is_zero(N))
-		N = ccl_fetch(sd, N);
-	else
-		N = normalize(N);
-
-	*N_ = N;
+	*N_ = (is_zero(N))? sd->N : normalize(N);
 	return eval;
 }
 
@@ -786,8 +931,8 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 	float3 N = make_float3(0.0f, 0.0f, 0.0f);
 	float texture_blur = 0.0f, weight_sum = 0.0f;
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSSRDF(sc->type)) {
 			const Bssrdf *bssrdf = (const Bssrdf*)sc;
@@ -801,35 +946,25 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 	}
 
 	if(N_)
-		*N_ = (is_zero(N))? ccl_fetch(sd, N): normalize(N);
+		*N_ = (is_zero(N))? sd->N: normalize(N);
 
 	if(texture_blur_)
-		*texture_blur_ = texture_blur/weight_sum;
-	
+		*texture_blur_ = safe_divide(texture_blur, weight_sum);
+
 	return eval;
 }
-#endif
+#endif  /* __SUBSURFACE__ */
 
 /* Emission */
 
-ccl_device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *sc)
-{
-	return emissive_simple_eval(ccl_fetch(sd, Ng), ccl_fetch(sd, I));
-}
-
 ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
 {
-	float3 eval;
-	eval = make_float3(0.0f, 0.0f, 0.0f);
-
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
-
-		if(CLOSURE_IS_EMISSION(sc->type))
-			eval += emissive_eval(kg, sd, sc)*sc->weight;
+	if(sd->flag & SD_EMISSION) {
+		return emissive_simple_eval(sd->Ng, sd->I) * sd->closure_emission_background;
+	}
+	else {
+		return make_float3(0.0f, 0.0f, 0.0f);
 	}
-
-	return eval;
 }
 
 /* Holdout */
@@ -838,8 +973,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 weight = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_HOLDOUT(sc->type))
 			weight += sc->weight;
@@ -850,16 +985,26 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 
 /* Surface Evaluation */
 
-ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_space RNG *rng,
-	ccl_addr_space PathState *state, float randb, int path_flag, ShaderContext ctx)
+ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
+	ccl_addr_space PathState *state, int path_flag)
 {
-	ccl_fetch(sd, num_closure) = 0;
-	ccl_fetch(sd, num_closure_extra) = 0;
-	ccl_fetch(sd, randb_closure) = randb;
+	/* If the path is being terminated, or we are tracing a shadow ray or
+	 * evaluating emission, we don't need to store closures. The emission and
+	 * shadow shader data also have no closure array, to save GPU memory. */
+	int max_closures;
+	if(path_flag & (PATH_RAY_TERMINATE|PATH_RAY_SHADOW|PATH_RAY_EMISSION)) {
+		max_closures = 0;
+	}
+	else {
+		max_closures = kernel_data.integrator.max_closures;
+	}
+
+	sd->num_closure = 0;
+	sd->num_closure_left = max_closures;
 
 #ifdef __OSL__
 	if(kg->osl)
-		OSLShader::eval_surface(kg, sd, state, path_flag, ctx);
+		OSLShader::eval_surface(kg, sd, state, path_flag);
 	else
 #endif
 	{
@@ -869,49 +1014,46 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_
 		DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd,
 		                                             sizeof(DiffuseBsdf),
 		                                             make_float3(0.8f, 0.8f, 0.8f));
-		bsdf->N = ccl_fetch(sd, N);
-		ccl_fetch(sd, flag) |= bsdf_diffuse_setup(bsdf);
+		if (bsdf != NULL) {
+			bsdf->N = sd->N;
+			sd->flag |= bsdf_diffuse_setup(bsdf);
+		}
 #endif
 	}
 
-	if(rng && (ccl_fetch(sd, flag) & SD_BSDF_NEEDS_LCG)) {
-		ccl_fetch(sd, lcg_state) = lcg_state_init_addrspace(rng, state, 0xb4bc3953);
+	if(sd->flag & SD_BSDF_NEEDS_LCG) {
+		sd->lcg_state = lcg_state_init_addrspace(state, 0xb4bc3953);
 	}
 }
 
 /* Background Evaluation */
 
 ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd,
-	ccl_addr_space PathState *state, int path_flag, ShaderContext ctx)
+	ccl_addr_space PathState *state, int path_flag)
 {
-	ccl_fetch(sd, num_closure) = 0;
-	ccl_fetch(sd, num_closure_extra) = 0;
-	ccl_fetch(sd, randb_closure) = 0.0f;
+	sd->num_closure = 0;
+	sd->num_closure_left = 0;
 
 #ifdef __SVM__
-#ifdef __OSL__
+#  ifdef __OSL__
 	if(kg->osl) {
-		OSLShader::eval_background(kg, sd, state, path_flag, ctx);
+		OSLShader::eval_background(kg, sd, state, path_flag);
 	}
 	else
-#endif
+#  endif  /* __OSL__ */
 	{
 		svm_eval_nodes(kg, sd, state, SHADER_TYPE_SURFACE, path_flag);
 	}
 
-	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
-
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
-
-		if(CLOSURE_IS_BACKGROUND(sc->type))
-			eval += sc->weight;
+	if(sd->flag & SD_EMISSION) {
+		return sd->closure_emission_background;
 	}
-
-	return eval;
-#else
+	else {
+		return make_float3(0.0f, 0.0f, 0.0f);
+	}
+#else  /* __SVM__ */
 	return make_float3(0.8f, 0.8f, 0.8f);
-#endif
+#endif  /* __SVM__ */
 }
 
 /* Volume */
@@ -932,7 +1074,7 @@ ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, con
 			float3 eval = volume_phase_eval(sd, sc, omega_in, &phase_pdf);
 
 			if(phase_pdf != 0.0f) {
-				bsdf_eval_accum(result_eval, sc->type, eval);
+				bsdf_eval_accum(result_eval, sc->type, eval, 1.0f);
 				sum_pdf += phase_pdf*sc->sample_weight;
 			}
 
@@ -963,22 +1105,27 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg, const ShaderData *s
 
 		for(sampled = 0; sampled < sd->num_closure; sampled++) {
 			const ShaderClosure *sc = &sd->closure[sampled];
-			
+
 			if(CLOSURE_IS_PHASE(sc->type))
 				sum += sc->sample_weight;
 		}
 
-		float r = sd->randb_closure*sum;
-		sum = 0.0f;
+		float r = randu*sum;
+		float partial_sum = 0.0f;
 
 		for(sampled = 0; sampled < sd->num_closure; sampled++) {
 			const ShaderClosure *sc = &sd->closure[sampled];
-			
+
 			if(CLOSURE_IS_PHASE(sc->type)) {
-				sum += sc->sample_weight;
+				float next_sum = partial_sum + sc->sample_weight;
 
-				if(r <= sum)
+				if(r <= next_sum) {
+					/* Rescale to reuse for volume phase direction sample. */
+					randu = (r - partial_sum) / sc->sample_weight;
 					break;
+				}
+
+				partial_sum = next_sum;
 			}
 		}
 
@@ -1024,15 +1171,25 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg, const ShaderData *
 
 ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
                                           ShaderData *sd,
-                                          PathState *state,
-                                          VolumeStack *stack,
-                                          int path_flag,
-                                          ShaderContext ctx)
+                                          ccl_addr_space PathState *state,
+                                          ccl_addr_space VolumeStack *stack,
+                                          int path_flag)
 {
+	/* If the path is being terminated, or we are tracing a shadow ray or
+	 * evaluating emission, we don't need to store closures. The emission and
+	 * shadow shader data also have no closure array, to save GPU memory. */
+	int max_closures;
+	if(path_flag & (PATH_RAY_TERMINATE|PATH_RAY_SHADOW|PATH_RAY_EMISSION)) {
+		max_closures = 0;
+	}
+	else {
+		max_closures = kernel_data.integrator.max_closures;
+	}
+
 	/* reset closures once at the start, we will be accumulating the closures
 	 * for all volumes in the stack into a single array of closures */
 	sd->num_closure = 0;
-	sd->num_closure_extra = 0;
+	sd->num_closure_left = max_closures;
 	sd->flag = 0;
 	sd->object_flag = 0;
 
@@ -1040,10 +1197,11 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
 		/* setup shaderdata from stack. it's mostly setup already in
 		 * shader_setup_from_volume, this switching should be quick */
 		sd->object = stack[i].object;
+		sd->lamp = LAMP_NONE;
 		sd->shader = stack[i].shader;
 
 		sd->flag &= ~SD_SHADER_FLAGS;
-		sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
+		sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
 		sd->object_flag &= ~SD_OBJECT_FLAGS;
 
 		if(sd->object != OBJECT_NONE) {
@@ -1060,7 +1218,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
 #ifdef __SVM__
 #  ifdef __OSL__
 		if(kg->osl) {
-			OSLShader::eval_volume(kg, sd, state, path_flag, ctx);
+			OSLShader::eval_volume(kg, sd, state, path_flag);
 		}
 		else
 #  endif
@@ -1075,21 +1233,20 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
 	}
 }
 
-#endif
+#endif  /* __VOLUME__ */
 
 /* Displacement Evaluation */
 
-ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderContext ctx)
+ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state)
 {
-	ccl_fetch(sd, num_closure) = 0;
-	ccl_fetch(sd, num_closure_extra) = 0;
-	ccl_fetch(sd, randb_closure) = 0.0f;
+	sd->num_closure = 0;
+	sd->num_closure_left = 0;
 
 	/* this will modify sd->P */
 #ifdef __SVM__
 #  ifdef __OSL__
 	if(kg->osl)
-		OSLShader::eval_displacement(kg, sd, ctx);
+		OSLShader::eval_displacement(kg, sd, state);
 	else
 #  endif
 	{
@@ -1117,11 +1274,10 @@ ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect
 		shader = __float_as_int(str.z);
 	}
 #endif
-	int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*SHADER_SIZE);
+	int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
 
 	return (flag & SD_HAS_TRANSPARENT_SHADOW) != 0;
 }
-#endif
+#endif  /* __TRANSPARENT_SHADOWS__ */
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index 06a77a208cb..8a0da6c3b13 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -16,6 +16,42 @@
 
 CCL_NAMESPACE_BEGIN
 
+#ifdef __VOLUME__
+typedef struct VolumeState {
+#  ifdef __SPLIT_KERNEL__
+#  else
+	PathState ps;
+#  endif
+} VolumeState;
+
+/* Copy *state into a scratch PathState for volume stack evaluation and return a pointer to that copy (volume_state->ps, or the kernel_split_state.state_shadow entry indexed by global id in the split kernel). */
+#  ifdef __SPLIT_KERNEL__
+ccl_addr_space
+#  endif
+ccl_device_inline PathState *shadow_blocked_volume_path_state(
+        KernelGlobals *kg,
+        VolumeState *volume_state,
+        ccl_addr_space PathState *state,
+        ShaderData *sd,
+        Ray *ray)
+{
+#  ifdef __SPLIT_KERNEL__
+	ccl_addr_space PathState *ps =
+	        &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
+#  else
+	PathState *ps = &volume_state->ps;
+#  endif
+	*ps = *state;  /* Everything below modifies the copy; *state is only read here. */
+	/* We are checking for shadow on the "other" side of the surface, so we need
+	 * to discard the volume we are currently in: when the ray direction opposes
+	 * the geometric normal, kernel_volume_stack_enter_exit() is applied to the copy. */
+	if(dot(sd->Ng, ray->D) < 0.0f) {
+		kernel_volume_stack_enter_exit(kg, sd, ps->volume_stack);
+	}
+	return ps;
+}
+#endif  /* __VOLUME__ */
+
 /* Attenuate throughput accordingly to the given intersection event.
  * Returns true if the throughput is zero and traversal can be aborted.
  */
@@ -24,7 +60,7 @@ ccl_device_forceinline bool shadow_handle_transparent_isect(
         ShaderData *shadow_sd,
         ccl_addr_space PathState *state,
 #    ifdef __VOLUME__
-        struct PathState *volume_state,
+        ccl_addr_space struct PathState *volume_state,
 #    endif
         Intersection *isect,
         Ray *ray,
@@ -45,15 +81,12 @@ ccl_device_forceinline bool shadow_handle_transparent_isect(
 	/* Setup shader data at surface. */
 	shader_setup_from_ray(kg, shadow_sd, isect, ray);
 	/* Attenuation from transparent surface. */
-	if(!(ccl_fetch(shadow_sd, flag) & SD_HAS_ONLY_VOLUME)) {
+	if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
 		path_state_modify_bounce(state, true);
 		shader_eval_surface(kg,
 		                    shadow_sd,
-		                    NULL,
 		                    state,
-		                    0.0f,
-		                    PATH_RAY_SHADOW,
-		                    SHADER_CONTEXT_SHADOW);
+		                    PATH_RAY_SHADOW);
 		path_state_modify_bounce(state, false);
 		*throughput *= shader_bsdf_transparency(kg, shadow_sd);
 	}
@@ -72,13 +105,14 @@ ccl_device_forceinline bool shadow_handle_transparent_isect(
 ccl_device bool shadow_blocked_opaque(KernelGlobals *kg,
                                       ShaderData *shadow_sd,
                                       ccl_addr_space PathState *state,
+                                      const uint visibility,
                                       Ray *ray,
                                       Intersection *isect,
                                       float3 *shadow)
 {
 	const bool blocked = scene_intersect(kg,
 	                                     *ray,
-	                                     PATH_RAY_SHADOW_OPAQUE,
+	                                     visibility & PATH_RAY_SHADOW_OPAQUE,
 	                                     isect,
 	                                     NULL,
 	                                     0.0f, 0.0f);
@@ -126,8 +160,10 @@ ccl_device bool shadow_blocked_opaque(KernelGlobals *kg,
  * Note that hits array should be as big as max_hits+1.
  */
 ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
+                                                    ShaderData *sd,
                                                     ShaderData *shadow_sd,
                                                     ccl_addr_space PathState *state,
+                                                    const uint visibility,
                                                     Ray *ray,
                                                     Intersection *hits,
                                                     uint max_hits,
@@ -140,8 +176,12 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
 	const bool blocked = scene_intersect_shadow_all(kg,
 	                                                ray,
 	                                                hits,
+	                                                visibility,
 	                                                max_hits,
 	                                                &num_hits);
+#    ifdef __VOLUME__
+	VolumeState volume_state;
+#    endif
 	/* If no opaque surface found but we did find transparent hits,
 	 * shade them.
 	 */
@@ -152,7 +192,14 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
 		int bounce = state->transparent_bounce;
 		Intersection *isect = hits;
 #    ifdef __VOLUME__
-		PathState ps = *state;
+#      ifdef __SPLIT_KERNEL__
+		ccl_addr_space
+#      endif
+		PathState *ps = shadow_blocked_volume_path_state(kg,
+		                                                 &volume_state,
+		                                                 state,
+		                                                 sd,
+		                                                 ray);
 #    endif
 		sort_intersections(hits, num_hits);
 		for(int hit = 0; hit < num_hits; hit++, isect++) {
@@ -171,7 +218,7 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
 			                                   shadow_sd,
 			                                   state,
 #ifdef __VOLUME__
-			                                   &ps,
+			                                   ps,
 #endif
 			                                   isect,
 			                                   ray,
@@ -180,7 +227,7 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
 				return true;
 			}
 			/* Move ray forward. */
-			ray->P = ccl_fetch(shadow_sd, P);
+			ray->P = shadow_sd->P;
 			if(ray->t != FLT_MAX) {
 				ray->D = normalize_len(Pend - ray->P, &ray->t);
 			}
@@ -188,8 +235,8 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
 		}
 #    ifdef __VOLUME__
 		/* Attenuation for last line segment towards light. */
-		if(ps.volume_stack[0].shader != SHADER_NONE) {
-			kernel_volume_shadow(kg, shadow_sd, &ps, ray, &throughput);
+		if(ps->volume_stack[0].shader != SHADER_NONE) {
+			kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput);
 		}
 #    endif
 		*shadow = throughput;
@@ -197,8 +244,16 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
 	}
 #    ifdef __VOLUME__
 	if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
-		/* Apply attenuation from current volume shader/ */
-		kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
+		/* Apply attenuation from current volume shader. */
+#      ifdef __SPLIT_KERNEL__
+		ccl_addr_space
+#      endif
+		PathState *ps = shadow_blocked_volume_path_state(kg,
+		                                                 &volume_state,
+		                                                 state,
+		                                                 sd,
+		                                                 ray);
+		kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow);
 	}
 #    endif
 	return blocked;
@@ -208,13 +263,18 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
  * loop to help readability of the actual logic.
  */
 ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
+                                               ShaderData *sd,
                                                ShaderData *shadow_sd,
                                                ccl_addr_space PathState *state,
+                                               const uint visibility,
                                                Ray *ray,
                                                uint max_hits,
                                                float3 *shadow)
 {
-#    ifdef __KERNEL_CUDA__
+#    ifdef __SPLIT_KERNEL__
+	Intersection hits_[SHADOW_STACK_MAX_HITS];
+	Intersection *hits = &hits_[0];
+#    elif defined(__KERNEL_CUDA__)
 	Intersection *hits = kg->hits_stack;
 #    else
 	Intersection hits_stack[SHADOW_STACK_MAX_HITS];
@@ -239,8 +299,10 @@ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
 #    endif  /* __KERNEL_GPU__ */
 	/* Invoke actual traversal. */
 	return shadow_blocked_transparent_all_loop(kg,
+	                                           sd,
 	                                           shadow_sd,
 	                                           state,
+	                                           visibility,
 	                                           ray,
 	                                           hits,
 	                                           max_hits,
@@ -248,7 +310,7 @@ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
 }
 #  endif  /* __SHADOW_RECORD_ALL__ */
 
-#  ifdef __KERNEL_GPU__
+#  if defined(__KERNEL_GPU__) || !defined(__SHADOW_RECORD_ALL__)
 /* Shadow function to compute how much light is blocked,
  *
  * Here we raytrace from one transparent surface to the next step by step.
@@ -263,20 +325,32 @@ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
  */
 ccl_device bool shadow_blocked_transparent_stepped_loop(
         KernelGlobals *kg,
+        ShaderData *sd,
         ShaderData *shadow_sd,
         ccl_addr_space PathState *state,
+        const uint visibility,
         Ray *ray,
         Intersection *isect,
         const bool blocked,
         const bool is_transparent_isect,
         float3 *shadow)
 {
+#    ifdef __VOLUME__
+	VolumeState volume_state;
+#    endif
 	if(blocked && is_transparent_isect) {
 		float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
 		float3 Pend = ray->P + ray->D*ray->t;
 		int bounce = state->transparent_bounce;
 #    ifdef __VOLUME__
-		PathState ps = *state;
+#      ifdef __SPLIT_KERNEL__
+		ccl_addr_space
+#      endif
+		PathState *ps = shadow_blocked_volume_path_state(kg,
+		                                                 &volume_state,
+		                                                 state,
+		                                                 sd,
+		                                                 ray);
 #    endif
 		for(;;) {
 			if(bounce >= kernel_data.integrator.transparent_max_bounce) {
@@ -284,7 +358,7 @@ ccl_device bool shadow_blocked_transparent_stepped_loop(
 			}
 			if(!scene_intersect(kg,
 			                    *ray,
-			                    PATH_RAY_SHADOW_TRANSPARENT,
+			                    visibility & PATH_RAY_SHADOW_TRANSPARENT,
 			                    isect,
 			                    NULL,
 			                    0.0f, 0.0f))
@@ -299,7 +373,7 @@ ccl_device bool shadow_blocked_transparent_stepped_loop(
 			                                   shadow_sd,
 			                                   state,
 #ifdef __VOLUME__
-			                                   &ps,
+			                                   ps,
 #endif
 			                                   isect,
 			                                   ray,
@@ -308,7 +382,7 @@ ccl_device bool shadow_blocked_transparent_stepped_loop(
 				return true;
 			}
 			/* Move ray forward. */
-			ray->P = ray_offset(ccl_fetch(shadow_sd, P), -ccl_fetch(shadow_sd, Ng));
+			ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng);
 			if(ray->t != FLT_MAX) {
 				ray->D = normalize_len(Pend - ray->P, &ray->t);
 			}
@@ -316,8 +390,8 @@ ccl_device bool shadow_blocked_transparent_stepped_loop(
 		}
 #    ifdef __VOLUME__
 		/* Attenuation for last line segment towards light. */
-		if(ps.volume_stack[0].shader != SHADER_NONE) {
-			kernel_volume_shadow(kg, shadow_sd, &ps, ray, &throughput);
+		if(ps->volume_stack[0].shader != SHADER_NONE) {
+			kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput);
 		}
 #    endif
 		*shadow *= throughput;
@@ -326,7 +400,15 @@ ccl_device bool shadow_blocked_transparent_stepped_loop(
 #    ifdef __VOLUME__
 	if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
 		/* Apply attenuation from current volume shader. */
-		kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
+#      ifdef __SPLIT_KERNEL__
+		ccl_addr_space
+#      endif
+		PathState *ps = shadow_blocked_volume_path_state(kg,
+		                                                 &volume_state,
+		                                                 state,
+		                                                 sd,
+		                                                 ray);
+		kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow);
 	}
 #    endif
 	return blocked;
@@ -334,24 +416,28 @@ ccl_device bool shadow_blocked_transparent_stepped_loop(
 
 ccl_device bool shadow_blocked_transparent_stepped(
         KernelGlobals *kg,
+        ShaderData *sd,
         ShaderData *shadow_sd,
         ccl_addr_space PathState *state,
+        const uint visibility,
         Ray *ray,
         Intersection *isect,
         float3 *shadow)
 {
-	const bool blocked = scene_intersect(kg,
-	                                     *ray,
-	                                     PATH_RAY_SHADOW_OPAQUE,
-	                                     isect,
-	                                     NULL,
-	                                     0.0f, 0.0f);
-	const bool is_transparent_isect = blocked
-	        ? shader_transparent_shadow(kg, isect)
-	        : false;
+	bool blocked = scene_intersect(kg,
+	                               *ray,
+	                               visibility & PATH_RAY_SHADOW_OPAQUE,
+	                               isect,
+	                               NULL,
+	                               0.0f, 0.0f);
+	bool is_transparent_isect = blocked
+		? shader_transparent_shadow(kg, isect)
+		: false;
 	return shadow_blocked_transparent_stepped_loop(kg,
+	                                               sd,
 	                                               shadow_sd,
 	                                               state,
+	                                               visibility,
 	                                               ray,
 	                                               isect,
 	                                               blocked,
@@ -359,32 +445,30 @@ ccl_device bool shadow_blocked_transparent_stepped(
 	                                               shadow);
 }
 
-#  endif  /* __KERNEL_GPU__ */
+#  endif  /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */
 #endif /* __TRANSPARENT_SHADOWS__ */
 
 ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
+                                      ShaderData *sd,
                                       ShaderData *shadow_sd,
                                       ccl_addr_space PathState *state,
-                                      ccl_addr_space Ray *ray_input,
+                                      Ray *ray_input,
                                       float3 *shadow)
 {
-	/* Special trickery for split kernel: some data is coming from the
-	 * global memory.
-	 */
-#ifdef __SPLIT_KERNEL__
-	Ray private_ray = *ray_input;
-	Ray *ray = &private_ray;
-	Intersection *isect = &kg->isect_shadow[SD_THREAD];
-#else  /* __SPLIT_KERNEL__ */
 	Ray *ray = ray_input;
-	Intersection isect_object;
-	Intersection *isect = &isect_object;
-#endif  /* __SPLIT_KERNEL__ */
+	Intersection isect;
 	/* Some common early checks. */
 	*shadow = make_float3(1.0f, 1.0f, 1.0f);
 	if(ray->t == 0.0f) {
 		return false;
 	}
+#ifdef __SHADOW_TRICKS__
+	const uint visibility = (state->flag & PATH_RAY_SHADOW_CATCHER)
+		? PATH_RAY_SHADOW_NON_CATCHER
+		: PATH_RAY_SHADOW;
+#else
+	const uint visibility = PATH_RAY_SHADOW;
+#endif
 	/* Do actual shadow shading. */
 	/* First of all, we check if integrator requires transparent shadows.
 	 * if not, we use simplest and fastest ever way to calculate occlusion.
@@ -396,8 +480,9 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
 		return shadow_blocked_opaque(kg,
 		                             shadow_sd,
 		                             state,
+		                             visibility,
 		                             ray,
-		                             isect,
+		                             &isect,
 		                             shadow);
 	}
 #ifdef __TRANSPARENT_SHADOWS__
@@ -422,39 +507,45 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
 	 */
 	const bool blocked = scene_intersect(kg,
 	                                     *ray,
-	                                     PATH_RAY_SHADOW_OPAQUE,
-	                                     isect,
+	                                     visibility & PATH_RAY_SHADOW_OPAQUE,
+	                                     &isect,
 	                                     NULL,
 	                                     0.0f, 0.0f);
 	const bool is_transparent_isect = blocked
-	        ? shader_transparent_shadow(kg, isect)
+	        ? shader_transparent_shadow(kg, &isect)
 	        : false;
 	if(!blocked || !is_transparent_isect ||
 	   max_hits + 1 >= SHADOW_STACK_MAX_HITS)
 	{
 		return shadow_blocked_transparent_stepped_loop(kg,
+		                                               sd,
 		                                               shadow_sd,
 		                                               state,
+		                                               visibility,
 		                                               ray,
-		                                               isect,
+		                                               &isect,
 		                                               blocked,
 		                                               is_transparent_isect,
 		                                               shadow);
 	}
 #    endif  /* __KERNEL_GPU__ */
 	return shadow_blocked_transparent_all(kg,
+	                                      sd,
 	                                      shadow_sd,
 	                                      state,
+	                                      visibility,
 	                                      ray,
 	                                      max_hits,
 	                                      shadow);
 #  else  /* __SHADOW_RECORD_ALL__ */
 	/* Fallback to a slowest version which works on all devices. */
 	return shadow_blocked_transparent_stepped(kg,
+	                                          sd,
 	                                          shadow_sd,
 	                                          state,
+	                                          visibility,
 	                                          ray,
-	                                          isect,
+	                                          &isect,
 	                                          shadow);
 #  endif  /* __SHADOW_RECORD_ALL__ */
 #endif  /* __TRANSPARENT_SHADOWS__ */
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index 52c05b85aee..e8553d84547 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -20,134 +20,82 @@ CCL_NAMESPACE_BEGIN
  *
  * BSSRDF Importance Sampling, SIGGRAPH 2013
  * http://library.imageworks.com/pdfs/imageworks-library-BSSRDF-sampling.pdf
- *
- */
-
-/* TODO:
- * - test using power heuristic for combing bssrdfs
- * - try to reduce one sample model variance
  */
 
-#define BSSRDF_MULTI_EVAL
-
-ccl_device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, ShaderData *sd, float *probability)
-{
-	/* sum sample weights of bssrdf and bsdf */
-	float bsdf_sum = 0.0f;
-	float bssrdf_sum = 0.0f;
-
-	for(int i = 0; i < sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
-		
-		if(CLOSURE_IS_BSDF(sc->type))
-			bsdf_sum += sc->sample_weight;
-		else if(CLOSURE_IS_BSSRDF(sc->type))
-			bssrdf_sum += sc->sample_weight;
-	}
-
-	/* use bsdf or bssrdf? */
-	float r = sd->randb_closure*(bsdf_sum + bssrdf_sum);
-
-	if(r < bsdf_sum) {
-		/* use bsdf, and adjust randb so we can reuse it for picking a bsdf */
-		sd->randb_closure = r/bsdf_sum;
-		*probability = (bsdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/bsdf_sum: 1.0f;
-		return NULL;
-	}
-
-	/* use bssrdf */
-	r -= bsdf_sum;
-
-	float sum = 0.0f;
-
-	for(int i = 0; i < sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
-		
-		if(CLOSURE_IS_BSSRDF(sc->type)) {
-			sum += sc->sample_weight;
-
-			if(r <= sum) {
-				sd->randb_closure = (r - (sum - sc->sample_weight))/sc->sample_weight;
-
-#ifdef BSSRDF_MULTI_EVAL
-				*probability = (bssrdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/bssrdf_sum: 1.0f;
-#else
-				*probability = (bssrdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/sc->sample_weight: 1.0f;
-#endif
-				return sc;
-			}
-		}
-	}
-
-	/* should never happen */
-	sd->randb_closure = 0.0f;
-	*probability = 1.0f;
-	return NULL;
-}
-
 ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd,
-                                                 ShaderClosure *sc,
+                                                 const ShaderClosure *sc,
                                                  float disk_r,
                                                  float r,
                                                  bool all)
 {
-#ifdef BSSRDF_MULTI_EVAL
 	/* this is the veach one-sample model with balance heuristic, some pdf
 	 * factors drop out when using balance heuristic weighting */
 	float3 eval_sum = make_float3(0.0f, 0.0f, 0.0f);
 	float pdf_sum = 0.0f;
-	float sample_weight_sum = 0.0f;
-	int num_bssrdf = 0;
+	float sample_weight_inv = 0.0f;
 
-	for(int i = 0; i < sd->num_closure; i++) {
-		sc = &sd->closure[i];
-		
-		if(CLOSURE_IS_BSSRDF(sc->type)) {
-			float sample_weight = (all)? 1.0f: sc->sample_weight;
-			sample_weight_sum += sample_weight;
+	if(!all) {
+		float sample_weight_sum = 0.0f;
+
+		for(int i = 0; i < sd->num_closure; i++) {
+			sc = &sd->closure[i];
+
+			if(CLOSURE_IS_DISK_BSSRDF(sc->type)) {
+				sample_weight_sum += sc->sample_weight;
+			}
 		}
-	}
 
-	float sample_weight_inv = 1.0f/sample_weight_sum;
+		sample_weight_inv = 1.0f/sample_weight_sum;
+	}
 
 	for(int i = 0; i < sd->num_closure; i++) {
 		sc = &sd->closure[i];
 		
-		if(CLOSURE_IS_BSSRDF(sc->type)) {
+		if(CLOSURE_IS_DISK_BSSRDF(sc->type)) {
 			/* in case of branched path integrate we sample all bssrdf's once,
 			 * for path trace we pick one, so adjust pdf for that */
 			float sample_weight = (all)? 1.0f: sc->sample_weight * sample_weight_inv;
 
 			/* compute pdf */
-			float pdf = bssrdf_pdf(sc, r);
-			float disk_pdf = bssrdf_pdf(sc, disk_r);
+			float3 eval = bssrdf_eval(sc, r);
+			float pdf = bssrdf_pdf(sc, disk_r);
 
-			/* TODO power heuristic is not working correct here */
-			eval_sum += sc->weight*pdf; //*sample_weight*disk_pdf;
-			pdf_sum += sample_weight*disk_pdf; //*sample_weight*disk_pdf;
-
-			num_bssrdf++;
+			eval_sum += sc->weight * eval;
+			pdf_sum += sample_weight * pdf;
 		}
 	}
 
 	return (pdf_sum > 0.0f)? eval_sum / pdf_sum : make_float3(0.0f, 0.0f, 0.0f);
-#else
-	float pdf = bssrdf_pdf(pick_sc, r);
-	float disk_pdf = bssrdf_pdf(pick_sc, disk_r);
-
-	return pick_sc->weight * pdf / disk_pdf;
-#endif
 }
 
 /* replace closures with a single diffuse bsdf closure after scatter step */
-ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 weight, bool hit, float3 N)
+ccl_device void subsurface_scatter_setup_diffuse_bsdf(KernelGlobals *kg, ShaderData *sd, const ShaderClosure *sc, float3 weight, float3 N)
 {
 	sd->flag &= ~SD_CLOSURE_FLAGS;
-	sd->randb_closure = 0.0f;
 	sd->num_closure = 0;
-	sd->num_closure_extra = 0;
+	sd->num_closure_left = kernel_data.integrator.max_closures;
+
+	Bssrdf *bssrdf = (Bssrdf *)sc;
+#ifdef __PRINCIPLED__
+	if(bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID ||
+	   bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID)
+	{
+		PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), weight);
+
+		if(bsdf) {
+			bsdf->N = N;
+			bsdf->roughness = bssrdf->roughness;
+			sd->flag |= bsdf_principled_diffuse_setup(bsdf);
 
-	if(hit) {
+			/* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes
+			 * can recognize it as not being a regular Disney principled diffuse closure */
+			bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID;
+		}
+	}
+	else if(CLOSURE_IS_BSDF_BSSRDF(bssrdf->type) ||
+			CLOSURE_IS_BSSRDF(bssrdf->type))
+#endif  /* __PRINCIPLED__ */
+	{
 		DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
 
 		if(bsdf) {
@@ -185,8 +133,7 @@ ccl_device float3 subsurface_color_pow(float3 color, float exponent)
 
 ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
                                            ShaderData *sd,
-                                           PathState *state,
-                                           int state_flag,
+                                           ccl_addr_space PathState *state,
                                            float3 *eval,
                                            float3 *N)
 {
@@ -199,7 +146,7 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
 
 	if(bump || texture_blur > 0.0f) {
 		/* average color and normal at incoming point */
-		shader_eval_surface(kg, sd, NULL, state, 0.0f, state_flag, SHADER_CONTEXT_SSS);
+		shader_eval_surface(kg, sd, state, state->flag);
 		float3 in_color = shader_bssrdf_sum(sd, (bump)? N: NULL, NULL);
 
 		/* we simply divide out the average color and multiply with the average
@@ -218,11 +165,11 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
 /* Subsurface scattering step, from a point on the surface to other
  * nearby points on the same object.
  */
-ccl_device_inline int subsurface_scatter_multi_intersect(
+ccl_device_inline int subsurface_scatter_disk(
         KernelGlobals *kg,
-        SubsurfaceIntersection *ss_isect,
+        LocalIntersection *ss_isect,
         ShaderData *sd,
-        ShaderClosure *sc,
+        const ShaderClosure *sc,
         uint *lcg_state,
         float disk_u,
         float disk_v,
@@ -235,26 +182,20 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 	disk_N = sd->Ng;
 	make_orthonormals(disk_N, &disk_T, &disk_B);
 
-	/* reusing variable for picking the closure gives a bit nicer stratification
-	 * for path tracer, for branched we do all closures so it doesn't help */
-	float axisu = (all)? disk_u: sd->randb_closure;
-
-	if(axisu < 0.5f) {
+	if(disk_v < 0.5f) {
 		pick_pdf_N = 0.5f;
 		pick_pdf_T = 0.25f;
 		pick_pdf_B = 0.25f;
-		if(all)
-			disk_u *= 2.0f;
+		disk_v *= 2.0f;
 	}
-	else if(axisu < 0.75f) {
+	else if(disk_v < 0.75f) {
 		float3 tmp = disk_N;
 		disk_N = disk_T;
 		disk_T = tmp;
 		pick_pdf_N = 0.25f;
 		pick_pdf_T = 0.5f;
 		pick_pdf_B = 0.25f;
-		if(all)
-			disk_u = (disk_u - 0.5f)*4.0f;
+		disk_v = (disk_v - 0.5f)*4.0f;
 	}
 	else {
 		float3 tmp = disk_N;
@@ -263,21 +204,24 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 		pick_pdf_N = 0.25f;
 		pick_pdf_T = 0.25f;
 		pick_pdf_B = 0.5f;
-		if(all)
-			disk_u = (disk_u - 0.75f)*4.0f;
+		disk_v = (disk_v - 0.75f)*4.0f;
 	}
 
 	/* sample point on disk */
-	float phi = M_2PI_F * disk_u;
-	float disk_r = disk_v;
-	float disk_height;
+	float phi = M_2PI_F * disk_v;
+	float disk_height, disk_r;
 
-	bssrdf_sample(sc, disk_r, &disk_r, &disk_height);
+	bssrdf_sample(sc, disk_u, &disk_r, &disk_height);
 
 	float3 disk_P = (disk_r*cosf(phi)) * disk_T + (disk_r*sinf(phi)) * disk_B;
 
 	/* create ray */
+#ifdef __SPLIT_KERNEL__
+	Ray ray_object = ss_isect->ray;
+	Ray *ray = &ray_object;
+#else
 	Ray *ray = &ss_isect->ray;
+#endif
 	ray->P = sd->P + disk_N*disk_height + disk_P;
 	ray->D = -disk_N;
 	ray->t = 2.0f*disk_height;
@@ -287,37 +231,37 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 
 	/* intersect with the same object. if multiple intersections are found it
 	 * will use at most BSSRDF_MAX_HITS hits, a random subset of all hits */
-	scene_intersect_subsurface(kg,
-	                           ray,
-	                           ss_isect,
-	                           sd->object,
-	                           lcg_state,
-	                           BSSRDF_MAX_HITS);
+	scene_intersect_local(kg,
+	                      *ray,
+	                      ss_isect,
+	                      sd->object,
+	                      lcg_state,
+	                      BSSRDF_MAX_HITS);
 	int num_eval_hits = min(ss_isect->num_hits, BSSRDF_MAX_HITS);
 
 	for(int hit = 0; hit < num_eval_hits; hit++) {
 		/* Quickly retrieve P and Ng without setting up ShaderData. */
 		float3 hit_P;
-		if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
-			hit_P = triangle_refine_subsurface(kg,
-			                                   sd,
-			                                   &ss_isect->hits[hit],
-			                                   ray);
+		if(sd->type & PRIMITIVE_TRIANGLE) {
+			hit_P = triangle_refine_local(kg,
+			                              sd,
+			                              &ss_isect->hits[hit],
+			                              ray);
 		}
 #ifdef __OBJECT_MOTION__
-		else  if(ccl_fetch(sd, type) & PRIMITIVE_MOTION_TRIANGLE) {
+		else  if(sd->type & PRIMITIVE_MOTION_TRIANGLE) {
 			float3 verts[3];
 			motion_triangle_vertices(
 			        kg,
-			        ccl_fetch(sd, object),
+			        sd->object,
 			        kernel_tex_fetch(__prim_index, ss_isect->hits[hit].prim),
-			        ccl_fetch(sd, time),
+			        sd->time,
 			        verts);
-			hit_P = motion_triangle_refine_subsurface(kg,
-			                                          sd,
-			                                          &ss_isect->hits[hit],
-			                                          ray,
-			                                          verts);
+			hit_P = motion_triangle_refine_local(kg,
+			                                     sd,
+			                                     &ss_isect->hits[hit],
+			                                     ray,
+			                                     verts);
 		}
 #endif  /* __OBJECT_MOTION__ */
 		else {
@@ -330,140 +274,262 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 			object_normal_transform(kg, sd, &hit_Ng);
 		}
 
-		/* probability densities for local frame axes */
+		/* Probability densities for local frame axes. */
 		float pdf_N = pick_pdf_N * fabsf(dot(disk_N, hit_Ng));
 		float pdf_T = pick_pdf_T * fabsf(dot(disk_T, hit_Ng));
 		float pdf_B = pick_pdf_B * fabsf(dot(disk_B, hit_Ng));
 
-		/* multiple importance sample between 3 axes, power heuristic
-		 * found to be slightly better than balance heuristic */
-		float mis_weight = power_heuristic_3(pdf_N, pdf_T, pdf_B);
+		/* Multiple importance sample between 3 axes, power heuristic
+		 * found to be slightly better than balance heuristic. pdf_N
+		 * in the MIS weight and denominator cancelled out. */
+		float w = pdf_N / (sqr(pdf_N) + sqr(pdf_T) + sqr(pdf_B));
+		if(ss_isect->num_hits > BSSRDF_MAX_HITS) {
+			w *= ss_isect->num_hits/(float)BSSRDF_MAX_HITS;
+		}
 
-		/* real distance to sampled point */
+		/* Real distance to sampled point. */
 		float r = len(hit_P - sd->P);
 
-		/* evaluate */
-		float w = mis_weight / pdf_N;
-		if(ss_isect->num_hits > BSSRDF_MAX_HITS)
-			w *= ss_isect->num_hits/(float)BSSRDF_MAX_HITS;
+		/* Evaluate profiles. */
 		float3 eval = subsurface_scatter_eval(sd, sc, disk_r, r, all) * w;
 
 		ss_isect->weight[hit] = eval;
 	}
 
+#ifdef __SPLIT_KERNEL__
+	ss_isect->ray = *ray;
+#endif
+
 	return num_eval_hits;
 }
 
 ccl_device_noinline void subsurface_scatter_multi_setup(
         KernelGlobals *kg,
-        SubsurfaceIntersection* ss_isect,
+        LocalIntersection* ss_isect,
         int hit,
         ShaderData *sd,
-        PathState *state,
-        int state_flag,
-        ShaderClosure *sc,
-        bool all)
+        ccl_addr_space PathState *state,
+        const ShaderClosure *sc)
 {
+#ifdef __SPLIT_KERNEL__
+	Ray ray_object = ss_isect->ray;
+	Ray *ray = &ray_object;
+#else
+	Ray *ray = &ss_isect->ray;
+#endif
+
+	/* Workaround for AMD GPU OpenCL compiler (dummy write of sd->flag). Most probably a cache bypass issue. */
+#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__)
+	kernel_split_params.dummy_sd_flag = sd->flag;
+#endif
+
 	/* Setup new shading point. */
-	shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], &ss_isect->ray);
+	shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], ray);
 
 	/* Optionally blur colors and bump mapping. */
 	float3 weight = ss_isect->weight[hit];
 	float3 N = sd->N;
-	subsurface_color_bump_blur(kg, sd, state, state_flag, &weight, &N);
+	subsurface_color_bump_blur(kg, sd, state, &weight, &N);
 
 	/* Setup diffuse BSDF. */
-	subsurface_scatter_setup_diffuse_bsdf(sd, weight, true, N);
+	subsurface_scatter_setup_diffuse_bsdf(kg, sd, sc, weight, N);
 }
 
-/* subsurface scattering step, from a point on the surface to another nearby point on the same object */
-ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathState *state,
-	int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
+/* Random walk subsurface scattering.
+ *
+ * "Practical and Controllable Subsurface Scattering for Production Path
+ *  Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */
+
+/* Remap a scalar albedo A and radius d to volume coefficients: writes the
+ * attenuation coefficient to *sigma_t and the scattering coefficient to
+ * *sigma_s. */
+ccl_device void subsurface_random_walk_remap(
+        const float A, const float d, float *sigma_t, float *sigma_s)
 {
-	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
+	/* Both factors are closed-form expressions in the albedo only. */
+	const float scatter_fraction = 1.0f - expf(A * (-5.09406f + A * (2.61188f - A * 4.31805f)));
+	const float radius_scale = 1.9f - A + 3.5f * sqr(A - 0.8f);
 
-	/* pick random axis in local frame and point on disk */
-	float3 disk_N, disk_T, disk_B;
-	float pick_pdf_N, pick_pdf_T, pick_pdf_B;
+	*sigma_t = 1.0f / fmaxf(d * radius_scale, 1e-16f);
+	*sigma_s = *sigma_t * scatter_fraction;
+}
 
-	disk_N = sd->Ng;
-	make_orthonormals(disk_N, &disk_T, &disk_B);
+/* Convert the albedo and radius of the Bssrdf closure into sigma_t and
+ * sigma_s, with one subsurface_random_walk_remap() call per component, and
+ * output the closure weight divided by the albedo. */
+ccl_device void subsurface_random_walk_coefficients(
+        const ShaderClosure *sc, float3 *sigma_t, float3 *sigma_s, float3 *weight)
 {
+	const Bssrdf *bssrdf = (const Bssrdf*)sc;
+	const float3 albedo = bssrdf->albedo;
+	const float3 radius = bssrdf->radius;
+	/* Scalar temporaries for the per-component remap results. */
+	float t_x, t_y, t_z, s_x, s_y, s_z;
 
-	if(sd->randb_closure < 0.5f) {
-		pick_pdf_N = 0.5f;
-		pick_pdf_T = 0.25f;
-		pick_pdf_B = 0.25f;
-	}
-	else if(sd->randb_closure < 0.75f) {
-		float3 tmp = disk_N;
-		disk_N = disk_T;
-		disk_T = tmp;
-		pick_pdf_N = 0.25f;
-		pick_pdf_T = 0.5f;
-		pick_pdf_B = 0.25f;
-	}
-	else {
-		float3 tmp = disk_N;
-		disk_N = disk_B;
-		disk_B = tmp;
-		pick_pdf_N = 0.25f;
-		pick_pdf_T = 0.25f;
-		pick_pdf_B = 0.5f;
+	subsurface_random_walk_remap(albedo.x, radius.x, &t_x, &s_x);
+	subsurface_random_walk_remap(albedo.y, radius.y, &t_y, &s_y);
+	subsurface_random_walk_remap(albedo.z, radius.z, &t_z, &s_z);
+
+	*sigma_t = make_float3(t_x, t_y, t_z);
+	*sigma_s = make_float3(s_x, s_y, s_z);
+
+	/* Divide the albedo out so only closure mixing and Fresnel weights remain. */
+	*weight = safe_divide_color(bssrdf->weight, albedo);
+}
+
+/* Random walk from sd->P into the object until a surface of the same object
+ * is hit, Russian roulette terminates the walk, or BSSRDF_MAX_BOUNCES is
+ * reached. On a hit, stores the throughput in ss_isect->weight[0] and
+ * returns true; otherwise returns false. */
+ccl_device_noinline bool subsurface_random_walk(
+        KernelGlobals *kg, LocalIntersection *ss_isect, ShaderData *sd,
+        ccl_addr_space PathState *state, const ShaderClosure *sc,
+        const float bssrdf_u, const float bssrdf_v)
+{
+	/* Sample diffuse surface scatter into the object. */
+	float3 D;
+	float pdf;
+	sample_cos_hemisphere(-sd->N, bssrdf_u, bssrdf_v, &D, &pdf);
+	if(dot(-sd->Ng, D) <= 0.0f) {
+		return false;
 	}
 
-	/* sample point on disk */
-	float phi = M_2PI_F * disk_u;
-	float disk_r = disk_v;
-	float disk_height;
+	/* Convert subsurface to volume coefficients. */
+	float3 sigma_t, sigma_s;
+	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+	subsurface_random_walk_coefficients(sc, &sigma_t, &sigma_s, &throughput);
+
+	/* Setup ray. */
+#ifdef __SPLIT_KERNEL__
+	Ray ray_object = ss_isect->ray;
+	Ray *ray = &ray_object;
+#else
+	Ray *ray = &ss_isect->ray;
+#endif
+	ray->P = ray_offset(sd->P, -sd->Ng);
+	ray->D = D;
+	ray->t = FLT_MAX;
+	ray->time = sd->time;
 
-	bssrdf_sample(sc, disk_r, &disk_r, &disk_height);
+	/* Modify state for RNGs, decorrelated from other paths. */
+	uint prev_rng_offset = state->rng_offset;
+	uint prev_rng_hash = state->rng_hash;
+	state->rng_hash = cmj_hash(state->rng_hash + state->rng_offset, 0xdeadbeef);
 
-	float3 disk_P = (disk_r*cosf(phi)) * disk_T + (disk_r*sinf(phi)) * disk_B;
+	/* Random walk until we hit the surface again. */
+	bool hit = false;
 
-	/* create ray */
-	Ray ray;
-	ray.P = sd->P + disk_N*disk_height + disk_P;
-	ray.D = -disk_N;
-	ray.t = 2.0f*disk_height;
-	ray.dP = sd->dP;
-	ray.dD = differential3_zero();
-	ray.time = sd->time;
-
-	/* intersect with the same object. if multiple intersections are
-	 * found it will randomly pick one of them */
-	SubsurfaceIntersection ss_isect;
-	scene_intersect_subsurface(kg, &ray, &ss_isect, sd->object, lcg_state, 1);
-
-	/* evaluate bssrdf */
-	if(ss_isect.num_hits > 0) {
-		float3 origP = sd->P;
-
-		/* setup new shading point */
-		shader_setup_from_subsurface(kg, sd, &ss_isect.hits[0], &ray);
-
-		/* probability densities for local frame axes */
-		float pdf_N = pick_pdf_N * fabsf(dot(disk_N, sd->Ng));
-		float pdf_T = pick_pdf_T * fabsf(dot(disk_T, sd->Ng));
-		float pdf_B = pick_pdf_B * fabsf(dot(disk_B, sd->Ng));
-
-		/* multiple importance sample between 3 axes, power heuristic
-		 * found to be slightly better than balance heuristic */
-		float mis_weight = power_heuristic_3(pdf_N, pdf_T, pdf_B);
-
-		/* real distance to sampled point */
-		float r = len(sd->P - origP);
-
-		/* evaluate */
-		float w = (mis_weight * ss_isect.num_hits) / pdf_N;
-		eval = subsurface_scatter_eval(sd, sc, disk_r, r, all) * w;
+	for(int bounce = 0; bounce < BSSRDF_MAX_BOUNCES; bounce++) {
+		/* Advance random number offset. */
+		state->rng_offset += PRNG_BOUNCE_NUM;
+
+		if(bounce > 0) {
+			/* Sample scattering direction. */
+			const float anisotropy = 0.0f;
+			float scatter_u, scatter_v;
+			path_state_rng_2D(kg, state, PRNG_BSDF_U, &scatter_u, &scatter_v);
+			ray->D = henyey_greenstrein_sample(ray->D, anisotropy, scatter_u, scatter_v, NULL);
+		}
+
+		/* Sample color channel, use MIS with balance heuristic. */
+		float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
+		float3 albedo = safe_divide_color(sigma_s, sigma_t);
+		float3 channel_pdf;
+		int channel = kernel_volume_sample_channel(albedo, throughput, rphase, &channel_pdf);
+
+		/* Distance sampling. */
+		float rdist = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
+		float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
+		float t = -logf(1.0f - rdist)/sample_sigma_t;
+
+		ray->t = t;
+		scene_intersect_local(kg, *ray, ss_isect, sd->object, NULL, 1);
+		hit = (ss_isect->num_hits > 0);
+
+		if(hit) {
+			/* Compute world space distance to surface hit. */
+			float3 hit_D = ray->D;
+			object_inverse_dir_transform(kg, sd, &hit_D);
+			hit_D = normalize(hit_D) * ss_isect->hits[0].t;
+			object_dir_transform(kg, sd, &hit_D);
+			t = len(hit_D);
+		}
+
+		/* Advance to new scatter location. */
+		ray->P += t * ray->D;
+
+		/* Update throughput. */
+		float3 transmittance = volume_color_transmittance(sigma_t, t);
+		float combined_pdf = dot(channel_pdf, (hit)? transmittance: sigma_t * transmittance);
+		throughput *= ((hit)? transmittance: sigma_s * transmittance) / combined_pdf;
+
+		if(hit) {
+			/* If we hit the surface, we are done. */
+			break;
+		}
+
+		/* Russian roulette. */
+		float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
+		float probability = min(max3(fabs(throughput)), 1.0f);
+		if(terminate >= probability) {
+			break;
+		}
+		throughput /= probability;
 	}
 
-	/* optionally blur colors and bump mapping */
-	float3 N = sd->N;
-	subsurface_color_bump_blur(kg, sd, state, state_flag, &eval, &N);
+	kernel_assert(isfinite_safe(throughput.x) &&
+	              isfinite_safe(throughput.y) &&
+	              isfinite_safe(throughput.z));
 
-	/* setup diffuse bsdf */
-	subsurface_scatter_setup_diffuse_bsdf(sd, eval, (ss_isect.num_hits > 0), N);
+	state->rng_offset = prev_rng_offset;
+	state->rng_hash = prev_rng_hash;
+
+	/* Report whether a hit was stored in ss_isect (callers use it as a 0/1 hit count). */
+	if(!hit) {
+		return false;
+	}
+
+	/* TODO: gain back performance lost from merging with disk BSSRDF. We
+	 * only need to return on hit so this indirect ray push/pop overhead
+	 * is not actually needed, but it does keep the code simpler. */
+	ss_isect->weight[0] = throughput;
+#ifdef __SPLIT_KERNEL__
+	ss_isect->ray = *ray;
+#endif
+
+	return true;
+}
+
+/* Dispatch a subsurface scatter sample to the disk based or the random walk
+ * implementation, selected by the closure type of sc. The value returned by
+ * the chosen implementation is passed through unchanged as the number of
+ * hits available in ss_isect. */
+ccl_device_inline int subsurface_scatter_multi_intersect(
+        KernelGlobals *kg,
+        LocalIntersection *ss_isect,
+        ShaderData *sd,
+        ccl_addr_space PathState *state,
+        const ShaderClosure *sc,
+        uint *lcg_state,
+        float bssrdf_u,
+        float bssrdf_v,
+        bool all)
+{
+	const bool use_disk = CLOSURE_IS_DISK_BSSRDF(sc->type);
+	int num_hits;
+
+	if(!use_disk) {
+		/* Random walk: yields at most one hit; lcg_state and all are unused. */
+		num_hits = subsurface_random_walk(
+		        kg, ss_isect, sd, state, sc, bssrdf_u, bssrdf_v);
+	}
+	else {
+		/* Disk based sampling: the path state is not passed on here. */
+		num_hits = subsurface_scatter_disk(
+		        kg, ss_isect, sd, sc, lcg_state, bssrdf_u, bssrdf_v, all);
+	}
+
+	return num_hits;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index cb1a3f40dee..9047b93a0b2 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -15,184 +15,71 @@
  */
 
 #ifndef KERNEL_TEX
-#  define KERNEL_TEX(type, ttype, name)
-#endif
-
-#ifndef KERNEL_IMAGE_TEX
-#  define KERNEL_IMAGE_TEX(type, ttype, name)
+#  define KERNEL_TEX(type, name)
 #endif
 
 /* bvh */
-KERNEL_TEX(float4, texture_float4, __bvh_nodes)
-KERNEL_TEX(float4, texture_float4, __bvh_leaf_nodes)
-KERNEL_TEX(float4, texture_float4, __prim_tri_verts)
-KERNEL_TEX(uint, texture_uint, __prim_tri_index)
-KERNEL_TEX(uint, texture_uint, __prim_type)
-KERNEL_TEX(uint, texture_uint, __prim_visibility)
-KERNEL_TEX(uint, texture_uint, __prim_index)
-KERNEL_TEX(uint, texture_uint, __prim_object)
-KERNEL_TEX(uint, texture_uint, __object_node)
-KERNEL_TEX(float2, texture_float2, __prim_time)
+KERNEL_TEX(float4, __bvh_nodes)
+KERNEL_TEX(float4, __bvh_leaf_nodes)
+KERNEL_TEX(float4, __prim_tri_verts)
+KERNEL_TEX(uint, __prim_tri_index)
+KERNEL_TEX(uint, __prim_type)
+KERNEL_TEX(uint, __prim_visibility)
+KERNEL_TEX(uint, __prim_index)
+KERNEL_TEX(uint, __prim_object)
+KERNEL_TEX(uint, __object_node)
+KERNEL_TEX(float2, __prim_time)
 
 /* objects */
-KERNEL_TEX(float4, texture_float4, __objects)
-KERNEL_TEX(float4, texture_float4, __objects_vector)
+KERNEL_TEX(KernelObject, __objects)
+KERNEL_TEX(Transform, __object_motion_pass)
+KERNEL_TEX(DecomposedTransform, __object_motion)
+KERNEL_TEX(uint, __object_flag)
+
+/* cameras */
+KERNEL_TEX(DecomposedTransform, __camera_motion)
 
 /* triangles */
-KERNEL_TEX(uint, texture_uint, __tri_shader)
-KERNEL_TEX(float4, texture_float4, __tri_vnormal)
-KERNEL_TEX(uint4, texture_uint4, __tri_vindex)
-KERNEL_TEX(uint, texture_uint, __tri_patch)
-KERNEL_TEX(float2, texture_float2, __tri_patch_uv)
+KERNEL_TEX(uint, __tri_shader)
+KERNEL_TEX(float4, __tri_vnormal)
+KERNEL_TEX(uint4, __tri_vindex)
+KERNEL_TEX(uint, __tri_patch)
+KERNEL_TEX(float2, __tri_patch_uv)
 
 /* curves */
-KERNEL_TEX(float4, texture_float4, __curves)
-KERNEL_TEX(float4, texture_float4, __curve_keys)
+KERNEL_TEX(float4, __curves)
+KERNEL_TEX(float4, __curve_keys)
 
 /* patches */
-KERNEL_TEX(uint, texture_uint, __patches)
+KERNEL_TEX(uint, __patches)
 
 /* attributes */
-KERNEL_TEX(uint4, texture_uint4, __attributes_map)
-KERNEL_TEX(float, texture_float, __attributes_float)
-KERNEL_TEX(float4, texture_float4, __attributes_float3)
-KERNEL_TEX(uchar4, texture_uchar4, __attributes_uchar4)
+KERNEL_TEX(uint4, __attributes_map)
+KERNEL_TEX(float, __attributes_float)
+KERNEL_TEX(float4, __attributes_float3)
+KERNEL_TEX(uchar4, __attributes_uchar4)
 
 /* lights */
-KERNEL_TEX(float4, texture_float4, __light_distribution)
-KERNEL_TEX(float4, texture_float4, __light_data)
-KERNEL_TEX(float2, texture_float2, __light_background_marginal_cdf)
-KERNEL_TEX(float2, texture_float2, __light_background_conditional_cdf)
+KERNEL_TEX(KernelLightDistribution, __light_distribution)
+KERNEL_TEX(KernelLight, __lights)
+KERNEL_TEX(float2, __light_background_marginal_cdf)
+KERNEL_TEX(float2, __light_background_conditional_cdf)
 
 /* particles */
-KERNEL_TEX(float4, texture_float4, __particles)
+KERNEL_TEX(KernelParticle, __particles)
 
 /* shaders */
-KERNEL_TEX(uint4, texture_uint4, __svm_nodes)
-KERNEL_TEX(uint, texture_uint, __shader_flag)
-KERNEL_TEX(uint, texture_uint, __object_flag)
+KERNEL_TEX(uint4, __svm_nodes)
+KERNEL_TEX(KernelShader, __shaders)
 
 /* lookup tables */
-KERNEL_TEX(float, texture_float, __lookup_table)
+KERNEL_TEX(float, __lookup_table)
 
 /* sobol */
-KERNEL_TEX(uint, texture_uint, __sobol_directions)
-
-#ifdef __KERNEL_CUDA__
-#  if __CUDA_ARCH__ < 300
-/* full-float image */
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_001)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_002)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_003)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_004)
-
-KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_000)
-KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_001)
-KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_002)
-KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_003)
-KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_004)
-
-/* image */
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_005)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_006)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_007)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_008)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_009)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_010)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_011)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_012)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_013)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_014)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_015)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_016)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_017)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_018)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_019)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_020)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_021)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_022)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_023)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_024)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_025)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_026)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_027)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_028)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_029)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_030)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_031)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_032)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_033)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_034)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_035)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_036)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_037)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_038)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_039)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_040)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_041)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_042)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_043)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_044)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_045)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_046)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_047)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_048)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_049)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_050)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_051)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_052)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_053)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_054)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_055)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_056)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_057)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_058)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_059)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_060)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_061)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_062)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_063)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_064)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_065)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_066)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_067)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_068)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_069)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_070)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_071)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_072)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_073)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_074)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_075)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_076)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_077)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_078)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_079)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_080)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_081)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_082)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_083)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_084)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_085)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_086)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_087)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_088)
-
-#  else
-/* bindless textures */
-KERNEL_TEX(uint, texture_uint, __bindless_mapping)
-#  endif
-#endif
+KERNEL_TEX(uint, __sobol_directions)
 
-/* packed image (opencl) */
-KERNEL_TEX(uchar4, texture_uchar4, __tex_image_byte4_packed)
-KERNEL_TEX(float4, texture_float4, __tex_image_float4_packed)
-KERNEL_TEX(uchar, texture_uchar, __tex_image_byte_packed)
-KERNEL_TEX(float, texture_float, __tex_image_float_packed)
-KERNEL_TEX(uint4, texture_uint4, __tex_image_packed_info)
+/* image textures */
+KERNEL_TEX(TextureInfo, __texture_info)
 
 #undef KERNEL_TEX
-#undef KERNEL_IMAGE_TEX
-
 
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 8250eaf6073..72fbf7be557 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -17,9 +17,9 @@
 #ifndef __KERNEL_TYPES_H__
 #define __KERNEL_TYPES_H__
 
-#include "kernel_math.h"
-#include "svm/svm_types.h"
-#include "util_static_assert.h"
+#include "kernel/kernel_math.h"
+#include "kernel/svm/svm_types.h"
+#include "util/util_static_assert.h"
 
 #ifndef __KERNEL_GPU__
 #  define __KERNEL_CPU__
@@ -34,18 +34,18 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* constants */
-#define OBJECT_SIZE 		12
-#define OBJECT_VECTOR_SIZE	6
-#define LIGHT_SIZE		11
-#define FILTER_TABLE_SIZE	1024
-#define RAMP_TABLE_SIZE		256
-#define SHUTTER_TABLE_SIZE		256
-#define PARTICLE_SIZE 		5
-#define SHADER_SIZE		5
+/* Constants */
+#define OBJECT_MOTION_PASS_SIZE 2
+#define FILTER_TABLE_SIZE       1024
+#define RAMP_TABLE_SIZE         256
+#define SHUTTER_TABLE_SIZE      256
 
 #define BSSRDF_MIN_RADIUS			1e-8f
 #define BSSRDF_MAX_HITS				4
+#define BSSRDF_MAX_BOUNCES			256
+#define LOCAL_MAX_HITS				4
+
+#define VOLUME_BOUNDS_MAX       1024
 
 #define BECKMANN_TABLE_SIZE		256
 
@@ -56,7 +56,28 @@ CCL_NAMESPACE_BEGIN
 
 #define VOLUME_STACK_SIZE		16
 
-/* device capabilities */
+/* Split kernel constants */
+#define WORK_POOL_SIZE_GPU 64
+#define WORK_POOL_SIZE_CPU 1
+#ifdef __KERNEL_GPU__
+#  define WORK_POOL_SIZE WORK_POOL_SIZE_GPU
+#else
+#  define WORK_POOL_SIZE WORK_POOL_SIZE_CPU
+#endif
+
+
+#define SHADER_SORT_BLOCK_SIZE 2048
+
+#ifdef __KERNEL_OPENCL__
+#  define SHADER_SORT_LOCAL_SIZE 64
+#elif defined(__KERNEL_CUDA__)
+#  define SHADER_SORT_LOCAL_SIZE 32
+#else
+#  define SHADER_SORT_LOCAL_SIZE 1
+#endif
+
+
+/* Device capabilities */
 #ifdef __KERNEL_CPU__
 #  ifdef __KERNEL_SSE2__
 #    define __QBVH__
@@ -67,24 +88,28 @@ CCL_NAMESPACE_BEGIN
 #  ifdef WITH_OSL
 #    define __OSL__
 #  endif
+#  define __PRINCIPLED__
 #  define __SUBSURFACE__
 #  define __CMJ__
 #  define __VOLUME__
-#  define __VOLUME_DECOUPLED__
 #  define __VOLUME_SCATTER__
 #  define __SHADOW_RECORD_ALL__
+#  define __VOLUME_DECOUPLED__
 #  define __VOLUME_RECORD_ALL__
 #endif  /* __KERNEL_CPU__ */
 
 #ifdef __KERNEL_CUDA__
 #  define __KERNEL_SHADING__
 #  define __KERNEL_ADV_SHADING__
-#  define __BRANCHED_PATH__
 #  define __VOLUME__
 #  define __VOLUME_SCATTER__
 #  define __SUBSURFACE__
-#  define __CMJ__
+#  define __PRINCIPLED__
 #  define __SHADOW_RECORD_ALL__
+#  define __CMJ__
+#  ifndef __SPLIT_KERNEL__
+#    define __BRANCHED_PATH__
+#  endif
 #endif  /* __KERNEL_CUDA__ */
 
 #ifdef __KERNEL_OPENCL__
@@ -94,41 +119,50 @@ CCL_NAMESPACE_BEGIN
 #  ifdef __KERNEL_OPENCL_NVIDIA__
 #    define __KERNEL_SHADING__
 #    define __KERNEL_ADV_SHADING__
-#    ifdef __KERNEL_EXPERIMENTAL__
-#      define __CMJ__
-#    endif
+#    define __SUBSURFACE__
+#    define __PRINCIPLED__
+#    define __VOLUME__
+#    define __VOLUME_SCATTER__
+#    define __SHADOW_RECORD_ALL__
+#    define __CMJ__
+#    define __BRANCHED_PATH__
 #  endif  /* __KERNEL_OPENCL_NVIDIA__ */
 
 #  ifdef __KERNEL_OPENCL_APPLE__
 #    define __KERNEL_SHADING__
 #    define __KERNEL_ADV_SHADING__
+#    define __PRINCIPLED__
+#    define __CMJ__
 /* TODO(sergey): Currently experimental section is ignored here,
  * this is because megakernel in device_opencl does not support
  * custom cflags depending on the scene features.
  */
-#    ifdef __KERNEL_EXPERIMENTAL__
-#      define __CMJ__
-#    endif
-#  endif  /* __KERNEL_OPENCL_NVIDIA__ */
+#  endif  /* __KERNEL_OPENCL_APPLE__ */
 
 #  ifdef __KERNEL_OPENCL_AMD__
 #    define __CL_USE_NATIVE__
 #    define __KERNEL_SHADING__
 #    define __KERNEL_ADV_SHADING__
+#    define __SUBSURFACE__
+#    define __PRINCIPLED__
+#    define __VOLUME__
+#    define __VOLUME_SCATTER__
+#    define __SHADOW_RECORD_ALL__
+#    define __CMJ__
+#    define __BRANCHED_PATH__
 #  endif  /* __KERNEL_OPENCL_AMD__ */
 
 #  ifdef __KERNEL_OPENCL_INTEL_CPU__
 #    define __CL_USE_NATIVE__
 #    define __KERNEL_SHADING__
 #    define __KERNEL_ADV_SHADING__
-#    ifdef __KERNEL_EXPERIMENTAL__
-#      define __CMJ__
-#    endif
+#    define __PRINCIPLED__
+#    define __CMJ__
 #  endif  /* __KERNEL_OPENCL_INTEL_CPU__ */
 
 #endif  /* __KERNEL_OPENCL__ */
 
-/* kernel features */
+/* Kernel features */
 #define __SOBOL__
 #define __INSTANCING__
 #define __DPDU__
@@ -141,6 +175,9 @@ CCL_NAMESPACE_BEGIN
 #define __INTERSECTION_REFINE__
 #define __CLAMP_SAMPLE__
 #define __PATCH_EVAL__
+#define __SHADOW_TRICKS__
+#define __DENOISING_FEATURES__
+#define __SHADER_RAYTRACE__
 
 #ifdef __KERNEL_SHADING__
 #  define __SVM__
@@ -163,10 +200,6 @@ CCL_NAMESPACE_BEGIN
 #  define __BAKING__
 #endif
 
-#ifdef WITH_CYCLES_DEBUG
-#  define __KERNEL_DEBUG__
-#endif
-
 /* Scene-based selective features compilation. */
 #ifdef __NO_CAMERA_MOTION__
 #  undef __CAMERA_MOTION__
@@ -196,10 +229,27 @@ CCL_NAMESPACE_BEGIN
 #ifdef __NO_TRANSPARENT__
 #  undef __TRANSPARENT_SHADOWS__
 #endif
+#ifdef __NO_SHADOW_TRICKS__
+#  undef __SHADOW_TRICKS__
+#endif
+#ifdef __NO_PRINCIPLED__
+#  undef __PRINCIPLED__
+#endif
+#ifdef __NO_DENOISING__
+#  undef __DENOISING_FEATURES__
+#endif
+#ifdef __NO_SHADER_RAYTRACE__
+#  undef __SHADER_RAYTRACE__
+#endif
 
-/* Random Numbers */
+/* Features that enable others */
+#ifdef WITH_CYCLES_DEBUG
+#  define __KERNEL_DEBUG__
+#endif
 
-typedef uint RNG;
+#if defined(__SUBSURFACE__) || defined(__SHADER_RAYTRACE__)
+#  define __BVH_LOCAL__
+#endif
 
 /* Shader Evaluation */
 
@@ -213,6 +263,7 @@ typedef enum ShaderEvalType {
 	/* data passes */
 	SHADER_EVAL_NORMAL,
 	SHADER_EVAL_UV,
+	SHADER_EVAL_ROUGHNESS,
 	SHADER_EVAL_DIFFUSE_COLOR,
 	SHADER_EVAL_GLOSSY_COLOR,
 	SHADER_EVAL_TRANSMISSION_COLOR,
@@ -240,31 +291,24 @@ enum PathTraceDimension {
 	PRNG_FILTER_V = 1,
 	PRNG_LENS_U = 2,
 	PRNG_LENS_V = 3,
-#ifdef __CAMERA_MOTION__
 	PRNG_TIME = 4,
 	PRNG_UNUSED_0 = 5,
 	PRNG_UNUSED_1 = 6,	/* for some reason (6, 7) is a bad sobol pattern */
 	PRNG_UNUSED_2 = 7,  /* with a low number of samples (< 64) */
-#endif
-	PRNG_BASE_NUM = 8,
+	PRNG_BASE_NUM = 10,
 
 	PRNG_BSDF_U = 0,
 	PRNG_BSDF_V = 1,
-	PRNG_BSDF = 2,
-	PRNG_LIGHT = 3,
-	PRNG_LIGHT_U = 4,
-	PRNG_LIGHT_V = 5,
-	PRNG_LIGHT_TERMINATE = 6,
-	PRNG_TERMINATE = 7,
-
-#ifdef __VOLUME__
-	PRNG_PHASE_U = 8,
-	PRNG_PHASE_V = 9,
-	PRNG_PHASE = 10,
-	PRNG_SCATTER_DISTANCE = 11,
-#endif
-
-	PRNG_BOUNCE_NUM = 12,
+	PRNG_LIGHT_U = 2,
+	PRNG_LIGHT_V = 3,
+	PRNG_LIGHT_TERMINATE = 4,
+	PRNG_TERMINATE = 5,
+	PRNG_PHASE_CHANNEL = 6,
+	PRNG_SCATTER_DISTANCE = 7,
+	PRNG_BOUNCE_NUM = 8,
+
+	PRNG_BEVEL_U = 6, /* reuse volume dimension, correlation won't harm */
+	PRNG_BEVEL_V = 7,
 };
 
 enum SamplingPattern {
@@ -277,29 +321,56 @@ enum SamplingPattern {
 /* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */
 
 enum PathRayFlag {
-	PATH_RAY_CAMERA = 1,
-	PATH_RAY_REFLECT = 2,
-	PATH_RAY_TRANSMIT = 4,
-	PATH_RAY_DIFFUSE = 8,
-	PATH_RAY_GLOSSY = 16,
-	PATH_RAY_SINGULAR = 32,
-	PATH_RAY_TRANSPARENT = 64,
-
-	PATH_RAY_SHADOW_OPAQUE = 128,
-	PATH_RAY_SHADOW_TRANSPARENT = 256,
-	PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
-
-	PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */
-	PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */
+	PATH_RAY_CAMERA              = (1 << 0),
+	PATH_RAY_REFLECT             = (1 << 1),
+	PATH_RAY_TRANSMIT            = (1 << 2),
+	PATH_RAY_DIFFUSE             = (1 << 3),
+	PATH_RAY_GLOSSY              = (1 << 4),
+	PATH_RAY_SINGULAR            = (1 << 5),
+	PATH_RAY_TRANSPARENT         = (1 << 6),
+
+	PATH_RAY_SHADOW_OPAQUE_NON_CATCHER       = (1 << 7),
+	PATH_RAY_SHADOW_OPAQUE_CATCHER           = (1 << 8),
+	PATH_RAY_SHADOW_OPAQUE                   = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER|PATH_RAY_SHADOW_OPAQUE_CATCHER),
+	PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER  = (1 << 9),
+	PATH_RAY_SHADOW_TRANSPARENT_CATCHER      = (1 << 10),
+	PATH_RAY_SHADOW_TRANSPARENT              = (PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER|PATH_RAY_SHADOW_TRANSPARENT_CATCHER),
+	PATH_RAY_SHADOW_NON_CATCHER              = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER|PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER),
+	PATH_RAY_SHADOW                          = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
+
+	PATH_RAY_CURVE               = (1 << 11), /* visibility flag to define curve segments */
+	PATH_RAY_VOLUME_SCATTER      = (1 << 12), /* volume scattering */
 
 	/* Special flag to tag unaligned BVH nodes. */
-	PATH_RAY_NODE_UNALIGNED = 2048,
-
-	PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048),
-
-	PATH_RAY_MIS_SKIP = 4096,
-	PATH_RAY_DIFFUSE_ANCESTOR = 8192,
-	PATH_RAY_SINGLE_PASS_DONE = 16384,
+	PATH_RAY_NODE_UNALIGNED = (1 << 13),
+
+	PATH_RAY_ALL_VISIBILITY = ((1 << 14)-1),
+
+	/* Don't apply multiple importance sampling weights to emission from
+	 * lamp or surface hits, because they were not direct light sampled. */
+	PATH_RAY_MIS_SKIP                    = (1 << 14),
+	/* Diffuse bounce earlier in the path, skip SSS to improve performance
+	 * and avoid branching twice with disk sampling SSS. */
+	PATH_RAY_DIFFUSE_ANCESTOR            = (1 << 15),
+	/* Single pass has been written. */
+	PATH_RAY_SINGLE_PASS_DONE            = (1 << 16),
+	/* Ray is behind a shadow catcher. */
+	PATH_RAY_SHADOW_CATCHER              = (1 << 17),
+	/* Store shadow data for shadow catcher or denoising. */
+	PATH_RAY_STORE_SHADOW_INFO           = (1 << 18),
+	/* Zero background alpha, for camera or transparent glass rays. */
+	PATH_RAY_TRANSPARENT_BACKGROUND      = (1 << 19),
+	/* Terminate ray immediately at next bounce. */
+	PATH_RAY_TERMINATE_IMMEDIATE         = (1 << 20),
+	/* Ray is to be terminated, but continue with transparent bounces and
+	 * emission as long as we encounter them. This is required to make the
+	 * MIS between direct and indirect light rays match, as shadow rays go
+	 * through transparent surfaces to reach emission too. */
+	PATH_RAY_TERMINATE_AFTER_TRANSPARENT = (1 << 21),
+	/* Ray is to be terminated. */
+	PATH_RAY_TERMINATE                   = (PATH_RAY_TERMINATE_IMMEDIATE|PATH_RAY_TERMINATE_AFTER_TRANSPARENT),
+	/* Path and shader is being evaluated for direct lighting emission. */
+	PATH_RAY_EMISSION                    = (1 << 22)
 };
 
 /* Closure Label */
@@ -313,49 +384,82 @@ typedef enum ClosureLabel {
 	LABEL_SINGULAR = 16,
 	LABEL_TRANSPARENT = 32,
 	LABEL_VOLUME_SCATTER = 64,
+	LABEL_TRANSMIT_TRANSPARENT = 128,
 } ClosureLabel;
 
 /* Render Passes */
 
+#define PASS_NAME_JOIN(a, b) a ## _ ## b
+#define PASSMASK(pass) (1 << ((PASS_NAME_JOIN(PASS, pass)) % 32))
+
+#define PASSMASK_COMPONENT(comp) (PASSMASK(PASS_NAME_JOIN(comp, DIRECT)) |   \
+                                  PASSMASK(PASS_NAME_JOIN(comp, INDIRECT)) | \
+                                  PASSMASK(PASS_NAME_JOIN(comp, COLOR)))
+
 typedef enum PassType {
 	PASS_NONE = 0,
-	PASS_COMBINED = (1 << 0),
-	PASS_DEPTH = (1 << 1),
-	PASS_NORMAL = (1 << 2),
-	PASS_UV = (1 << 3),
-	PASS_OBJECT_ID = (1 << 4),
-	PASS_MATERIAL_ID = (1 << 5),
-	PASS_DIFFUSE_COLOR = (1 << 6),
-	PASS_GLOSSY_COLOR = (1 << 7),
-	PASS_TRANSMISSION_COLOR = (1 << 8),
-	PASS_DIFFUSE_INDIRECT = (1 << 9),
-	PASS_GLOSSY_INDIRECT = (1 << 10),
-	PASS_TRANSMISSION_INDIRECT = (1 << 11),
-	PASS_DIFFUSE_DIRECT = (1 << 12),
-	PASS_GLOSSY_DIRECT = (1 << 13),
-	PASS_TRANSMISSION_DIRECT = (1 << 14),
-	PASS_EMISSION = (1 << 15),
-	PASS_BACKGROUND = (1 << 16),
-	PASS_AO = (1 << 17),
-	PASS_SHADOW = (1 << 18),
-	PASS_MOTION = (1 << 19),
-	PASS_MOTION_WEIGHT = (1 << 20),
-	PASS_MIST = (1 << 21),
-	PASS_SUBSURFACE_DIRECT = (1 << 22),
-	PASS_SUBSURFACE_INDIRECT = (1 << 23),
-	PASS_SUBSURFACE_COLOR = (1 << 24),
-	PASS_LIGHT = (1 << 25), /* no real pass, used to force use_light_pass */
+
+	/* Main passes */
+	PASS_COMBINED = 1,
+	PASS_DEPTH,
+	PASS_NORMAL,
+	PASS_UV,
+	PASS_OBJECT_ID,
+	PASS_MATERIAL_ID,
+	PASS_MOTION,
+	PASS_MOTION_WEIGHT,
 #ifdef __KERNEL_DEBUG__
-	PASS_BVH_TRAVERSED_NODES = (1 << 26),
-	PASS_BVH_TRAVERSED_INSTANCES = (1 << 27),
-	PASS_BVH_INTERSECTIONS = (1 << 28),
-	PASS_RAY_BOUNCES = (1 << 29),
+	PASS_BVH_TRAVERSED_NODES,
+	PASS_BVH_TRAVERSED_INSTANCES,
+	PASS_BVH_INTERSECTIONS,
+	PASS_RAY_BOUNCES,
 #endif
+	PASS_RENDER_TIME,
+	PASS_CATEGORY_MAIN_END = 31,
+
+	PASS_MIST = 32,
+	PASS_EMISSION,
+	PASS_BACKGROUND,
+	PASS_AO,
+	PASS_SHADOW,
+	PASS_LIGHT, /* no real pass, used to force use_light_pass */
+	PASS_DIFFUSE_DIRECT,
+	PASS_DIFFUSE_INDIRECT,
+	PASS_DIFFUSE_COLOR,
+	PASS_GLOSSY_DIRECT,
+	PASS_GLOSSY_INDIRECT,
+	PASS_GLOSSY_COLOR,
+	PASS_TRANSMISSION_DIRECT,
+	PASS_TRANSMISSION_INDIRECT,
+	PASS_TRANSMISSION_COLOR,
+	PASS_SUBSURFACE_DIRECT,
+	PASS_SUBSURFACE_INDIRECT,
+	PASS_SUBSURFACE_COLOR,
+	PASS_VOLUME_DIRECT,
+	PASS_VOLUME_INDIRECT,
+	/* No Scatter color since it's tricky to define what it would even mean. */
+	PASS_CATEGORY_LIGHT_END = 63,
 } PassType;
 
-#define PASS_ALL (~0)
-
-typedef enum BakePassFilter {
+#define PASS_ANY (~0)
+
+typedef enum DenoisingPassOffsets {
+	DENOISING_PASS_NORMAL             = 0,
+	DENOISING_PASS_NORMAL_VAR         = 3,
+	DENOISING_PASS_ALBEDO             = 6,
+	DENOISING_PASS_ALBEDO_VAR         = 9,
+	DENOISING_PASS_DEPTH              = 12,
+	DENOISING_PASS_DEPTH_VAR          = 13,
+	DENOISING_PASS_SHADOW_A           = 14,
+	DENOISING_PASS_SHADOW_B           = 17,
+	DENOISING_PASS_COLOR              = 20,
+	DENOISING_PASS_COLOR_VAR          = 23,
+
+	DENOISING_PASS_SIZE_BASE          = 26,
+	DENOISING_PASS_SIZE_CLEAN         = 3,
+} DenoisingPassOffsets;
+
+typedef enum eBakePassFilter {
 	BAKE_FILTER_NONE = 0,
 	BAKE_FILTER_DIRECT = (1 << 0),
 	BAKE_FILTER_INDIRECT = (1 << 1),
@@ -366,7 +470,7 @@ typedef enum BakePassFilter {
 	BAKE_FILTER_SUBSURFACE = (1 << 6),
 	BAKE_FILTER_EMISSION = (1 << 7),
 	BAKE_FILTER_AO = (1 << 8),
-} BakePassFilter;
+} eBakePassFilter;
 
 typedef enum BakePassFilterCombos {
 	BAKE_FILTER_COMBINED = (
@@ -388,25 +492,60 @@ typedef enum BakePassFilterCombos {
 	BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE),
 } BakePassFilterCombos;
 
+typedef enum DenoiseFlag {
+	DENOISING_CLEAN_DIFFUSE_DIR      = (1 << 0),
+	DENOISING_CLEAN_DIFFUSE_IND      = (1 << 1),
+	DENOISING_CLEAN_GLOSSY_DIR       = (1 << 2),
+	DENOISING_CLEAN_GLOSSY_IND       = (1 << 3),
+	DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4),
+	DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5),
+	DENOISING_CLEAN_SUBSURFACE_DIR   = (1 << 6),
+	DENOISING_CLEAN_SUBSURFACE_IND   = (1 << 7),
+	DENOISING_CLEAN_ALL_PASSES       = (1 << 8)-1,
+} DenoiseFlag;
+
+#ifdef __KERNEL_DEBUG__
+/* NOTE: This is a runtime-only struct, alignment is not
+ * really important here.
+ */
+typedef struct DebugData {
+	int num_bvh_traversed_nodes;
+	int num_bvh_traversed_instances;
+	int num_bvh_intersections;
+	int num_ray_bounces;
+} DebugData;
+#endif
+
+typedef ccl_addr_space struct PathRadianceState {
+#ifdef __PASSES__
+	float3 diffuse;
+	float3 glossy;
+	float3 transmission;
+	float3 subsurface;
+	float3 scatter;
+
+	float3 direct;
+#endif
+} PathRadianceState;
+
 typedef ccl_addr_space struct PathRadiance {
 #ifdef __PASSES__
 	int use_light_pass;
 #endif
 
+	float transparent;
 	float3 emission;
 #ifdef __PASSES__
 	float3 background;
 	float3 ao;
 
 	float3 indirect;
-	float3 direct_throughput;
 	float3 direct_emission;
 
 	float3 color_diffuse;
 	float3 color_glossy;
 	float3 color_transmission;
 	float3 color_subsurface;
-	float3 color_scatter;
 
 	float3 direct_diffuse;
 	float3 direct_glossy;
@@ -420,15 +559,46 @@ typedef ccl_addr_space struct PathRadiance {
 	float3 indirect_subsurface;
 	float3 indirect_scatter;
 
-	float3 path_diffuse;
-	float3 path_glossy;
-	float3 path_transmission;
-	float3 path_subsurface;
-	float3 path_scatter;
-
 	float4 shadow;
 	float mist;
 #endif
+
+	struct PathRadianceState state;
+
+#ifdef __SHADOW_TRICKS__
+	/* Total light reachable across the path, ignoring shadow blocked queries. */
+	float3 path_total;
+	/* Total light reachable across the path with shadow blocked queries
+	 * applied here.
+	 *
+	 * Dividing this figure by path_total gives an estimate of the shadow pass.
+	 */
+	float3 path_total_shaded;
+
+	/* Color of the background on which shadow is alpha-overed. */
+	float3 shadow_background_color;
+
+	/* Path radiance sum and throughput at the moment when ray hits shadow
+	 * catcher object.
+	 */
+	float shadow_throughput;
+
+	/* Accumulated transparency along the path after shadow catcher bounce. */
+	float shadow_transparency;
+
+	/* Indicate if any shadow catcher data is set. */
+	int has_shadow_catcher;
+#endif
+
+#ifdef __DENOISING_FEATURES__
+	float3 denoising_normal;
+	float3 denoising_albedo;
+	float denoising_depth;
+#endif  /* __DENOISING_FEATURES__ */
+
+#ifdef __KERNEL_DEBUG__
+	DebugData debug_data;
+#endif /* __KERNEL_DEBUG__ */
 } PathRadiance;
 
 typedef struct BsdfEval {
@@ -444,6 +614,9 @@ typedef struct BsdfEval {
 	float3 subsurface;
 	float3 scatter;
 #endif
+#ifdef __SHADOW_TRICKS__
+	float3 sum_no_mis;
+#endif
 } BsdfEval;
 
 /* Shader Flag */
@@ -537,7 +710,7 @@ typedef struct Ray {
 
 /* Intersection */
 
-typedef ccl_addr_space struct Intersection {
+typedef struct Intersection {
 	float t, u, v;
 	int prim;
 	int object;
@@ -617,12 +790,14 @@ typedef enum AttributeStandard {
 	ATTR_STD_MOTION_VERTEX_NORMAL,
 	ATTR_STD_PARTICLE,
 	ATTR_STD_CURVE_INTERCEPT,
+	ATTR_STD_CURVE_RANDOM,
 	ATTR_STD_PTEX_FACE_ID,
 	ATTR_STD_PTEX_UV,
 	ATTR_STD_VOLUME_DENSITY,
 	ATTR_STD_VOLUME_COLOR,
 	ATTR_STD_VOLUME_FLAME,
 	ATTR_STD_VOLUME_HEAT,
+	ATTR_STD_VOLUME_TEMPERATURE,
 	ATTR_STD_VOLUME_VELOCITY,
 	ATTR_STD_POINTINESS,
 	ATTR_STD_NUM,
@@ -645,10 +820,14 @@ typedef struct AttributeDescriptor {
 /* Closure data */
 
 #ifdef __MULTI_CLOSURE__
-#  ifndef __MAX_CLOSURE__
-#     define MAX_CLOSURE 64
+#  ifdef __SPLIT_KERNEL__
+#    define MAX_CLOSURE 1
 #  else
-#    define MAX_CLOSURE __MAX_CLOSURE__
+#    ifndef __MAX_CLOSURE__
+#       define MAX_CLOSURE 64
+#    else
+#      define MAX_CLOSURE __MAX_CLOSURE__
+#    endif
 #  endif
 #else
 #  define MAX_CLOSURE 1
@@ -668,28 +847,15 @@ typedef struct AttributeDescriptor {
 #define SHADER_CLOSURE_BASE \
 	float3 weight; \
 	ClosureType type; \
-	float sample_weight \
+	float sample_weight; \
+	float3 N
 
 typedef ccl_addr_space struct ccl_align(16) ShaderClosure {
 	SHADER_CLOSURE_BASE;
 
-	float data[14]; /* pad to 80 bytes */
+	float data[10]; /* pad to 80 bytes */
 } ShaderClosure;
 
-/* Shader Context
- *
- * For OSL we recycle a fixed number of contexts for speed */
-
-typedef enum ShaderContext {
-	SHADER_CONTEXT_MAIN = 0,
-	SHADER_CONTEXT_INDIRECT = 1,
-	SHADER_CONTEXT_EMISSION = 2,
-	SHADER_CONTEXT_SHADOW = 3,
-	SHADER_CONTEXT_SSS = 4,
-	SHADER_CONTEXT_VOLUME = 5,
-	SHADER_CONTEXT_NUM = 6
-} ShaderContext;
-
 /* Shader Data
  *
  * Main shader state at a point on the surface or in a volume. All coordinates
@@ -701,7 +867,7 @@ enum ShaderDataFlag {
 
 	/* Set when ray hits backside of surface. */
 	SD_BACKFACING      = (1 << 0),
-	/* Shader has emissive closure. */
+	/* Shader has non-zero emission. */
 	SD_EMISSION        = (1 << 1),
 	/* Shader has BSDF closure. */
 	SD_BSDF            = (1 << 2),
@@ -711,8 +877,8 @@ enum ShaderDataFlag {
 	SD_BSSRDF          = (1 << 4),
 	/* Shader has holdout closure. */
 	SD_HOLDOUT         = (1 << 5),
-	/* Shader has volume absorption closure. */
-	SD_ABSORPTION      = (1 << 6),
+	/* Shader has non-zero volume extinction. */
+	SD_EXTINCTION      = (1 << 6),
 	/* Shader has have volume phase (scatter) closure. */
 	SD_SCATTER         = (1 << 7),
 	/* Shader has AO closure. */
@@ -727,7 +893,7 @@ enum ShaderDataFlag {
 	                    SD_BSDF_HAS_EVAL |
 	                    SD_BSSRDF |
 	                    SD_HOLDOUT |
-	                    SD_ABSORPTION |
+	                    SD_EXTINCTION |
 	                    SD_SCATTER |
 	                    SD_AO |
 	                    SD_BSDF_NEEDS_LCG),
@@ -752,25 +918,28 @@ enum ShaderDataFlag {
 	SD_VOLUME_MIS             = (1 << 23),
 	/* Use cubic interpolation for voxels. */
 	SD_VOLUME_CUBIC           = (1 << 24),
-	/* Has data connected to the displacement input. */
+	/* Has data connected to the displacement input or uses bump map. */
 	SD_HAS_BUMP               = (1 << 25),
 	/* Has true displacement. */
 	SD_HAS_DISPLACEMENT       = (1 << 26),
-	/* Has constant emission (value stored in __shader_flag) */
+	/* Has constant emission (value stored in __shaders) */
 	SD_HAS_CONSTANT_EMISSION  = (1 << 27),
+	/* Needs to access attributes */
+	SD_NEED_ATTRIBUTES        = (1 << 28),
 
 	SD_SHADER_FLAGS = (SD_USE_MIS |
 	                   SD_HAS_TRANSPARENT_SHADOW |
 	                   SD_HAS_VOLUME |
 	                   SD_HAS_ONLY_VOLUME |
-	                   SD_HETEROGENEOUS_VOLUME|
+	                   SD_HETEROGENEOUS_VOLUME |
 	                   SD_HAS_BSSRDF_BUMP |
 	                   SD_VOLUME_EQUIANGULAR |
 	                   SD_VOLUME_MIS |
 	                   SD_VOLUME_CUBIC |
 	                   SD_HAS_BUMP |
 	                   SD_HAS_DISPLACEMENT |
-	                   SD_HAS_CONSTANT_EMISSION)
+	                   SD_HAS_CONSTANT_EMISSION |
+	                   SD_NEED_ATTRIBUTES)
 };
 
 	/* Object flags. */
@@ -789,115 +958,113 @@ enum ShaderDataObjectFlag {
 	SD_OBJECT_INTERSECTS_VOLUME      = (1 << 5),
 	/* Has position for motion vertices. */
 	SD_OBJECT_HAS_VERTEX_MOTION      = (1 << 6),
+	/* object is used to catch shadows */
+	SD_OBJECT_SHADOW_CATCHER         = (1 << 7),
+	/* object has volume attributes */
+	SD_OBJECT_HAS_VOLUME_ATTRIBUTES  = (1 << 8),
 
 	SD_OBJECT_FLAGS = (SD_OBJECT_HOLDOUT_MASK |
 	                   SD_OBJECT_MOTION |
 	                   SD_OBJECT_TRANSFORM_APPLIED |
 	                   SD_OBJECT_NEGATIVE_SCALE_APPLIED |
 	                   SD_OBJECT_HAS_VOLUME |
-	                   SD_OBJECT_INTERSECTS_VOLUME)
+	                   SD_OBJECT_INTERSECTS_VOLUME |
+	                   SD_OBJECT_SHADOW_CATCHER |
+	                   SD_OBJECT_HAS_VOLUME_ATTRIBUTES)
 };
 
-#ifdef __SPLIT_KERNEL__
-#  define SD_THREAD (get_global_id(1) * get_global_size(0) + get_global_id(0))
-#  if !defined(__SPLIT_KERNEL_SOA__)
-     /* ShaderData is stored as an Array-of-Structures */
-#    define ccl_soa_member(type, name) type soa_##name
-#    define ccl_fetch(s, t) (s[SD_THREAD].soa_##t)
-#    define ccl_fetch_array(s, t, index) (&s[SD_THREAD].soa_##t[index])
-#  else
-     /* ShaderData is stored as an Structure-of-Arrays */
-#    define SD_GLOBAL_SIZE (get_global_size(0) * get_global_size(1))
-#    define SD_FIELD_SIZE(t) sizeof(((struct ShaderData*)0)->t)
-#    define SD_OFFSETOF(t) ((char*)(&((struct ShaderData*)0)->t) - (char*)0)
-#    define ccl_soa_member(type, name) type soa_##name
-#    define ccl_fetch(s, t) (((ShaderData*)((ccl_addr_space char*)s + SD_GLOBAL_SIZE * SD_OFFSETOF(soa_##t) +  SD_FIELD_SIZE(soa_##t) * SD_THREAD - SD_OFFSETOF(soa_##t)))->soa_##t)
-#    define ccl_fetch_array(s, t, index) (&ccl_fetch(s, t)[index])
-#  endif
-#else
-#  define ccl_soa_member(type, name) type name
-#  define ccl_fetch(s, t) (s->t)
-#  define ccl_fetch_array(s, t, index) (&s->t[index])
-#endif
-
 typedef ccl_addr_space struct ShaderData {
 	/* position */
-	ccl_soa_member(float3, P);
+	float3 P;
 	/* smooth normal for shading */
-	ccl_soa_member(float3, N);
+	float3 N;
 	/* true geometric normal */
-	ccl_soa_member(float3, Ng);
+	float3 Ng;
 	/* view/incoming direction */
-	ccl_soa_member(float3, I);
+	float3 I;
 	/* shader id */
-	ccl_soa_member(int, shader);
+	int shader;
 	/* booleans describing shader, see ShaderDataFlag */
-	ccl_soa_member(int, flag);
+	int flag;
 	/* booleans describing object of the shader, see ShaderDataObjectFlag */
-	ccl_soa_member(int, object_flag);
+	int object_flag;
 
 	/* primitive id if there is one, ~0 otherwise */
-	ccl_soa_member(int, prim);
+	int prim;
 
 	/* combined type and curve segment for hair */
-	ccl_soa_member(int, type);
+	int type;
 
 	/* parametric coordinates
 	 * - barycentric weights for triangles */
-	ccl_soa_member(float, u);
-	ccl_soa_member(float, v);
+	float u;
+	float v;
 	/* object id if there is one, ~0 otherwise */
-	ccl_soa_member(int, object);
+	int object;
+	/* lamp id if there is one, ~0 otherwise */
+	int lamp;
 
 	/* motion blur sample time */
-	ccl_soa_member(float, time);
+	float time;
 
 	/* length of the ray being shaded */
-	ccl_soa_member(float, ray_length);
+	float ray_length;
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* differential of P. these are orthogonal to Ng, not N */
-	ccl_soa_member(differential3, dP);
+	differential3 dP;
 	/* differential of I */
-	ccl_soa_member(differential3, dI);
+	differential3 dI;
 	/* differential of u, v */
-	ccl_soa_member(differential, du);
-	ccl_soa_member(differential, dv);
+	differential du;
+	differential dv;
 #endif
 #ifdef __DPDU__
 	/* differential of P w.r.t. parametric coordinates. note that dPdu is
 	 * not readily suitable as a tangent for shading on triangles. */
-	ccl_soa_member(float3, dPdu);
-	ccl_soa_member(float3, dPdv);
+	float3 dPdu;
+	float3 dPdv;
 #endif
 
 #ifdef __OBJECT_MOTION__
 	/* object <-> world space transformations, cached to avoid
 	 * re-interpolating them constantly for shading */
-	ccl_soa_member(Transform, ob_tfm);
-	ccl_soa_member(Transform, ob_itfm);
+	Transform ob_tfm;
+	Transform ob_itfm;
 #endif
 
-	/* Closure data, we store a fixed array of closures */
-	ccl_soa_member(struct ShaderClosure, closure[MAX_CLOSURE]);
-	ccl_soa_member(int, num_closure);
-	ccl_soa_member(int, num_closure_extra);
-	ccl_soa_member(float, randb_closure);
-	ccl_soa_member(float3, svm_closure_weight);
-
-	/* LCG state for closures that require additional random numbers. */
-	ccl_soa_member(uint, lcg_state);
-
 	/* ray start position, only set for backgrounds */
-	ccl_soa_member(float3, ray_P);
-	ccl_soa_member(differential3, ray_dP);
+	float3 ray_P;
+	differential3 ray_dP;
 
 #ifdef __OSL__
 	struct KernelGlobals *osl_globals;
 	struct PathState *osl_path_state;
 #endif
+
+	/* LCG state for closures that require additional random numbers. */
+	uint lcg_state;
+
+	/* Closure data, we store a fixed array of closures */
+	int num_closure;
+	int num_closure_left;
+	float randb_closure;
+	float3 svm_closure_weight;
+
+	/* Closure weights summed directly, so we can evaluate
+	 * emission and shadow transparency with MAX_CLOSURE 0. */
+	float3 closure_emission_background;
+	float3 closure_transparent_extinction;
+
+	/* At the end so we can adjust size in ShaderDataTinyStorage. */
+	struct ShaderClosure closure[MAX_CLOSURE];
 } ShaderData;
 
+typedef ccl_addr_space struct ShaderDataTinyStorage {
+	char pad[sizeof(ShaderData) - sizeof(ShaderClosure) * MAX_CLOSURE];
+} ShaderDataTinyStorage;
+#define AS_SHADER_DATA(shader_data_tiny_storage) ((ShaderData*)shader_data_tiny_storage)
+
 /* Path State */
 
 #ifdef __VOLUME__
@@ -912,9 +1079,11 @@ typedef struct PathState {
 	int flag;
 
 	/* random number generator state */
-	int rng_offset;    		/* dimension offset */
-	int sample;        		/* path sample number */
-	int num_samples;		/* total number of times this path will be sampled */
+	uint rng_hash;          /* per pixel hash */
+	int rng_offset;         /* dimension offset */
+	int sample;             /* path sample number */
+	int num_samples;        /* total number of times this path will be sampled */
+	float branch_factor;    /* number of branches in indirect paths */
 
 	/* bounce counting */
 	int bounce;
@@ -923,6 +1092,10 @@ typedef struct PathState {
 	int transmission_bounce;
 	int transparent_bounce;
 
+#ifdef __DENOISING_FEATURES__
+	float denoising_feature_weight;
+#endif  /* __DENOISING_FEATURES__ */
+
 	/* multiple importance sampling */
 	float min_ray_pdf; /* smallest bounce pdf over entire path up to now */
 	float ray_pdf;     /* last bounce pdf */
@@ -933,37 +1106,34 @@ typedef struct PathState {
 	/* volume rendering */
 #ifdef __VOLUME__
 	int volume_bounce;
-	RNG rng_congruential;
+	int volume_bounds_bounce;
 	VolumeStack volume_stack[VOLUME_STACK_SIZE];
 #endif
 } PathState;
 
-/* Subsurface */
-
-/* Struct to gather multiple SSS hits. */
-struct SubsurfaceIntersection
-{
+/* Struct to gather multiple nearby intersections. */
+typedef struct LocalIntersection {
 	Ray ray;
-	float3 weight[BSSRDF_MAX_HITS];
+	float3 weight[LOCAL_MAX_HITS];
 
 	int num_hits;
-	struct Intersection hits[BSSRDF_MAX_HITS];
-	float3 Ng[BSSRDF_MAX_HITS];
-};
+	struct Intersection hits[LOCAL_MAX_HITS];
+	float3 Ng[LOCAL_MAX_HITS];
+} LocalIntersection;
+
+/* Subsurface */
 
 /* Struct to gather SSS indirect rays and delay tracing them. */
-struct SubsurfaceIndirectRays
-{
-	bool need_update_volume_stack;
-	bool tracing;
+typedef struct SubsurfaceIndirectRays {
 	PathState state[BSSRDF_MAX_HITS];
-	struct PathRadiance direct_L;
 
 	int num_rays;
+
 	struct Ray rays[BSSRDF_MAX_HITS];
 	float3 throughputs[BSSRDF_MAX_HITS];
-	struct PathRadiance L[BSSRDF_MAX_HITS];
-};
+	struct PathRadianceState L_state[BSSRDF_MAX_HITS];
+} SubsurfaceIndirectRays;
+static_assert(BSSRDF_MAX_HITS <= LOCAL_MAX_HITS, "BSSRDF hits too high.");
 
 /* Constant Kernel Data
  *
@@ -989,7 +1159,7 @@ typedef struct KernelCamera {
 
 	/* matrices */
 	Transform cameratoworld;
-	Transform rastertocamera;
+	ProjectionTransform rastertocamera;
 
 	/* differentials */
 	float4 dx;
@@ -1003,7 +1173,7 @@ typedef struct KernelCamera {
 
 	/* motion blur */
 	float shuttertime;
-	int have_motion, have_perspective_motion;
+	int num_motion_steps, have_perspective_motion;
 
 	/* clipping */
 	float nearclip;
@@ -1023,22 +1193,22 @@ typedef struct KernelCamera {
 	int is_inside_volume;
 
 	/* more matrices */
-	Transform screentoworld;
-	Transform rastertoworld;
-	/* work around cuda sm 2.0 crash, this seems to
-	 * cross some limit in combination with motion 
-	 * Transform ndctoworld; */
-	Transform worldtoscreen;
-	Transform worldtoraster;
-	Transform worldtondc;
+	ProjectionTransform screentoworld;
+	ProjectionTransform rastertoworld;
+	ProjectionTransform ndctoworld;
+	ProjectionTransform worldtoscreen;
+	ProjectionTransform worldtoraster;
+	ProjectionTransform worldtondc;
 	Transform worldtocamera;
 
-	MotionTransform motion;
+	/* Stores changes in the projection matrix. Used for camera zoom motion
+	 * blur and motion pass output for perspective camera. */
+	ProjectionTransform perspective_pre;
+	ProjectionTransform perspective_post;
 
-	/* Denotes changes in the projective matrix, namely in rastertocamera.
-	 * Used for camera zoom motion blur,
-	 */
-	PerspectiveMotionTransform perspective_motion;
+	/* Transforms for motion pass. */
+	Transform motion_pass_pre;
+	Transform motion_pass_post;
 
 	int shutter_table_offset;
 
@@ -1053,6 +1223,7 @@ static_assert_align(KernelCamera, 16);
 typedef struct KernelFilm {
 	float exposure;
 	int pass_flag;
+	int light_pass_flag;
 	int pass_stride;
 	int use_light_pass;
 
@@ -1075,11 +1246,13 @@ typedef struct KernelFilm {
 	int pass_glossy_indirect;
 	int pass_transmission_indirect;
 	int pass_subsurface_indirect;
+	int pass_volume_indirect;
 	
 	int pass_diffuse_direct;
 	int pass_glossy_direct;
 	int pass_transmission_direct;
 	int pass_subsurface_direct;
+	int pass_volume_direct;
 	
 	int pass_emission;
 	int pass_background;
@@ -1089,13 +1262,18 @@ typedef struct KernelFilm {
 	int pass_shadow;
 	float pass_shadow_scale;
 	int filter_table_offset;
-	int pass_pad2;
 
 	int pass_mist;
 	float mist_start;
 	float mist_inv_depth;
 	float mist_falloff;
 
+	int pass_denoising_data;
+	int pass_denoising_clean;
+	int denoising_flags;
+
+	int pad1, pad2, pad3;
+
 #ifdef __KERNEL_DEBUG__
 	int pass_bvh_traversed_nodes;
 	int pass_bvh_traversed_instances;
@@ -1110,12 +1288,13 @@ typedef struct KernelBackground {
 	int surface_shader;
 	int volume_shader;
 	int transparent;
-	int pad;
+	float transparent_roughness_squared_threshold;
 
 	/* ambient occlusion */
 	float ao_factor;
 	float ao_distance;
-	float ao_pad1, ao_pad2;
+	float ao_bounces_factor;
+	float ao_pad;
 } KernelBackground;
 static_assert_align(KernelBackground, 16);
 
@@ -1127,8 +1306,8 @@ typedef struct KernelIntegrator {
 	int num_all_lights;
 	float pdf_triangles;
 	float pdf_lights;
-	float inv_pdf_lights;
 	int pdf_background_res;
+	float light_inv_rr_threshold;
 
 	/* light portals */
 	float portal_pdf;
@@ -1136,7 +1315,6 @@ typedef struct KernelIntegrator {
 	int portal_offset;
 
 	/* bounces */
-	int min_bounce;
 	int max_bounce;
 
 	int max_diffuse_bounce;
@@ -1147,7 +1325,6 @@ typedef struct KernelIntegrator {
 	int ao_bounces;
 
 	/* transparent */
-	int transparent_min_bounce;
 	int transparent_max_bounce;
 	int transparent_shadows;
 
@@ -1165,6 +1342,7 @@ typedef struct KernelIntegrator {
 
 	/* branched path */
 	int branched;
+	int volume_decoupled;
 	int diffuse_samples;
 	int glossy_samples;
 	int transmission_samples;
@@ -1187,23 +1365,31 @@ typedef struct KernelIntegrator {
 	float volume_step_size;
 	int volume_samples;
 
-	float light_inv_rr_threshold;
-
 	int start_sample;
-	int pad1, pad2, pad3;
+
+	int max_closures;
 } KernelIntegrator;
 static_assert_align(KernelIntegrator, 16);
 
+typedef enum KernelBVHLayout {
+	BVH_LAYOUT_NONE = 0,
+
+	BVH_LAYOUT_BVH2 = (1 << 0),
+	BVH_LAYOUT_BVH4 = (1 << 1),
+
+	BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH4,
+	BVH_LAYOUT_ALL = (unsigned int)(-1),
+} KernelBVHLayout;
+
 typedef struct KernelBVH {
 	/* root node */
 	int root;
-	int attributes_map_stride;
 	int have_motion;
 	int have_curves;
 	int have_instancing;
-	int use_qbvh;
+	int bvh_layout;
 	int use_bvh_steps;
-	int pad1;
+	int pad1, pad2;
 } KernelBVH;
 static_assert_align(KernelBVH, 16);
 
@@ -1244,17 +1430,113 @@ typedef struct KernelData {
 } KernelData;
 static_assert_align(KernelData, 16);
 
-#ifdef __KERNEL_DEBUG__
-/* NOTE: This is a runtime-only struct, alignment is not
- * really important here.
- */
-typedef ccl_addr_space struct DebugData {
-	int num_bvh_traversed_nodes;
-	int num_bvh_traversed_instances;
-	int num_bvh_intersections;
-	int num_ray_bounces;
-} DebugData;
-#endif
+/* Kernel data structures. */
+
+typedef struct KernelObject {
+	Transform tfm;
+	Transform itfm;
+
+	float surface_area;
+	float pass_id;
+	float random_number;
+	int particle_index;
+
+	float dupli_generated[3];
+	float dupli_uv[2];
+
+	int numkeys;
+	int numsteps;
+	int numverts;
+
+	uint patch_map_offset;
+	uint attribute_map_offset;
+	uint motion_offset;
+	uint pad;
+} KernelObject;
+static_assert_align(KernelObject, 16);
+
+typedef struct KernelSpotLight {
+	float radius;
+	float invarea;
+	float spot_angle;
+	float spot_smooth;
+	float dir[3];
+	float pad;
+} KernelSpotLight;
+
+/* PointLight is SpotLight with only radius and invarea being used. */
+
+typedef struct KernelAreaLight {
+	float axisu[3];
+	float invarea;
+	float axisv[3];
+	float pad1;
+	float dir[3];
+	float pad2;
+} KernelAreaLight;
+
+typedef struct KernelDistantLight {
+	float radius;
+	float cosangle;
+	float invarea;
+	float pad;
+} KernelDistantLight;
+
+typedef struct KernelLight {
+	int type;
+	float co[3];
+	int shader_id;
+	int samples;
+	float max_bounces;
+	float random;
+	Transform tfm;
+	Transform itfm;
+	union {
+		KernelSpotLight spot;
+		KernelAreaLight area;
+		KernelDistantLight distant;
+	};
+} KernelLight;
+static_assert_align(KernelLight, 16);
+
+typedef struct KernelLightDistribution {
+	float totarea;
+	int prim;
+	union {
+		struct {
+			int shader_flag;
+			int object_id;
+		} mesh_light;
+		struct {
+			float pad;
+			float size;
+		} lamp;
+	};
+} KernelLightDistribution;
+static_assert_align(KernelLightDistribution, 16);
+
+typedef struct KernelParticle {
+	int index;
+	float age;
+	float lifetime;
+	float size;
+	float4 rotation;
+	/* Only xyz of the following are used. float4 is used instead of float3
+	 * to ensure consistent padding/alignment across devices. */
+	float4 location;
+	float4 velocity;
+	float4 angular_velocity;
+} KernelParticle;
+static_assert_align(KernelParticle, 16);
+
+typedef struct KernelShader {
+	float constant_emission[3];
+	float pad1;
+	int flags;
+	int pass_id;
+	int pad2, pad3;
+} KernelShader;
+static_assert_align(KernelShader, 16);
 
 /* Declarations required for split kernel */
 
@@ -1268,7 +1550,6 @@ typedef ccl_addr_space struct DebugData {
  * Queue 3 - Shadow ray cast kernel - AO
  * Queeu 4 - Shadow ray cast kernel - direct lighting
  */
-#define NUM_QUEUES 4
 
 /* Queue names */
 enum QueueNumber {
@@ -1281,45 +1562,77 @@ enum QueueNumber {
 	 * 3. Rays to be regenerated
 	 * are enqueued here.
 	 */
-	QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS = 1,
+	QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
 
 	/* All rays for which a shadow ray should be cast to determine radiance
 	 * contribution for AO are enqueued here.
 	 */
-	QUEUE_SHADOW_RAY_CAST_AO_RAYS = 2,
+	QUEUE_SHADOW_RAY_CAST_AO_RAYS,
 
 	/* All rays for which a shadow ray should be cast to determine radiance
 	 * contributing for direct lighting are enqueued here.
 	 */
-	QUEUE_SHADOW_RAY_CAST_DL_RAYS = 3,
+	QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+
+	/* Rays sorted according to shader->id */
+	QUEUE_SHADER_SORTED_RAYS,
+
+#ifdef __BRANCHED_PATH__
+	/* All rays moving to next iteration of the indirect loop for light */
+	QUEUE_LIGHT_INDIRECT_ITER,
+	/* Queue of all inactive rays. These are candidates for sharing work of indirect loops */
+	QUEUE_INACTIVE_RAYS,
+#  ifdef __VOLUME__
+	/* All rays moving to next iteration of the indirect loop for volumes */
+	QUEUE_VOLUME_INDIRECT_ITER,
+#  endif
+#  ifdef __SUBSURFACE__
+	/* All rays moving to next iteration of the indirect loop for subsurface */
+	QUEUE_SUBSURFACE_INDIRECT_ITER,
+#  endif
+#endif  /* __BRANCHED_PATH__ */
+
+	NUM_QUEUES
 };
 
-/* We use RAY_STATE_MASK to get ray_state (enums 0 to 5) */
-#define RAY_STATE_MASK 0x007
-#define RAY_FLAG_MASK 0x0F8
+/* RAY_STATE_MASK selects the ray state (low 4 bits), RAY_FLAG_MASK the ray flags (high 4 bits) */
+#define RAY_STATE_MASK 0x0F
+#define RAY_FLAG_MASK 0xF0
 enum RayState {
+	RAY_INVALID = 0,  /* zero value; the named states below count up from 1 */
 	/* Denotes ray is actively involved in path-iteration. */
-	RAY_ACTIVE = 0,
+	RAY_ACTIVE,
 	/* Denotes ray has completed processing all samples and is inactive. */
-	RAY_INACTIVE = 1,
-	/* Denoted ray has exited path-iteration and needs to update output buffer. */
-	RAY_UPDATE_BUFFER = 2,
+	RAY_INACTIVE,
+	/* Denotes ray has exited path-iteration and needs to update output buffer. */
+	RAY_UPDATE_BUFFER,
+	/* Denotes ray needs to skip most surface shader work. */
+	RAY_HAS_ONLY_VOLUME,
 	/* Donotes ray has hit background */
-	RAY_HIT_BACKGROUND = 3,
+	RAY_HIT_BACKGROUND,
 	/* Denotes ray has to be regenerated */
-	RAY_TO_REGENERATE = 4,
+	RAY_TO_REGENERATE,
 	/* Denotes ray has been regenerated */
-	RAY_REGENERATED = 5,
-	/* Denotes ray should skip direct lighting */
-	RAY_SKIP_DL = 6,
-	/* Flag's ray has to execute shadow blocked function in AO part */
-	RAY_SHADOW_RAY_CAST_AO = 16,
-	/* Flag's ray has to execute shadow blocked function in direct lighting part. */
-	RAY_SHADOW_RAY_CAST_DL = 32,
+	RAY_REGENERATED,
+	/* Denotes ray is moving to next iteration of the branched indirect loop (light, volume or subsurface) */
+	RAY_LIGHT_INDIRECT_NEXT_ITER,
+	RAY_VOLUME_INDIRECT_NEXT_ITER,
+	RAY_SUBSURFACE_INDIRECT_NEXT_ITER,
+
+	/* Ray flags: single bits inside RAY_FLAG_MASK, OR-ed onto one of the states above */
+
+	/* Flags to denote that the ray is currently evaluating the branched indirect loop */
+	RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4),
+	RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5),
+	RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6),
+	RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | RAY_BRANCHED_SUBSURFACE_INDIRECT),
+
+	/* Ray is evaluating an iteration of an indirect loop for another thread */
+	RAY_BRANCHED_INDIRECT_SHARED = (1 << 7),
 };
 
 #define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state))
-#define IS_STATE(ray_state, ray_index, state) ((ray_state[ray_index] & RAY_STATE_MASK) == state)
+#define IS_STATE(ray_state, ray_index, state) ((ray_index) != QUEUE_EMPTY_SLOT && ((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state))  /* QUEUE_EMPTY_SLOT is rejected before ray_state is indexed */
 #define ADD_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] | flag))
 #define REMOVE_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] & (~flag)))
 #define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag)
@@ -1334,6 +1647,20 @@ enum RayState {
 #define PATCH_MAP_NODE_IS_LEAF (1u << 31)
 #define PATCH_MAP_NODE_INDEX_MASK (~(PATCH_MAP_NODE_IS_SET | PATCH_MAP_NODE_IS_LEAF))
 
+/* Work Tiles */
+
+typedef struct WorkTile {
+	uint x, y, w, h;  /* NOTE(review): presumably tile origin and size -- confirm */
+
+	uint start_sample;
+	uint num_samples;
+
+	uint offset;  /* NOTE(review): offset and stride presumably index into `buffer` -- confirm */
+	uint stride;
+
+	ccl_global float *buffer;
+} WorkTile;
+
 CCL_NAMESPACE_END
 
 #endif /*  __KERNEL_TYPES_H__ */
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index c7cb29b5af2..86378289b02 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -30,78 +30,65 @@ typedef enum VolumeIntegrateResult {
  * sigma_t = sigma_a + sigma_s */
 
 typedef struct VolumeShaderCoefficients {
-	float3 sigma_a;
+	float3 sigma_t;
 	float3 sigma_s;
 	float3 emission;
 } VolumeShaderCoefficients;
 
+#ifdef __VOLUME__
+
 /* evaluate shader to get extinction coefficient at P */
 ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg,
                                                        ShaderData *sd,
-                                                       PathState *state,
+                                                       ccl_addr_space PathState *state,
                                                        float3 P,
                                                        float3 *extinction)
 {
 	sd->P = P;
-	shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
+	shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW);
 
-	if(!(sd->flag & (SD_ABSORPTION|SD_SCATTER)))
+	if(sd->flag & SD_EXTINCTION) {
+		*extinction = sd->closure_transparent_extinction;
+		return true;
+	}
+	else {
 		return false;
-
-	float3 sigma_t = make_float3(0.0f, 0.0f, 0.0f);
-
-	for(int i = 0; i < sd->num_closure; i++) {
-		const ShaderClosure *sc = &sd->closure[i];
-
-		if(CLOSURE_IS_VOLUME(sc->type))
-			sigma_t += sc->weight;
 	}
-
-	*extinction = sigma_t;
-	return true;
 }
 
 /* evaluate shader to get absorption, scattering and emission at P */
 ccl_device_inline bool volume_shader_sample(KernelGlobals *kg,
                                             ShaderData *sd,
-                                            PathState *state,
+                                            ccl_addr_space PathState *state,
                                             float3 P,
                                             VolumeShaderCoefficients *coeff)
 {
 	sd->P = P;
-	shader_eval_volume(kg, sd, state, state->volume_stack, state->flag, SHADER_CONTEXT_VOLUME);
+	shader_eval_volume(kg, sd, state, state->volume_stack, state->flag);
 
-	if(!(sd->flag & (SD_ABSORPTION|SD_SCATTER|SD_EMISSION)))
+	if(!(sd->flag & (SD_EXTINCTION|SD_SCATTER|SD_EMISSION)))
 		return false;
 	
-	coeff->sigma_a = make_float3(0.0f, 0.0f, 0.0f);
 	coeff->sigma_s = make_float3(0.0f, 0.0f, 0.0f);
-	coeff->emission = make_float3(0.0f, 0.0f, 0.0f);
-
-	for(int i = 0; i < sd->num_closure; i++) {
-		const ShaderClosure *sc = &sd->closure[i];
-
-		if(sc->type == CLOSURE_VOLUME_ABSORPTION_ID)
-			coeff->sigma_a += sc->weight;
-		else if(sc->type == CLOSURE_EMISSION_ID)
-			coeff->emission += sc->weight;
-		else if(CLOSURE_IS_VOLUME(sc->type))
-			coeff->sigma_s += sc->weight;
-	}
+	coeff->sigma_t = (sd->flag & SD_EXTINCTION)? sd->closure_transparent_extinction:
+	                                             make_float3(0.0f, 0.0f, 0.0f);
+	coeff->emission = (sd->flag & SD_EMISSION)? sd->closure_emission_background:
+	                                            make_float3(0.0f, 0.0f, 0.0f);
 
-	/* when at the max number of bounces, treat scattering as absorption */
 	if(sd->flag & SD_SCATTER) {
-		if(state->volume_bounce >= kernel_data.integrator.max_volume_bounce) {
-			coeff->sigma_a += coeff->sigma_s;
-			coeff->sigma_s = make_float3(0.0f, 0.0f, 0.0f);
-			sd->flag &= ~SD_SCATTER;
-			sd->flag |= SD_ABSORPTION;
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
+
+			if(CLOSURE_IS_VOLUME(sc->type))
+				coeff->sigma_s += sc->weight;
 		}
 	}
 
 	return true;
 }
 
+#endif /* __VOLUME__ */
+
 ccl_device float3 volume_color_transmittance(float3 sigma, float t)
 {
 	return make_float3(expf(-sigma.x * t), expf(-sigma.y * t), expf(-sigma.z * t));
@@ -112,13 +99,28 @@ ccl_device float kernel_volume_channel_get(float3 value, int channel)
 	return (channel == 0)? value.x: ((channel == 1)? value.y: value.z);
 }
 
-ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, VolumeStack *stack)
+#ifdef __VOLUME__
+
+ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, ccl_addr_space VolumeStack *stack)
 {
 	for(int i = 0; stack[i].shader != SHADER_NONE; i++) {
-		int shader_flag = kernel_tex_fetch(__shader_flag, (stack[i].shader & SHADER_MASK)*SHADER_SIZE);
+		int shader_flag = kernel_tex_fetch(__shaders, (stack[i].shader & SHADER_MASK)).flags;
 
-		if(shader_flag & SD_HETEROGENEOUS_VOLUME)
+		if(shader_flag & SD_HETEROGENEOUS_VOLUME) {
 			return true;
+		}
+		else if(shader_flag & SD_NEED_ATTRIBUTES) {
+			/* We want to render world or objects without any volume grids
+			 * as homogeneous, but can only verify this at runtime since other
+			 * heterogeneous volume objects may be using the same shader. */
+			int object = stack[i].object;
+			if(object != OBJECT_NONE) {
+				int object_flag = kernel_tex_fetch(__object_flag, object);
+				if(object_flag & SD_OBJECT_HAS_VOLUME_ATTRIBUTES) {
+					return true;
+				}
+			}
+		}
 	}
 
 	return false;
@@ -132,7 +134,7 @@ ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stac
 	int method = -1;
 
 	for(int i = 0; stack[i].shader != SHADER_NONE; i++) {
-		int shader_flag = kernel_tex_fetch(__shader_flag, (stack[i].shader & SHADER_MASK)*SHADER_SIZE);
+		int shader_flag = kernel_tex_fetch(__shaders, (stack[i].shader & SHADER_MASK)).flags;
 
 		if(shader_flag & SD_VOLUME_MIS) {
 			return SD_VOLUME_MIS;
@@ -154,6 +156,24 @@ ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stac
 	return method;
 }
 
+ccl_device_inline void kernel_volume_step_init(KernelGlobals *kg,
+                                               ccl_addr_space PathState *state,
+                                               float t,  /* length to march; callers pass ray->t */
+                                               float *step_size,  /* out: length of one step */
+                                               float *step_offset)  /* out: shading offset inside a step */
+{
+	const int max_steps = kernel_data.integrator.volume_max_steps;
+	float step = min(kernel_data.integrator.volume_step_size, t);  /* a step never exceeds t itself */
+
+	/* lengthen the step so that max_steps steps are enough to span all of t */
+	if(t > max_steps * step) {
+		step = t / (float)max_steps;
+	}
+
+	*step_size = step;  /* NOTE(review): t == 0 gives step == 0, and callers divide by step_size -- confirm t > 0 */
+	*step_offset = path_state_rng_1D_hash(kg, state, 0x1e31d8a4) * step;  /* NOTE(review): assumes the hash RNG returns [0, 1) -- confirm */
+}
+
 /* Volume Shadows
  *
  * These functions are used to attenuate shadow rays to lights. Both absorption
@@ -161,7 +181,11 @@ ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stac
 
 /* homogeneous volume: assume shader evaluation at the starts gives
  * the extinction coefficient for the entire line segment */
-ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput)
+ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg,
+                                                 ccl_addr_space PathState *state,
+                                                 Ray *ray,
+                                                 ShaderData *sd,
+                                                 float3 *throughput)
 {
 	float3 sigma_t;
 
@@ -171,15 +195,19 @@ ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *s
 
 /* heterogeneous volume: integrate stepping through the volume until we
  * reach the end, get absorbed entirely, or run out of iterations */
-ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput)
+ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg,
+                                                   ccl_addr_space PathState *state,
+                                                   Ray *ray,
+                                                   ShaderData *sd,
+                                                   float3 *throughput)
 {
 	float3 tp = *throughput;
 	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
 
 	/* prepare for stepping */
 	int max_steps = kernel_data.integrator.volume_max_steps;
-	float step = kernel_data.integrator.volume_step_size;
-	float random_jitter_offset = lcg_step_float(&state->rng_congruential) * step;
+	float step_offset, step_size;
+	kernel_volume_step_init(kg, state, ray->t, &step_size, &step_offset);
 
 	/* compute extinction at the start */
 	float t = 0.0f;
@@ -188,14 +216,15 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 
 	for(int i = 0; i < max_steps; i++) {
 		/* advance to new position */
-		float new_t = min(ray->t, (i+1) * step);
-		float dt = new_t - t;
+		float new_t = min(ray->t, (i+1) * step_size);
 
-		/* use random position inside this segment to sample shader */
-		if(new_t == ray->t)
-			random_jitter_offset = lcg_step_float(&state->rng_congruential) * dt;
+		/* use random position inside this segment to sample shader, adjust
+		 * for last step that is shorter than other steps. */
+		if(new_t == ray->t) {
+			step_offset *= (new_t - t) / step_size;
+		}
 
-		float3 new_P = ray->P + ray->D * (t + random_jitter_offset);
+		float3 new_P = ray->P + ray->D * (t + step_offset);
 		float3 sigma_t;
 
 		/* compute attenuation over segment */
@@ -227,7 +256,11 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 
 /* get the volume attenuation over line segment defined by ray, with the
  * assumption that there are no surfaces blocking light between the endpoints */
-ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg, ShaderData *shadow_sd, PathState *state, Ray *ray, float3 *throughput)
+ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg,
+                                              ShaderData *shadow_sd,
+                                              ccl_addr_space PathState *state,
+                                              Ray *ray,
+                                              float3 *throughput)
 {
 	shader_setup_from_volume(kg, shadow_sd, ray);
 
@@ -237,6 +270,8 @@ ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg, ShaderData *sha
 		kernel_volume_shadow_homogeneous(kg, state, ray, shadow_sd, throughput);
 }
 
+#endif /* __VOLUME__ */
+
 /* Equi-angular sampling as in:
  * "Importance Sampling Techniques for Path Tracing in Participating Media" */
 
@@ -324,8 +359,8 @@ ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coe
 	 * todo: we should use an epsilon to avoid precision issues near zero sigma_t */
 	float3 emission = coeff->emission;
 
-	if(closure_flag & SD_ABSORPTION) {
-		float3 sigma_t = coeff->sigma_a + coeff->sigma_s;
+	if(closure_flag & SD_EXTINCTION) {
+		float3 sigma_t = coeff->sigma_t;
 
 		emission.x *= (sigma_t.x > 0.0f)? (1.0f - transmittance.x)/sigma_t.x: t;
 		emission.y *= (sigma_t.y > 0.0f)? (1.0f - transmittance.y)/sigma_t.y: t;
@@ -339,11 +374,46 @@ ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coe
 
 /* Volume Path */
 
+ccl_device int kernel_volume_sample_channel(float3 albedo, float3 throughput, float rand, float3 *pdf)
+{
+	/* Sample color channel proportional to throughput and single scattering
+	 * albedo, to significantly reduce noise with many bounces. Returns the
+	 * channel index (0, 1 or 2) and stores the channel probabilities in *pdf.
+	 * Follows "Practical and Controllable Subsurface Scattering for Production
+	 * Path Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */
+	float3 weights = fabs(throughput * albedo);
+	float sum_weights = weights.x + weights.y + weights.z;
+
+	if(sum_weights > 0.0f) {
+		*pdf = weights/sum_weights;
+	}
+	else {
+		*pdf = make_float3(1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f);  /* no usable weight: pick channels uniformly */
+	}
+
+	if(rand < pdf->x) {  /* walk the cumulative probabilities: x, then x + y, else z */
+		return 0;
+	}
+	else if(rand < pdf->x + pdf->y) {
+		return 1;
+	}
+	else {
+		return 2;
+	}
+}
+
+#ifdef __VOLUME__
+
 /* homogeneous volume: assume shader evaluation at the start gives
  * the volume shading coefficient for the entire line segment */
-ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGlobals *kg,
-	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput,
-	RNG *rng, bool probalistic_scatter)
+ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(
+    KernelGlobals *kg,
+    ccl_addr_space PathState *state,
+    Ray *ray,
+    ShaderData *sd,
+    PathRadiance *L,
+    ccl_addr_space float3 *throughput,
+    bool probalistic_scatter)
 {
 	VolumeShaderCoefficients coeff;
 
@@ -357,21 +427,18 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 #ifdef __VOLUME_SCATTER__
 	/* randomly scatter, and if we do t is shortened */
 	if(closure_flag & SD_SCATTER) {
-		/* extinction coefficient */
-		float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
-
-		/* pick random color channel, we use the Veach one-sample
-		 * model with balance heuristic for the channels */
-		float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
-		int channel = (int)(rphase*3.0f);
-		sd->randb_closure = rphase*3.0f - channel;
+		/* Sample channel, use MIS with balance heuristic. */
+		float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
+		float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
+		float3 channel_pdf;
+		int channel = kernel_volume_sample_channel(albedo, *throughput, rphase, &channel_pdf);
 
 		/* decide if we will hit or miss */
 		bool scatter = true;
-		float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+		float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
 
 		if(probalistic_scatter) {
-			float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
+			float sample_sigma_t = kernel_volume_channel_get(coeff.sigma_t, channel);
 			float sample_transmittance = expf(-sample_sigma_t * t);
 
 			if(1.0f - xi >= sample_transmittance) {
@@ -392,40 +459,39 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 			float sample_t;
 
 			/* distance sampling */
-			sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf);
+			sample_t = kernel_volume_distance_sample(ray->t, coeff.sigma_t, channel, xi, &transmittance, &pdf);
 
 			/* modify pdf for hit/miss decision */
 			if(probalistic_scatter)
-				pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(sigma_t, t);
+				pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(coeff.sigma_t, t);
 
-			new_tp = *throughput * coeff.sigma_s * transmittance / average(pdf);
+			new_tp = *throughput * coeff.sigma_s * transmittance / dot(channel_pdf, pdf);
 			t = sample_t;
 		}
 		else {
 			/* no scattering */
-			float3 transmittance = volume_color_transmittance(sigma_t, t);
-			float pdf = average(transmittance);
+			float3 transmittance = volume_color_transmittance(coeff.sigma_t, t);
+			float pdf = dot(channel_pdf, transmittance);
 			new_tp = *throughput * transmittance / pdf;
 		}
 	}
 	else 
 #endif
-	if(closure_flag & SD_ABSORPTION) {
+	if(closure_flag & SD_EXTINCTION) {
 		/* absorption only, no sampling needed */
-		float3 transmittance = volume_color_transmittance(coeff.sigma_a, t);
+		float3 transmittance = volume_color_transmittance(coeff.sigma_t, t);
 		new_tp = *throughput * transmittance;
 	}
 
 	/* integrate emission attenuated by extinction */
 	if(L && (closure_flag & SD_EMISSION)) {
-		float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
-		float3 transmittance = volume_color_transmittance(sigma_t, ray->t);
+		float3 transmittance = volume_color_transmittance(coeff.sigma_t, ray->t);
 		float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, ray->t);
-		path_radiance_accum_emission(L, *throughput, emission, state->bounce);
+		path_radiance_accum_emission(L, state, *throughput, emission);
 	}
 
 	/* modify throughput */
-	if(closure_flag & (SD_ABSORPTION|SD_SCATTER)) {
+	if(closure_flag & SD_EXTINCTION) {
 		*throughput = new_tp;
 
 		/* prepare to scatter to new direction */
@@ -444,16 +510,21 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
  * volume until we reach the end, get absorbed entirely, or run out of
  * iterations. this does probabilistically scatter or get transmitted through
  * for path tracing where we don't want to branch. */
-ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
-	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, RNG *rng)
+ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
+    KernelGlobals *kg,
+    ccl_addr_space PathState *state,
+    Ray *ray,
+    ShaderData *sd,
+    PathRadiance *L,
+    ccl_addr_space float3 *throughput)
 {
 	float3 tp = *throughput;
 	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
 
 	/* prepare for stepping */
 	int max_steps = kernel_data.integrator.volume_max_steps;
-	float step_size = kernel_data.integrator.volume_step_size;
-	float random_jitter_offset = lcg_step_float(&state->rng_congruential) * step_size;
+	float step_offset, step_size;
+	kernel_volume_step_init(kg, state, ray->t, &step_size, &step_offset);
 
 	/* compute coefficients at the start */
 	float t = 0.0f;
@@ -461,10 +532,8 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
 
 	/* pick random color channel, we use the Veach one-sample
 	 * model with balance heuristic for the channels */
-	float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
-	float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
-	int channel = (int)(rphase*3.0f);
-	sd->randb_closure = rphase*3.0f - channel;
+	float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
+	float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
 	bool has_scatter = false;
 
 	for(int i = 0; i < max_steps; i++) {
@@ -472,11 +541,13 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
 		float new_t = min(ray->t, (i+1) * step_size);
 		float dt = new_t - t;
 
-		/* use random position inside this segment to sample shader */
-		if(new_t == ray->t)
-			random_jitter_offset = lcg_step_float(&state->rng_congruential) * dt;
+		/* use random position inside this segment to sample shader,
+		* for last shorter step we remap it to fit within the segment. */
+		if(new_t == ray->t) {
+			step_offset *= (new_t - t) / step_size;
+		}
 
-		float3 new_P = ray->P + ray->D * (t + random_jitter_offset);
+		float3 new_P = ray->P + ray->D * (t + step_offset);
 		VolumeShaderCoefficients coeff;
 
 		/* compute segment */
@@ -488,35 +559,37 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
 
 			/* distance sampling */
 #ifdef __VOLUME_SCATTER__
-			if((closure_flag & SD_SCATTER) || (has_scatter && (closure_flag & SD_ABSORPTION))) {
+			if((closure_flag & SD_SCATTER) || (has_scatter && (closure_flag & SD_EXTINCTION))) {
 				has_scatter = true;
 
-				float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
-				float3 sigma_s = coeff.sigma_s;
+				/* Sample channel, use MIS with balance heuristic. */
+				float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
+				float3 channel_pdf;
+				int channel = kernel_volume_sample_channel(albedo, tp, rphase, &channel_pdf);
 
 				/* compute transmittance over full step */
-				transmittance = volume_color_transmittance(sigma_t, dt);
+				transmittance = volume_color_transmittance(coeff.sigma_t, dt);
 
 				/* decide if we will scatter or continue */
 				float sample_transmittance = kernel_volume_channel_get(transmittance, channel);
 
 				if(1.0f - xi >= sample_transmittance) {
 					/* compute sampling distance */
-					float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
+					float sample_sigma_t = kernel_volume_channel_get(coeff.sigma_t, channel);
 					float new_dt = -logf(1.0f - xi)/sample_sigma_t;
 					new_t = t + new_dt;
 
 					/* transmittance and pdf */
-					float3 new_transmittance = volume_color_transmittance(sigma_t, new_dt);
-					float3 pdf = sigma_t * new_transmittance;
+					float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
+					float3 pdf = coeff.sigma_t * new_transmittance;
 
 					/* throughput */
-					new_tp = tp * sigma_s * new_transmittance / average(pdf);
+					new_tp = tp * coeff.sigma_s * new_transmittance / dot(channel_pdf, pdf);
 					scatter = true;
 				}
 				else {
 					/* throughput */
-					float pdf = average(transmittance);
+					float pdf = dot(channel_pdf, transmittance);
 					new_tp = tp * transmittance / pdf;
 
 					/* remap xi so we can reuse it and keep thing stratified */
@@ -525,22 +598,20 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
 			}
 			else 
 #endif
-			if(closure_flag & SD_ABSORPTION) {
+			if(closure_flag & SD_EXTINCTION) {
 				/* absorption only, no sampling needed */
-				float3 sigma_a = coeff.sigma_a;
-
-				transmittance = volume_color_transmittance(sigma_a, dt);
+				transmittance = volume_color_transmittance(coeff.sigma_t, dt);
 				new_tp = tp * transmittance;
 			}
 
 			/* integrate emission attenuated by absorption */
 			if(L && (closure_flag & SD_EMISSION)) {
 				float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, dt);
-				path_radiance_accum_emission(L, tp, emission, state->bounce);
+				path_radiance_accum_emission(L, state, tp, emission);
 			}
 
 			/* modify throughput */
-			if(closure_flag & (SD_ABSORPTION|SD_SCATTER)) {
+			if(closure_flag & SD_EXTINCTION) {
 				tp = new_tp;
 
 				/* stop if nearly all light blocked */
@@ -579,17 +650,24 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
  * ray, with the assumption that there are no surfaces blocking light
  * between the endpoints. distance sampling is used to decide if we will
  * scatter or not. */
-ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg,
-	PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng, bool heterogeneous)
+ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(
+    KernelGlobals *kg,
+    ccl_addr_space PathState *state,
+    ShaderData *sd,
+    Ray *ray,
+    PathRadiance *L,
+    ccl_addr_space float3 *throughput,
+    bool heterogeneous)
 {
 	shader_setup_from_volume(kg, sd, ray);
 
 	if(heterogeneous)
-		return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput, rng);
+		return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput);
 	else
-		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, rng, true);
+		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, true);
 }
 
+#ifndef __SPLIT_KERNEL__
 /* Decoupled Volume Sampling
  *
  * VolumeSegment is list of coefficients and transmittance stored at all steps
@@ -618,6 +696,7 @@ typedef struct VolumeSegment {
 
 	float3 accum_emission;		/* accumulated emission at end of segment */
 	float3 accum_transmittance;	/* accumulated transmittance at end of segment */
+	float3 accum_albedo;        /* sum of per-step albedo weighted by step length (not normalized) */
 
 	int sampling_method;		/* volume sampling method */
 } VolumeSegment;
@@ -628,6 +707,7 @@ typedef struct VolumeSegment {
  * but the entire segment is needed to do always scattering, rather than probabilistically
  * hitting or missing the volume. if we don't know the transmittance at the end of the
  * volume we can't generate stratified distance samples up to that transmittance */
+#ifdef __VOLUME_DECOUPLED__
 ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state,
 	Ray *ray, ShaderData *sd, VolumeSegment *segment, bool heterogeneous)
 {
@@ -635,19 +715,12 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 
 	/* prepare for volume stepping */
 	int max_steps;
-	float step_size, random_jitter_offset;
+	float step_size, step_offset;
 
 	if(heterogeneous) {
-		const int global_max_steps = kernel_data.integrator.volume_max_steps;
-		step_size = kernel_data.integrator.volume_step_size;
-		/* compute exact steps in advance for malloc */
-		if(ray->t > global_max_steps*step_size) {
-			max_steps = global_max_steps;
-			step_size = ray->t / (float)max_steps;
-		}
-		else {
-			max_steps = max((int)ceilf(ray->t/step_size), 1);
-		}
+		max_steps = kernel_data.integrator.volume_max_steps;
+		kernel_volume_step_init(kg, state, ray->t, &step_size, &step_offset);
+
 #ifdef __KERNEL_CPU__
 		/* NOTE: For the branched path tracing it's possible to have direct
 		 * and indirect light integration both having volume segments allocated.
@@ -664,25 +737,25 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 		               sizeof(*kg->decoupled_volume_steps));
 		if(kg->decoupled_volume_steps[index] == NULL) {
 			kg->decoupled_volume_steps[index] =
-			        (VolumeStep*)malloc(sizeof(VolumeStep)*global_max_steps);
+			        (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps);
 		}
 		segment->steps = kg->decoupled_volume_steps[index];
 		++kg->decoupled_volume_steps_index;
 #else
 		segment->steps = (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps);
 #endif
-		random_jitter_offset = lcg_step_float(&state->rng_congruential) * step_size;
 	}
 	else {
 		max_steps = 1;
 		step_size = ray->t;
-		random_jitter_offset = 0.0f;
+		step_offset = 0.0f;
 		segment->steps = &segment->stack_step;
 	}
 	
 	/* init accumulation variables */
 	float3 accum_emission = make_float3(0.0f, 0.0f, 0.0f);
 	float3 accum_transmittance = make_float3(1.0f, 1.0f, 1.0f);
+	float3 accum_albedo = make_float3(0.0f, 0.0f, 0.0f);
 	float3 cdf_distance = make_float3(0.0f, 0.0f, 0.0f);
 	float t = 0.0f;
 
@@ -697,17 +770,24 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 		float new_t = min(ray->t, (i+1) * step_size);
 		float dt = new_t - t;
 
-		/* use random position inside this segment to sample shader */
-		if(heterogeneous && new_t == ray->t)
-			random_jitter_offset = lcg_step_float(&state->rng_congruential) * dt;
+		/* use random position inside this segment to sample shader,
+		* for last shorter step we remap it to fit within the segment. */
+		if(new_t == ray->t) {
+			step_offset *= (new_t - t) / step_size;
+		}
 
-		float3 new_P = ray->P + ray->D * (t + random_jitter_offset);
+		float3 new_P = ray->P + ray->D * (t + step_offset);
 		VolumeShaderCoefficients coeff;
 
 		/* compute segment */
 		if(volume_shader_sample(kg, sd, state, new_P, &coeff)) {
 			int closure_flag = sd->flag;
-			float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
+			float3 sigma_t = coeff.sigma_t;
+
+			/* accumulate step-length-weighted albedo for channel sampling */
+			if(closure_flag & SD_SCATTER) {
+				accum_albedo += dt * safe_divide_color(coeff.sigma_s, sigma_t);
+			}
 
 			/* compute accumulated transmittance */
 			float3 transmittance = volume_color_transmittance(sigma_t, dt);
@@ -753,7 +833,7 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 		step->accum_transmittance = accum_transmittance;
 		step->cdf_distance = cdf_distance;
 		step->t = new_t;
-		step->shade_t = t + random_jitter_offset;
+		step->shade_t = t + step_offset;
 
 		/* stop if at the end of the volume */
 		t = new_t;
@@ -768,6 +848,7 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 	/* store total emission and transmittance */
 	segment->accum_emission = accum_emission;
 	segment->accum_transmittance = accum_transmittance;
+	segment->accum_albedo = accum_albedo;
 
 	/* normalize cumulative density function for distance sampling */
 	VolumeStep *last_step = segment->steps + segment->numsteps - 1;
@@ -797,6 +878,7 @@ ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *s
 #endif
 	}
 }
+#endif  /* __VOLUME_DECOUPLED__ */
 
 /* scattering for homogeneous and heterogeneous volumes, using decoupled ray
  * marching.
@@ -809,10 +891,13 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 {
 	kernel_assert(segment->closure_flag & SD_SCATTER);
 
-	/* pick random color channel, we use the Veach one-sample
-	 * model with balance heuristic for the channels */
-	int channel = (int)(rphase*3.0f);
-	sd->randb_closure = rphase*3.0f - channel;
+	/* Sample color channel, use MIS with balance heuristic. */
+	float3 channel_pdf;
+	int channel = kernel_volume_sample_channel(segment->accum_albedo,
+	                                           *throughput,
+	                                           rphase,
+	                                           &channel_pdf);
+
 	float xi = rscatter;
 
 	/* probabilistic scattering decision based on transmittance */
@@ -899,7 +984,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 		if(probalistic_scatter)
 			distance_pdf *= make_float3(1.0f, 1.0f, 1.0f) - segment->accum_transmittance;
 
-		pdf = average(distance_pdf * step_pdf_distance);
+		pdf = dot(channel_pdf, distance_pdf * step_pdf_distance);
 
 		/* multiple importance sampling */
 		if(use_mis) {
@@ -962,12 +1047,12 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 		/* multiple importance sampling */
 		if(use_mis) {
 			float3 distance_pdf3 = kernel_volume_distance_pdf(step_t, step->sigma_t, step_sample_t);
-			float distance_pdf = average(distance_pdf3 * step_pdf_distance);
+			float distance_pdf = dot(channel_pdf, distance_pdf3 * step_pdf_distance);
 			mis_weight = 2.0f*power_heuristic(pdf, distance_pdf);
 		}
 	}
-	if(sample_t < 1e-6f) {
-		return VOLUME_PATH_SCATTERED;
+	if(sample_t < 0.0f || pdf == 0.0f) {
+		return VOLUME_PATH_MISSED;
 	}
 
 	/* compute transmittance up to this step */
@@ -990,6 +1075,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 
 	return VOLUME_PATH_SCATTERED;
 }
+#endif /* __SPLIT_KERNEL__ */
 
 /* decide if we need to use decoupled or not */
 ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneous, bool direct, int sampling_method)
@@ -997,6 +1083,9 @@ ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneou
 	/* decoupled ray marching for heterogeneous volumes not supported on the GPU,
 	 * which also means equiangular and multiple importance sampling is not
 	 * support for that case */
+	if(!kernel_data.integrator.volume_decoupled)
+		return false;
+
 #ifdef __KERNEL_GPU__
 	if(heterogeneous)
 		return false;
@@ -1021,9 +1110,9 @@ ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneou
 
 ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
                                          ShaderData *stack_sd,
-                                         const PathState *state,
-                                         const Ray *ray,
-                                         VolumeStack *stack)
+                                         ccl_addr_space const PathState *state,
+                                         ccl_addr_space const Ray *ray,
+                                         ccl_addr_space VolumeStack *stack)
 {
 	/* NULL ray happens in the baker, does it need proper initialization of
 	 * camera in volume?
@@ -1166,7 +1255,7 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
 	}
 }
 
-ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd, VolumeStack *stack)
+ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd, ccl_addr_space VolumeStack *stack)
 {
 	/* todo: we should have some way for objects to indicate if they want the
 	 * world shader to work inside them. excluding it by default is problematic
@@ -1215,7 +1304,7 @@ ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd
 ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg,
                                                           ShaderData *stack_sd,
                                                           Ray *ray,
-                                                          VolumeStack *stack)
+                                                          ccl_addr_space VolumeStack *stack)
 {
 	kernel_assert(kernel_data.integrator.use_volumes);
 
@@ -1277,7 +1366,7 @@ ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg,
  * the world's one after the last bounce to avoid render artifacts.
  */
 ccl_device_inline void kernel_volume_clean_stack(KernelGlobals *kg,
-                                                 VolumeStack *volume_stack)
+                                                 ccl_addr_space VolumeStack *volume_stack)
 {
 	if(kernel_data.background.volume_shader != SHADER_NONE) {
 		/* Keep the world's volume in stack. */
@@ -1288,4 +1377,6 @@ ccl_device_inline void kernel_volume_clean_stack(KernelGlobals *kg,
 	}
 }
 
+#endif /* __VOLUME__ */
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index 7d559b1aa31..0c2d9379b63 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -17,177 +17,66 @@
 #ifndef __KERNEL_WORK_STEALING_H__
 #define __KERNEL_WORK_STEALING_H__
 
+CCL_NAMESPACE_BEGIN
+
 /*
  * Utility functions for work stealing
  */
 
-#ifdef __WORK_STEALING__
-
 #ifdef __KERNEL_OPENCL__
 #  pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #endif
 
-uint get_group_id_with_ray_index(uint ray_index,
-                                 uint tile_dim_x,
-                                 uint tile_dim_y,
-                                 uint parallel_samples,
-                                 int dim)
+#ifdef __SPLIT_KERNEL__
+/* Takes the next work item from the pool selected by ray_index. Returns true if there is work. */
+ccl_device bool get_next_work(KernelGlobals *kg,
+                              ccl_global uint *work_pools,
+                              uint total_work_size,
+                              uint ray_index,
+                              ccl_private uint *global_work_index)
 {
-	if(dim == 0) {
-		uint x_span = ray_index % (tile_dim_x * parallel_samples);
-		return x_span / get_local_size(0);
+	/* With a small amount of work there may be more threads than work due to
+	 * rounding up of global size, stop such threads immediately. */
+	if(ray_index >= total_work_size) {
+		return false;
 	}
-	else /*if(dim == 1)*/ {
-		kernel_assert(dim == 1);
-		uint y_span = ray_index / (tile_dim_x * parallel_samples);
-		return y_span / get_local_size(1);
-	}
-}
-
-uint get_total_work(uint tile_dim_x,
-                    uint tile_dim_y,
-                    uint grp_idx,
-                    uint grp_idy,
-                    uint num_samples)
-{
-	uint threads_within_tile_border_x =
-		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
-		                                     : get_local_size(0);
-	uint threads_within_tile_border_y =
-		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
-		                                     : get_local_size(1);
-
-	threads_within_tile_border_x =
-		(threads_within_tile_border_x == 0) ? get_local_size(0)
-		                                    : threads_within_tile_border_x;
-	threads_within_tile_border_y =
-		(threads_within_tile_border_y == 0) ? get_local_size(1)
-		                                    : threads_within_tile_border_y;
-
-	return threads_within_tile_border_x *
-	       threads_within_tile_border_y *
-	       num_samples;
-}
 
-/* Returns 0 in case there is no next work available */
-/* Returns 1 in case work assigned is valid */
-int get_next_work(ccl_global uint *work_pool,
-                  ccl_private uint *my_work,
-                  uint tile_dim_x,
-                  uint tile_dim_y,
-                  uint num_samples,
-                  uint parallel_samples,
-                  uint ray_index)
-{
-	uint grp_idx = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           0);
-	uint grp_idy = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           1);
-	uint total_work = get_total_work(tile_dim_x,
-	                                 tile_dim_y,
-	                                 grp_idx,
-	                                 grp_idy,
-	                                 num_samples);
-	uint group_index = grp_idy * get_num_groups(0) + grp_idx;
-	*my_work = atomic_inc(&work_pool[group_index]);
-	return (*my_work < total_work) ? 1 : 0;
-}
+	/* Increase atomic work index counter in pool. */
+	uint pool = ray_index / WORK_POOL_SIZE;
+	uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]);
 
-/* This function assumes that the passed my_work is valid. */
-/* Decode sample number w.r.t. assigned my_work. */
-uint get_my_sample(uint my_work,
-                   uint tile_dim_x,
-                   uint tile_dim_y,
-                   uint parallel_samples,
-                   uint ray_index)
-{
-	uint grp_idx = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           0);
-	uint grp_idy = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           1);
-	uint threads_within_tile_border_x =
-		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
-		                                     : get_local_size(0);
-	uint threads_within_tile_border_y =
-		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
-		                                     : get_local_size(1);
+	/* Map per-pool work index to a global work index. */
+	uint global_size = ccl_global_size(0) * ccl_global_size(1);
+	kernel_assert(global_size % WORK_POOL_SIZE == 0);
+	kernel_assert(ray_index < global_size);
 
-	threads_within_tile_border_x =
-		(threads_within_tile_border_x == 0) ? get_local_size(0)
-		                                    : threads_within_tile_border_x;
-	threads_within_tile_border_y =
-		(threads_within_tile_border_y == 0) ? get_local_size(1)
-		                                    : threads_within_tile_border_y;
+	*global_work_index = (work_index / WORK_POOL_SIZE) * global_size
+	                   + (pool * WORK_POOL_SIZE)
+	                   + (work_index % WORK_POOL_SIZE);
 
-	return my_work /
-	       (threads_within_tile_border_x * threads_within_tile_border_y);
+	/* There is work only if the global index is within the total work size. */
+	return (*global_work_index < total_work_size);
 }
+#endif
 
-/* Decode pixel and tile position w.r.t. assigned my_work. */
-void get_pixel_tile_position(ccl_private uint *pixel_x,
-                             ccl_private uint *pixel_y,
-                             ccl_private uint *tile_x,
-                             ccl_private uint *tile_y,
-                             uint my_work,
-                             uint tile_dim_x,
-                             uint tile_dim_y,
-                             uint tile_offset_x,
-                             uint tile_offset_y,
-                             uint parallel_samples,
-                             uint ray_index)
+/* Map global work index to pixel X/Y and sample within the given tile. */
+ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
+                                      uint global_work_index,
+                                      ccl_private uint *x,
+                                      ccl_private uint *y,
+                                      ccl_private uint *sample)
 {
-	uint grp_idx = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           0);
-	uint grp_idy = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           1);
-	uint threads_within_tile_border_x =
-		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
-		                                     : get_local_size(0);
-	uint threads_within_tile_border_y =
-		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
-		                                     : get_local_size(1);
-
-	threads_within_tile_border_x =
-		(threads_within_tile_border_x == 0) ? get_local_size(0)
-		                                    : threads_within_tile_border_x;
-	threads_within_tile_border_y =
-		(threads_within_tile_border_y == 0) ? get_local_size(1)
-		                                    : threads_within_tile_border_y;
-
-	uint total_associated_pixels =
-		threads_within_tile_border_x * threads_within_tile_border_y;
-	uint work_group_pixel_index = my_work % total_associated_pixels;
-	uint work_group_pixel_x =
-		work_group_pixel_index % threads_within_tile_border_x;
-	uint work_group_pixel_y =
-		work_group_pixel_index / threads_within_tile_border_x;
-
-	*pixel_x =
-		tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x;
-	*pixel_y =
-		tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y;
-	*tile_x = *pixel_x - tile_offset_x;
-	*tile_y = *pixel_y - tile_offset_y;
+	uint tile_pixels = tile->w * tile->h;
+	uint sample_offset = global_work_index / tile_pixels;
+	uint pixel_offset = global_work_index - sample_offset * tile_pixels;
+	uint y_offset = pixel_offset / tile->w;
+	uint x_offset = pixel_offset - y_offset * tile->w;
+
+	*x = tile->x + x_offset;
+	*y = tile->y + y_offset;
+	*sample = tile->start_sample + sample_offset;
 }
 
-#endif  /* __WORK_STEALING__ */
+CCL_NAMESPACE_END
 
 #endif  /* __KERNEL_WORK_STEALING_H__ */
diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp
new file mode 100644
index 00000000000..2ff1a392dc3
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU filter kernel entry points */
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
+ * one with SSE2 intrinsics.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+#  define __KERNEL_SSE2__
+#endif
+
+/* When building kernel for native machine detect kernel features from the flags
+ * set by compiler.
+ */
+#ifdef WITH_KERNEL_NATIVE
+#  ifdef __SSE2__
+#    ifndef __KERNEL_SSE2__
+#      define __KERNEL_SSE2__
+#    endif
+#  endif
+#  ifdef __SSE3__
+#    define __KERNEL_SSE3__
+#  endif
+#  ifdef __SSSE3__
+#    define __KERNEL_SSSE3__
+#  endif
+#  ifdef __SSE4_1__
+#    define __KERNEL_SSE41__
+#  endif
+#  ifdef __AVX__
+#    define __KERNEL_SSE__
+#    define __KERNEL_AVX__
+#  endif
+#  ifdef __AVX2__
+#    define __KERNEL_SSE__
+#    define __KERNEL_AVX2__
+#  endif
+#endif
+
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)
+    /* do nothing */
+#endif
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
new file mode 100644
index 00000000000..4a9e6047ecf
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while filter.cpp
+ * is compiled without them for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#    define __KERNEL_AVX__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
new file mode 100644
index 00000000000..c22ec576254
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while filter.cpp
+ * is compiled without them for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#    define __KERNEL_AVX__
+#    define __KERNEL_AVX2__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
new file mode 100644
index 00000000000..4231aba88d7
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common declaration part of all CPU kernels. */
+
+void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
+                                                     TilesInfo *tiles,
+                                                     int x,
+                                                     int y,
+                                                     float *unfilteredA,
+                                                     float *unfilteredB,
+                                                     float *sampleV,
+                                                     float *sampleVV,
+                                                     float *bufferV,
+                                                     int* prefilter_rect,
+                                                     int buffer_pass_stride,
+                                                     int buffer_denoising_offset);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
+                                                   TilesInfo *tiles,
+                                                   int m_offset,
+                                                   int v_offset,
+                                                   int x,
+                                                   int y,
+                                                   float *mean,
+                                                   float *variance,
+                                                   int* prefilter_rect,
+                                                   int buffer_pass_stride,
+                                                   int buffer_denoising_offset);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
+                                                       ccl_global float *image,
+                                                       ccl_global float *variance,
+                                                       ccl_global float *depth,
+                                                       ccl_global float *output,
+                                                       int *rect,
+                                                       int pass_stride);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
+                                                      float *mean,
+                                                      float *variance,
+                                                      float *a,
+                                                      float *b,
+                                                      int* prefilter_rect,
+                                                      int r);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+                                                           int x,
+                                                           int y,
+                                                           int storage_ofs,
+                                                           float *transform,
+                                                           int *rank,
+                                                           int* rect,
+                                                           int pass_stride,
+                                                           int radius,
+                                                           float pca_threshold);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
+                                                           int dy,
+                                                           float *weight_image,
+                                                           float *variance,
+                                                           float *difference_image,
+                                                           int* rect,
+                                                           int stride,
+                                                           int channel_offset,
+                                                           float a,
+                                                           float k_2);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image,
+                                                float *out_image,
+                                                int* rect,
+                                                int stride,
+                                                int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image,
+                                                       float *out_image,
+                                                       int* rect,
+                                                       int stride,
+                                                       int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
+                                                         int dy,
+                                                         float *difference_image,
+                                                         float *image,
+                                                         float *out_image,
+                                                         float *accum_image,
+                                                         int* rect,
+                                                         int stride,
+                                                         int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
+                                                             int dy,
+                                                             float *difference_image,
+                                                             float *buffer,
+                                                             float *transform,
+                                                             int *rank,
+                                                             float *XtWX,
+                                                             float3 *XtWY,
+                                                             int *rect,
+                                                             int *filter_window,
+                                                             int stride,
+                                                             int f,
+                                                             int pass_stride);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
+                                                     float *accum_image,
+                                                     int* rect,
+                                                     int stride);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
+                                                int y,
+                                                int storage_ofs,
+                                                float *buffer,
+                                                int *rank,
+                                                float *XtWX,
+                                                float3 *XtWY,
+                                                int *buffer_params,
+                                                int sample);
+
+#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
new file mode 100644
index 00000000000..504622ecfd9
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -0,0 +1,264 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common implementation part of all CPU kernels.
+ *
+ * The idea is that particular .cpp files sets needed optimization flags and
+ * simply includes this file without worry of copying actual implementation over.
+ */
+
+#include "kernel/kernel_compat_cpu.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+#ifdef KERNEL_STUB  /* NOTE(review): '#arch' is not macro-expanded, so the message shows the token passed (e.g. KERNEL_ARCH), not its value. */
+#  define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+
+/* Denoise filter */
+
+void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
+                                                     TilesInfo *tiles,
+                                                     int x,
+                                                     int y,
+                                                     float *unfilteredA,
+                                                     float *unfilteredB,
+                                                     float *sampleVariance,
+                                                     float *sampleVarianceV,
+                                                     float *bufferVariance,
+                                                     int* prefilter_rect,
+                                                     int buffer_pass_stride,
+                                                     int buffer_denoising_offset)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
+#else
+	kernel_filter_divide_shadow(sample, tiles,
+	                            x, y,
+	                            unfilteredA,
+	                            unfilteredB,
+	                            sampleVariance,
+	                            sampleVarianceV,
+	                            bufferVariance,
+	                            load_int4(prefilter_rect),
+	                            buffer_pass_stride,
+	                            buffer_denoising_offset);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
+                                                   TilesInfo *tiles,
+                                                   int m_offset,
+                                                   int v_offset,
+                                                   int x,
+                                                   int y,
+                                                   float *mean, float *variance,
+                                                   int* prefilter_rect,
+                                                   int buffer_pass_stride,
+                                                   int buffer_denoising_offset)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
+#else
+	kernel_filter_get_feature(sample, tiles,
+	                          m_offset, v_offset,
+	                          x, y,
+	                          mean, variance,
+	                          load_int4(prefilter_rect),
+	                          buffer_pass_stride,
+	                          buffer_denoising_offset);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
+                                                       ccl_global float *image,
+                                                       ccl_global float *variance,
+                                                       ccl_global float *depth,
+                                                       ccl_global float *output,
+                                                       int *rect,
+                                                       int pass_stride)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers);
+#else
+	kernel_filter_detect_outliers(x, y, image, variance, depth, output, load_int4(rect), pass_stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
+                                                      float *mean,
+                                                      float *variance,
+                                                      float *a,
+                                                      float *b,
+                                                      int* prefilter_rect,
+                                                      int r)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_combine_halves);
+#else
+	kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+                                                           int x,
+                                                           int y,
+                                                           int storage_ofs,
+                                                           float *transform,
+                                                           int *rank,
+                                                           int* prefilter_rect,
+                                                           int pass_stride,
+                                                           int radius,
+                                                           float pca_threshold)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
+#else
+	rank += storage_ofs;  /* One rank value per storage entry. */
+	transform += storage_ofs*TRANSFORM_SIZE;  /* TRANSFORM_SIZE floats per storage entry. */
+	kernel_filter_construct_transform(buffer,
+	                                  x, y,
+	                                  load_int4(prefilter_rect),
+	                                  pass_stride,
+	                                  transform,
+	                                  rank,
+	                                  radius,
+	                                  pca_threshold);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
+                                                           int dy,
+                                                           float *weight_image,
+                                                           float *variance,
+                                                           float *difference_image,
+                                                           int *rect,
+                                                           int stride,
+                                                           int channel_offset,
+                                                           float a,
+                                                           float k_2)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
+#else
+	kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), stride, channel_offset, a, k_2);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image,
+                                                float *out_image,
+                                                int *rect,
+                                                int stride,
+                                                int f)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
+#else
+	kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), stride, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image,
+                                                       float *out_image,
+                                                       int *rect,
+                                                       int stride,
+                                                       int f)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
+#else
+	kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), stride, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
+                                                         int dy,
+                                                         float *difference_image,
+                                                         float *image,
+                                                         float *out_image,
+                                                         float *accum_image,
+                                                         int *rect,
+                                                         int stride,
+                                                         int f)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
+#else
+	kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), stride, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
+                                                             int dy,
+                                                             float *difference_image,
+                                                             float *buffer,
+                                                             float *transform,
+                                                             int *rank,
+                                                             float *XtWX,
+                                                             float3 *XtWY,
+                                                             int *rect,
+                                                             int *filter_window,
+                                                             int stride,
+                                                             int f,
+                                                             int pass_stride)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
+#else
+	kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_window), stride, f, pass_stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
+                                                     float *accum_image,
+                                                     int *rect,
+                                                     int stride)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
+#else
+	kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
+                                                int y,
+                                                int storage_ofs,
+                                                float *buffer,
+                                                int *rank,
+                                                float *XtWX,
+                                                float3 *XtWY,
+                                                int *buffer_params,
+                                                int sample)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_finalize);
+#else
+	XtWX += storage_ofs*XTWX_SIZE;  /* Skip storage_ofs entries of XTWX_SIZE floats each. */
+	XtWY += storage_ofs*XTWY_SIZE;  /* Skip storage_ofs entries of XTWY_SIZE float3 each. */
+	rank += storage_ofs;  /* One rank value per storage entry. */
+	kernel_filter_finalize(x, y, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);  /* NOTE(review): meaning of the literal 1 is not visible here; confirm against kernel_filter_finalize. */
+#endif
+}
+
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
new file mode 100644
index 00000000000..f7c9935f1d0
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
new file mode 100644
index 00000000000..070b95a3505
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
new file mode 100644
index 00000000000..254025be4e2
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp
index 72dbbd9a416..de487f6123f 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp
@@ -56,9 +56,9 @@
     /* do nothing */
 #endif
 
-#include "kernel.h"
+#include "kernel/kernel.h"
 #define KERNEL_ARCH cpu
-#include "kernel_cpu_impl.h"
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -74,122 +74,21 @@ void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t s
 
 void kernel_tex_copy(KernelGlobals *kg,
                      const char *name,
-                     device_ptr mem,
-                     size_t width,
-                     size_t height,
-                     size_t depth,
-                     InterpolationType interpolation,
-                     ExtensionType extension)
+                     void *mem,
+                     size_t size)
 {
 	if(0) {
 	}
 
-#define KERNEL_TEX(type, ttype, tname) \
+#define KERNEL_TEX(type, tname) \
 	else if(strcmp(name, #tname) == 0) { \
 		kg->tname.data = (type*)mem; \
-		kg->tname.width = width; \
+		kg->tname.width = size; \
 	}
-#define KERNEL_IMAGE_TEX(type, ttype, tname)
-#include "kernel_textures.h"
-
-	else if(strstr(name, "__tex_image_float4")) {
-		texture_image_float4 *tex = NULL;
-		int id = atoi(name + strlen("__tex_image_float4_"));
-		int array_index = id;
-
-		if(array_index >= 0 && array_index < TEX_NUM_FLOAT4_CPU) {
-			tex = &kg->texture_float4_images[array_index];
-		}
-
-		if(tex) {
-			tex->data = (float4*)mem;
-			tex->dimensions_set(width, height, depth);
-			tex->interpolation = interpolation;
-			tex->extension = extension;
-		}
-	}
-	else if(strstr(name, "__tex_image_float")) {
-		texture_image_float *tex = NULL;
-		int id = atoi(name + strlen("__tex_image_float_"));
-		int array_index = id - TEX_START_FLOAT_CPU;
-
-		if(array_index >= 0 && array_index < TEX_NUM_FLOAT_CPU) {
-			tex = &kg->texture_float_images[array_index];
-		}
-
-		if(tex) {
-			tex->data = (float*)mem;
-			tex->dimensions_set(width, height, depth);
-			tex->interpolation = interpolation;
-			tex->extension = extension;
-		}
-	}
-	else if(strstr(name, "__tex_image_byte4")) {
-		texture_image_uchar4 *tex = NULL;
-		int id = atoi(name + strlen("__tex_image_byte4_"));
-		int array_index = id - TEX_START_BYTE4_CPU;
-
-		if(array_index >= 0 && array_index < TEX_NUM_BYTE4_CPU) {
-			tex = &kg->texture_byte4_images[array_index];
-		}
-
-		if(tex) {
-			tex->data = (uchar4*)mem;
-			tex->dimensions_set(width, height, depth);
-			tex->interpolation = interpolation;
-			tex->extension = extension;
-		}
-	}
-	else if(strstr(name, "__tex_image_byte")) {
-		texture_image_uchar *tex = NULL;
-		int id = atoi(name + strlen("__tex_image_byte_"));
-		int array_index = id - TEX_START_BYTE_CPU;
-
-		if(array_index >= 0 && array_index < TEX_NUM_BYTE_CPU) {
-			tex = &kg->texture_byte_images[array_index];
-		}
-
-		if(tex) {
-			tex->data = (uchar*)mem;
-			tex->dimensions_set(width, height, depth);
-			tex->interpolation = interpolation;
-			tex->extension = extension;
-		}
-	}
-	else if(strstr(name, "__tex_image_half4")) {
-		texture_image_half4 *tex = NULL;
-		int id = atoi(name + strlen("__tex_image_half4_"));
-		int array_index = id - TEX_START_HALF4_CPU;
-
-		if(array_index >= 0 && array_index < TEX_NUM_HALF4_CPU) {
-			tex = &kg->texture_half4_images[array_index];
-		}
-
-		if(tex) {
-			tex->data = (half4*)mem;
-			tex->dimensions_set(width, height, depth);
-			tex->interpolation = interpolation;
-			tex->extension = extension;
-		}
-	}
-	else if(strstr(name, "__tex_image_half")) {
-		texture_image_half *tex = NULL;
-		int id = atoi(name + strlen("__tex_image_half_"));
-		int array_index = id - TEX_START_HALF_CPU;
-
-		if(array_index >= 0 && array_index < TEX_NUM_HALF_CPU) {
-			tex = &kg->texture_half_images[array_index];
-		}
-
-		if(tex) {
-			tex->data = (half*)mem;
-			tex->dimensions_set(width, height, depth);
-			tex->interpolation = interpolation;
-			tex->extension = extension;
-		}
-	}
-	else
+#include "kernel/kernel_textures.h"
+	else {
 		assert(0);
+	}
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
index 1350d9e5c2e..a645fb4d8dd 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
@@ -17,21 +17,23 @@
 /* Optimized CPU kernel entry points. This file is compiled with AVX
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
- 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE__
-#  define __KERNEL_SSE2__
-#  define __KERNEL_SSE3__
-#  define __KERNEL_SSSE3__
-#  define __KERNEL_SSE41__
-#  define __KERNEL_AVX__
-#endif
 
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-#  include "kernel.h"
-#  define KERNEL_ARCH cpu_avx
-#  include "kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#    define __KERNEL_AVX__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
index 1a416e771ee..6bbb87727b9 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
@@ -18,21 +18,23 @@
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE__
-#  define __KERNEL_SSE2__
-#  define __KERNEL_SSE3__
-#  define __KERNEL_SSSE3__
-#  define __KERNEL_SSE41__
-#  define __KERNEL_AVX__
-#  define __KERNEL_AVX2__
-#endif
-
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-#  include "kernel.h"
-#  define KERNEL_ARCH cpu_avx2
-#  include "kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#    define __KERNEL_AVX__
+#    define __KERNEL_AVX2__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 1a07c705f1c..6bdb8546a24 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -18,7 +18,6 @@
 
 void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
                                            float *buffer,
-                                           unsigned int *rng_state,
                                            int sample,
                                            int x, int y,
                                            int offset,
@@ -42,11 +41,50 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
 void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
                                        uint4 *input,
                                        float4 *output,
-                                       float *output_luma,
                                        int type,
                                        int filter,
                                        int i,
                                        int offset,
                                        int sample);
 
+/* Split kernels */
+
+void KERNEL_FUNCTION_FULL_NAME(data_init)(
+        KernelGlobals *kg,
+        ccl_constant KernelData *data,
+        ccl_global void *split_data_buffer,
+        int num_elements,
+        ccl_global char *ray_state,
+        int start_sample,
+        int end_sample,
+        int sx, int sy, int sw, int sh, int offset, int stride,
+        ccl_global int *Queue_index,
+        int queuesize,
+        ccl_global char *use_queues_flag,
+        ccl_global unsigned int *work_pool_wgs,
+        unsigned int num_samples,
+        ccl_global float *buffer);
+
+#define DECLARE_SPLIT_KERNEL_FUNCTION(name) \
+	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData *data);
+
+DECLARE_SPLIT_KERNEL_FUNCTION(path_init)
+DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect)
+DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission)
+DECLARE_SPLIT_KERNEL_FUNCTION(do_volume)
+DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
+DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background)
+DECLARE_SPLIT_KERNEL_FUNCTION(shader_setup)
+DECLARE_SPLIT_KERNEL_FUNCTION(shader_sort)
+DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval)
+DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
+DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
+DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting)
+DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
+DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive)
+DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
+DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
+DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
+
 #undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
index af68907a5c2..56c38d8101c 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
@@ -17,62 +17,503 @@
 #ifndef __KERNEL_CPU_IMAGE_H__
 #define __KERNEL_CPU_IMAGE_H__
 
-#ifdef __KERNEL_CPU__
-
 CCL_NAMESPACE_BEGIN
 
-ccl_device float4 kernel_tex_image_interp_impl(KernelGlobals *kg, int tex, float x, float y)
-{
-	if(tex >= TEX_START_HALF_CPU)
-		return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp(x, y);
-	else if(tex >= TEX_START_BYTE_CPU)
-		return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp(x, y);
-	else if(tex >= TEX_START_FLOAT_CPU)
-		return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp(x, y);
-	else if(tex >= TEX_START_HALF4_CPU)
-		return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp(x, y);
-	else if(tex >= TEX_START_BYTE4_CPU)
-		return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp(x, y);
-	else
-		return kg->texture_float4_images[tex].interp(x, y);
-}
+template<typename T> struct TextureInterpolator  {
+#define SET_CUBIC_SPLINE_WEIGHTS(u, t) /* fills u[0..3] with the cubic B-spline weights for fraction t */ \
+	{ \
+		u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
+		u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
+		u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
+		u[3] = (1.0f / 6.0f) * t * t * t; \
+	} (void)0
+
+	static ccl_always_inline float4 read(float4 r)  /* float4 texels are returned unchanged */
+	{
+		return r;
+	}
+
+	static ccl_always_inline float4 read(uchar4 r)  /* each of the four components is scaled by 1/255 */
+	{
+		float f = 1.0f/255.0f;
+		return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
+	}
+
+	static ccl_always_inline float4 read(uchar r)  /* value scaled by 1/255, copied to x/y/z, w = 1 */
+	{
+		float f = r*(1.0f/255.0f);
+		return make_float4(f, f, f, 1.0f);
+	}
+
+	static ccl_always_inline float4 read(float r)  /* value copied to x/y/z, w = 1 */
+	{
+		/* TODO(dingto): Optimize this, so interpolation
+		 * happens on float instead of float4 */
+		return make_float4(r, r, r, 1.0f);
+	}
+
+	static ccl_always_inline float4 read(half4 r)  /* converted with half4_to_float4() */
+	{
+		return half4_to_float4(r);
+	}
+
+	static ccl_always_inline float4 read(half r)  /* converted with half_to_float(), copied to x/y/z, w = 1 */
+	{
+		float f = half_to_float(r);
+		return make_float4(f, f, f, 1.0f);
+	}
+
+	static ccl_always_inline float4 read(const T *data,
+	                                     int x, int y,
+	                                     int width, int height)
+	{
+		/* Texel (x, y) of a width*height image; coordinates outside it give an all-zero float4. */
+		const bool inside = (x >= 0 && y >= 0 && x < width && y < height);
+		return inside? read(data[y * width + x])
+		             : make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+	}
+
+	static ccl_always_inline int wrap_periodic(int x, int width)
+	{
+		/* '%' keeps the sign of the dividend, so a negative remainder is
+		 * shifted up by one period; for positive width the result is in [0, width). */
+		const int rem = x % width;
+		return (rem < 0)? rem + width: rem;
+	}
+
+	static ccl_always_inline int wrap_clamp(int x, int width)
+	{
+		return clamp(x, 0, width-1);  /* limits handed to clamp() are the first and last valid index */
+	}
+
+	static ccl_always_inline float frac(float x, int *ix)
+	{
+		*ix = (x < 0.0f)? float_to_int(x) - 1: float_to_int(x);  /* negative x: one cell below float_to_int(x) */
+		const float cell = (float)(*ix);
+		return x - cell;  /* offset of x from the cell stored in *ix */
+	}
+
+	/* ********  2D interpolation ******** */
+
+	static ccl_always_inline float4 interp_closest(const TextureInfo& info,
+	                                               float x, float y)  /* single-texel lookup; x and y are scaled by width/height */
+	{
+		const T *data = (const T*)info.data;
+		const int width = info.width;
+		const int height = info.height;
+		int ix, iy;
+		frac(x*(float)width, &ix);  /* only the integer cell is used, the returned fraction is dropped */
+		frac(y*(float)height, &iy);
+		switch(info.extension) {
+			case EXTENSION_REPEAT:
+				ix = wrap_periodic(ix, width);
+				iy = wrap_periodic(iy, height);
+				break;
+			case EXTENSION_CLIP:  /* zero outside [0, 1], otherwise handled like EXTEND */
+				if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				}
+				ATTR_FALLTHROUGH;
+			case EXTENSION_EXTEND:
+				ix = wrap_clamp(ix, width);
+				iy = wrap_clamp(iy, height);
+				break;
+			default:  /* unhandled extension value */
+				kernel_assert(0);
+				return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		}
+		return read(data[ix + iy*width]);
+	}
+
+	static ccl_always_inline float4 interp_linear(const TextureInfo& info,
+	                                              float x, float y)  /* bilinear blend of the 2x2 texels around (x, y) */
+	{
+		const T *data = (const T*)info.data;
+		const int width = info.width;
+		const int height = info.height;
+		int ix, iy, nix, niy;
+		const float tx = frac(x*(float)width - 0.5f, &ix);  /* tx is 0 when x*width equals ix + 0.5 */
+		const float ty = frac(y*(float)height - 0.5f, &iy);
+		switch(info.extension) {
+			case EXTENSION_REPEAT:
+				ix = wrap_periodic(ix, width);
+				iy = wrap_periodic(iy, height);
+				nix = wrap_periodic(ix+1, width);
+				niy = wrap_periodic(iy+1, height);
+				break;
+			case EXTENSION_CLIP:  /* indices stay unwrapped: the bounds-checked read() below returns zero outside the image */
+				nix = ix + 1;
+				niy = iy + 1;
+				break;
+			case EXTENSION_EXTEND:
+				nix = wrap_clamp(ix+1, width);  /* neighbours are derived before ix/iy themselves are clamped */
+				niy = wrap_clamp(iy+1, height);
+				ix = wrap_clamp(ix, width);
+				iy = wrap_clamp(iy, height);
+				break;
+			default:  /* unhandled extension value */
+				kernel_assert(0);
+				return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		}
+		return (1.0f - ty) * (1.0f - tx) * read(data, ix, iy, width, height) +
+		       (1.0f - ty) * tx * read(data, nix, iy, width, height) +
+		       ty * (1.0f - tx) * read(data, ix, niy, width, height) +
+		       ty * tx * read(data, nix, niy, width, height);
+	}
+
+	static ccl_always_inline float4 interp_cubic(const TextureInfo& info,
+	                                             float x, float y)  /* cubic filter over the 4x4 texels around (x, y) */
+	{
+		const T *data = (const T*)info.data;
+		const int width = info.width;
+		const int height = info.height;
+		int ix, iy, nix, niy;
+		const float tx = frac(x*(float)width - 0.5f, &ix);
+		const float ty = frac(y*(float)height - 0.5f, &iy);
+		int pix, piy, nnix, nniy;
+		switch(info.extension) {
+			case EXTENSION_REPEAT:
+				ix = wrap_periodic(ix, width);
+				iy = wrap_periodic(iy, height);
+				pix = wrap_periodic(ix-1, width);
+				piy = wrap_periodic(iy-1, height);
+				nix = wrap_periodic(ix+1, width);
+				niy = wrap_periodic(iy+1, height);
+				nnix = wrap_periodic(ix+2, width);
+				nniy = wrap_periodic(iy+2, height);
+				break;
+			case EXTENSION_CLIP:  /* indices stay unwrapped: the bounds-checked read() returns zero outside the image */
+				pix = ix - 1;
+				piy = iy - 1;
+				nix = ix + 1;
+				niy = iy + 1;
+				nnix = ix + 2;
+				nniy = iy + 2;
+				break;
+			case EXTENSION_EXTEND:
+				pix = wrap_clamp(ix-1, width);  /* neighbours are derived before ix/iy themselves are clamped */
+				piy = wrap_clamp(iy-1, height);
+				nix = wrap_clamp(ix+1, width);
+				niy = wrap_clamp(iy+1, height);
+				nnix = wrap_clamp(ix+2, width);
+				nniy = wrap_clamp(iy+2, height);
+				ix = wrap_clamp(ix, width);
+				iy = wrap_clamp(iy, height);
+				break;
+			default:  /* unhandled extension value */
+				kernel_assert(0);
+				return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		}
+		const int xc[4] = {pix, ix, nix, nnix};
+		const int yc[4] = {piy, iy, niy, nniy};
+		float u[4], v[4];
+		/* Helper macros to keep the code a reasonable size;
+		 * let the compiler inline all the matrix multiplications.
+		 */
+#define DATA(x, y) (read(data, xc[x], yc[y], width, height))
+#define TERM(col) \
+		(v[col] * (u[0] * DATA(0, col) + \
+		           u[1] * DATA(1, col) + \
+		           u[2] * DATA(2, col) + \
+		           u[3] * DATA(3, col)))
+
+		SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+		SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+
+		/* Actual interpolation: u weights the four x positions, v the four y positions. */
+		return TERM(0) + TERM(1) + TERM(2) + TERM(3);
+#undef TERM
+#undef DATA
+	}
+
+	static ccl_always_inline float4 interp(const TextureInfo& info,
+	                                       float x, float y)
+	{
+		if(UNLIKELY(!info.data)) {
+			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);  /* no pixel data */
+		}
+		if(info.interpolation == INTERPOLATION_CLOSEST) {
+			return interp_closest(info, x, y);
+		}
+		if(info.interpolation == INTERPOLATION_LINEAR) {
+			return interp_linear(info, x, y);
+		}
+		/* Every other interpolation mode is sampled with the cubic filter. */
+		return interp_cubic(info, x, y);
+	}
+
+	/* ********  3D interpolation ******** */
+
+	static ccl_always_inline float4 interp_3d_closest(const TextureInfo& info,
+	                                                  float x, float y, float z)  /* single-voxel lookup */
+	{
+		int width = info.width;
+		int height = info.height;
+		int depth = info.depth;
+		int ix, iy, iz;
+
+		frac(x*(float)width, &ix);  /* only the integer cell is used, the returned fraction is dropped */
+		frac(y*(float)height, &iy);
+		frac(z*(float)depth, &iz);
+
+		switch(info.extension) {
+			case EXTENSION_REPEAT:
+				ix = wrap_periodic(ix, width);
+				iy = wrap_periodic(iy, height);
+				iz = wrap_periodic(iz, depth);
+				break;
+			case EXTENSION_CLIP:  /* zero outside [0, 1], otherwise handled like EXTEND */
+				if(x < 0.0f || y < 0.0f || z < 0.0f ||
+				   x > 1.0f || y > 1.0f || z > 1.0f)
+				{
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				}
+				ATTR_FALLTHROUGH;
+			case EXTENSION_EXTEND:
+				ix = wrap_clamp(ix, width);
+				iy = wrap_clamp(iy, height);
+				iz = wrap_clamp(iz, depth);
+				break;
+			default:  /* unhandled extension value */
+				kernel_assert(0);
+				return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		}
+
+		const T *data = (const T*)info.data;
+		return read(data[ix + iy*width + iz*width*height]);  /* x varies fastest, then y, then z */
+	}
+
+	static ccl_always_inline float4 interp_3d_linear(const TextureInfo& info,
+	                                                 float x, float y, float z)  /* trilinear blend of the 2x2x2 voxels around (x, y, z) */
+	{
+		int width = info.width;
+		int height = info.height;
+		int depth = info.depth;
+		int ix, iy, iz;
+		int nix, niy, niz;
 
-ccl_device float4 kernel_tex_image_interp_3d_impl(KernelGlobals *kg, int tex, float x, float y, float z)
+		float tx = frac(x*(float)width - 0.5f, &ix);  /* tx is 0 when x*width equals ix + 0.5 */
+		float ty = frac(y*(float)height - 0.5f, &iy);
+		float tz = frac(z*(float)depth - 0.5f, &iz);
+
+		switch(info.extension) {
+			case EXTENSION_REPEAT:
+				ix = wrap_periodic(ix, width);
+				iy = wrap_periodic(iy, height);
+				iz = wrap_periodic(iz, depth);
+
+				nix = wrap_periodic(ix+1, width);
+				niy = wrap_periodic(iy+1, height);
+				niz = wrap_periodic(iz+1, depth);
+				break;
+			case EXTENSION_CLIP:  /* zero outside [0, 1], otherwise handled like EXTEND */
+				if(x < 0.0f || y < 0.0f || z < 0.0f ||
+				   x > 1.0f || y > 1.0f || z > 1.0f)
+				{
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				}
+				ATTR_FALLTHROUGH;
+			case EXTENSION_EXTEND:
+				nix = wrap_clamp(ix+1, width);  /* neighbours are derived before ix/iy/iz themselves are clamped */
+				niy = wrap_clamp(iy+1, height);
+				niz = wrap_clamp(iz+1, depth);
+
+				ix = wrap_clamp(ix, width);
+				iy = wrap_clamp(iy, height);
+				iz = wrap_clamp(iz, depth);
+				break;
+			default:  /* unhandled extension value */
+				kernel_assert(0);
+				return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		}
+
+		const T *data = (const T*)info.data;  /* indexed directly below, without the bounds-checked read() */
+		float4 r;
+
+		r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]);
+		r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]);
+		r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]);
+		r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]);
+
+		r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]);
+		r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]);
+		r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]);
+		r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]);
+
+		return r;
+	}
+
+	/* TODO(sergey): For some unspeakable reason both GCC-6 and Clang-3.9
+	 * cause a stack overflow in this function unless it is inlined.
+	 *
+	 * Only happens for the AVX2 kernel with global __KERNEL_SSE__
+	 * vectorization enabled.
+	 */
+#ifdef __GNUC__
+	static ccl_always_inline
+#else
+	static ccl_never_inline
+#endif
+	float4 interp_3d_tricubic(const TextureInfo& info, float x, float y, float z)
+	{
+		int width = info.width;
+		int height = info.height;
+		int depth = info.depth;
+		int ix, iy, iz;
+		int nix, niy, niz;
+		/* Tricubic b-spline interpolation over the 4x4x4 voxels around (x, y, z). */
+		const float tx = frac(x*(float)width - 0.5f, &ix);
+		const float ty = frac(y*(float)height - 0.5f, &iy);
+		const float tz = frac(z*(float)depth - 0.5f, &iz);
+		int pix, piy, piz, nnix, nniy, nniz;
+
+		switch(info.extension) {
+			case EXTENSION_REPEAT:
+				ix = wrap_periodic(ix, width);
+				iy = wrap_periodic(iy, height);
+				iz = wrap_periodic(iz, depth);
+
+				pix = wrap_periodic(ix-1, width);
+				piy = wrap_periodic(iy-1, height);
+				piz = wrap_periodic(iz-1, depth);
+
+				nix = wrap_periodic(ix+1, width);
+				niy = wrap_periodic(iy+1, height);
+				niz = wrap_periodic(iz+1, depth);
+
+				nnix = wrap_periodic(ix+2, width);
+				nniy = wrap_periodic(iy+2, height);
+				nniz = wrap_periodic(iz+2, depth);
+				break;
+			case EXTENSION_CLIP:  /* zero outside [0, 1], otherwise handled like EXTEND */
+				if(x < 0.0f || y < 0.0f || z < 0.0f ||
+				   x > 1.0f || y > 1.0f || z > 1.0f)
+				{
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				}
+				ATTR_FALLTHROUGH;
+			case EXTENSION_EXTEND:
+				pix = wrap_clamp(ix-1, width);  /* neighbours are derived before ix/iy/iz themselves are clamped */
+				piy = wrap_clamp(iy-1, height);
+				piz = wrap_clamp(iz-1, depth);
+
+				nix = wrap_clamp(ix+1, width);
+				niy = wrap_clamp(iy+1, height);
+				niz = wrap_clamp(iz+1, depth);
+
+				nnix = wrap_clamp(ix+2, width);
+				nniy = wrap_clamp(iy+2, height);
+				nniz = wrap_clamp(iz+2, depth);
+
+				ix = wrap_clamp(ix, width);
+				iy = wrap_clamp(iy, height);
+				iz = wrap_clamp(iz, depth);
+				break;
+			default:  /* unhandled extension value */
+				kernel_assert(0);
+				return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		}
+
+		const int xc[4] = {pix, ix, nix, nnix};  /* x indices; yc/zc below are pre-multiplied row and slice offsets */
+		const int yc[4] = {width * piy,
+		                   width * iy,
+		                   width * niy,
+		                   width * nniy};
+		const int zc[4] = {width * height * piz,
+		                   width * height * iz,
+		                   width * height * niz,
+		                   width * height * nniz};
+		float u[4], v[4], w[4];
+
+		/* Helper macros to keep the code a reasonable size;
+		 * let the compiler inline all the matrix multiplications.
+		 */
+#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
+#define COL_TERM(col, row) \
+		(v[col] * (u[0] * DATA(0, col, row) + \
+		           u[1] * DATA(1, col, row) + \
+		           u[2] * DATA(2, col, row) + \
+		           u[3] * DATA(3, col, row)))
+#define ROW_TERM(row) \
+		(w[row] * (COL_TERM(0, row) + \
+		           COL_TERM(1, row) + \
+		           COL_TERM(2, row) + \
+		           COL_TERM(3, row)))
+
+		SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+		SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+		SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+		/* Actual interpolation: u weights x, v weights y and w weights z. */
+		const T *data = (const T*)info.data;
+		return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+#undef COL_TERM
+#undef ROW_TERM
+#undef DATA
+	}
+
+	static ccl_always_inline float4 interp_3d(const TextureInfo& info,
+	                                          float x, float y, float z,
+	                                          InterpolationType interp)  /* interp overrides info.interpolation unless it is INTERPOLATION_NONE */
+	{
+		if(UNLIKELY(!info.data))  /* no pixel data: sample as all zeros */
+			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+		switch((interp == INTERPOLATION_NONE)? info.interpolation: interp) {
+			case INTERPOLATION_CLOSEST:
+				return interp_3d_closest(info, x, y, z);
+			case INTERPOLATION_LINEAR:
+				return interp_3d_linear(info, x, y, z);
+			default:  /* every other mode uses the tricubic filter */
+				return interp_3d_tricubic(info, x, y, z);
+		}
+	}
+#undef SET_CUBIC_SPLINE_WEIGHTS
+};
+
+ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)  /* 2D lookup for texture slot id */
 {
-	if(tex >= TEX_START_HALF_CPU)
-		return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp_3d(x, y, z);
-	else if(tex >= TEX_START_BYTE_CPU)
-		return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp_3d(x, y, z);
-	else if(tex >= TEX_START_FLOAT_CPU)
-		return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp_3d(x, y, z);
-	else if(tex >= TEX_START_HALF4_CPU)
-		return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp_3d(x, y, z);
-	else if(tex >= TEX_START_BYTE4_CPU)
-		return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp_3d(x, y, z);
-	else
-		return kg->texture_float4_images[tex].interp_3d(x, y, z);
+	const TextureInfo& info = kernel_tex_fetch(__texture_info, id);
 
+	switch(kernel_tex_type(id)) {  /* pick the TextureInterpolator instantiation for the texel type */
+		case IMAGE_DATA_TYPE_HALF:
+			return TextureInterpolator<half>::interp(info, x, y);
+		case IMAGE_DATA_TYPE_BYTE:
+			return TextureInterpolator<uchar>::interp(info, x, y);
+		case IMAGE_DATA_TYPE_FLOAT:
+			return TextureInterpolator<float>::interp(info, x, y);
+		case IMAGE_DATA_TYPE_HALF4:
+			return TextureInterpolator<half4>::interp(info, x, y);
+		case IMAGE_DATA_TYPE_BYTE4:
+			return TextureInterpolator<uchar4>::interp(info, x, y);
+		case IMAGE_DATA_TYPE_FLOAT4:
+		default:  /* unrecognised types are read as float4 */
+			return TextureInterpolator<float4>::interp(info, x, y);
+	}
 }
 
-ccl_device float4 kernel_tex_image_interp_3d_ex_impl(KernelGlobals *kg, int tex, float x, float y, float z, int interpolation)
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, InterpolationType interp)  /* 3D lookup for texture slot id */
 {
-	if(tex >= TEX_START_HALF_CPU)
-		return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp_3d_ex(x, y, z, interpolation);
-	else if(tex >= TEX_START_BYTE_CPU)
-		return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp_3d_ex(x, y, z, interpolation);
-	else if(tex >= TEX_START_FLOAT_CPU)
-		return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp_3d_ex(x, y, z, interpolation);
-	else if(tex >= TEX_START_HALF4_CPU)
-		return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp_3d_ex(x, y, z, interpolation);
-	else if(tex >= TEX_START_BYTE4_CPU)
-		return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp_3d_ex(x, y, z, interpolation);
-	else
-		return kg->texture_float4_images[tex].interp_3d_ex(x, y, z, interpolation);
+	const TextureInfo& info = kernel_tex_fetch(__texture_info, id);
+
+	switch(kernel_tex_type(id)) {  /* pick the TextureInterpolator instantiation for the texel type */
+		case IMAGE_DATA_TYPE_HALF:
+			return TextureInterpolator<half>::interp_3d(info, x, y, z, interp);
+		case IMAGE_DATA_TYPE_BYTE:
+			return TextureInterpolator<uchar>::interp_3d(info, x, y, z, interp);
+		case IMAGE_DATA_TYPE_FLOAT:
+			return TextureInterpolator<float>::interp_3d(info, x, y, z, interp);
+		case IMAGE_DATA_TYPE_HALF4:
+			return TextureInterpolator<half4>::interp_3d(info, x, y, z, interp);
+		case IMAGE_DATA_TYPE_BYTE4:
+			return TextureInterpolator<uchar4>::interp_3d(info, x, y, z, interp);
+		case IMAGE_DATA_TYPE_FLOAT4:
+		default:  /* unrecognised types are read as float4 */
+			return TextureInterpolator<float4>::interp_3d(info, x, y, z, interp);
+	}
 }
 
 CCL_NAMESPACE_END
 
-#endif  // __KERNEL_CPU__
-
-
 #endif // __KERNEL_CPU_IMAGE_H__
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index ec82d4b4c22..ccca023a15f 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -20,43 +20,83 @@
  * simply includes this file without worry of copying actual implementation over.
  */
 
-#include "kernel_compat_cpu.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_cpu_image.h"
-#include "kernel_film.h"
-#include "kernel_path.h"
-#include "kernel_path_branched.h"
-#include "kernel_bake.h"
+#include "kernel/kernel_compat_cpu.h"
+
+#ifndef KERNEL_STUB  /* real kernels: pull in the implementation headers */
+#  ifndef __SPLIT_KERNEL__  /* non-split kernel headers */
+#    include "kernel/kernel_math.h"
+#    include "kernel/kernel_types.h"
+
+#    include "kernel/split/kernel_split_data.h"
+#    include "kernel/kernel_globals.h"
+
+#    include "kernel/kernels/cpu/kernel_cpu_image.h"
+#    include "kernel/kernel_film.h"
+#    include "kernel/kernel_path.h"
+#    include "kernel/kernel_path_branched.h"
+#    include "kernel/kernel_bake.h"
+#  else  /* __SPLIT_KERNEL__: one header per split kernel step */
+#    include "kernel/split/kernel_split_common.h"
+
+#    include "kernel/split/kernel_data_init.h"
+#    include "kernel/split/kernel_path_init.h"
+#    include "kernel/split/kernel_scene_intersect.h"
+#    include "kernel/split/kernel_lamp_emission.h"
+#    include "kernel/split/kernel_do_volume.h"
+#    include "kernel/split/kernel_queue_enqueue.h"
+#    include "kernel/split/kernel_indirect_background.h"
+#    include "kernel/split/kernel_shader_setup.h"
+#    include "kernel/split/kernel_shader_sort.h"
+#    include "kernel/split/kernel_shader_eval.h"
+#    include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#    include "kernel/split/kernel_subsurface_scatter.h"
+#    include "kernel/split/kernel_direct_lighting.h"
+#    include "kernel/split/kernel_shadow_blocked_ao.h"
+#    include "kernel/split/kernel_shadow_blocked_dl.h"
+#    include "kernel/split/kernel_enqueue_inactive.h"
+#    include "kernel/split/kernel_next_iteration_setup.h"
+#    include "kernel/split/kernel_indirect_subsurface.h"
+#    include "kernel/split/kernel_buffer_update.h"
+#  endif  /* __SPLIT_KERNEL__ */
+#else  /* KERNEL_STUB: entry points only assert. NOTE(review): '#arch' below stringizes its argument as spelled, so STUB_ASSERT(KERNEL_ARCH, ...) reports the text KERNEL_ARCH rather than the architecture name. */
+#  define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
+
+#  ifdef __SPLIT_KERNEL__
+#    include "kernel/split/kernel_data_init.h"
+#  endif  /* __SPLIT_KERNEL__ */
+#endif  /* KERNEL_STUB */
 
 CCL_NAMESPACE_BEGIN
 
+#ifndef __SPLIT_KERNEL__
+
 /* Path Tracing */
 
 void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
                                            float *buffer,
-                                           unsigned int *rng_state,
                                            int sample,
                                            int x, int y,
                                            int offset,
                                            int stride)
 {
-#ifdef __BRANCHED_PATH__
+#ifdef KERNEL_STUB  /* stub build: this entry point must never be called */
+	STUB_ASSERT(KERNEL_ARCH, path_trace);
+#else  /* !KERNEL_STUB */
+#  ifdef __BRANCHED_PATH__
 	if(kernel_data.integrator.branched) {
 		kernel_branched_path_trace(kg,
 		                           buffer,
-		                           rng_state,
 		                           sample,
 		                           x, y,
 		                           offset,
 		                           stride);
 	}
 	else
-#endif
+#  endif  /* __BRANCHED_PATH__ */
 	{
-		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+		kernel_path_trace(kg, buffer, sample, x, y, offset, stride);
 	}
+#endif /* KERNEL_STUB */
 }
 
 /* Film */
@@ -69,6 +109,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
                                                 int offset,
                                                 int stride)
 {
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, convert_to_byte);
+#else
 	kernel_film_convert_to_byte(kg,
 	                            rgba,
 	                            buffer,
@@ -76,6 +119,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
 	                            x, y,
 	                            offset,
 	                            stride);
+#endif /* KERNEL_STUB */
 }
 
 void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
@@ -86,6 +130,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
                                                       int offset,
                                                       int stride)
 {
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, convert_to_half_float);
+#else
 	kernel_film_convert_to_half_float(kg,
 	                                  rgba,
 	                                  buffer,
@@ -93,6 +140,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
 	                                  x, y,
 	                                  offset,
 	                                  stride);
+#endif /* KERNEL_STUB */
 }
 
 /* Shader Evaluate */
@@ -100,16 +148,17 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
 void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
                                        uint4 *input,
                                        float4 *output,
-                                       float *output_luma,
                                        int type,
                                        int filter,
                                        int i,
                                        int offset,
                                        int sample)
 {
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, shader);
+#else
 	if(type >= SHADER_EVAL_BAKE) {
-		kernel_assert(output_luma == NULL);
-#ifdef __BAKING__
+#  ifdef __BAKING__
 		kernel_bake_evaluate(kg,
 		                     input,
 		                     output,
@@ -118,17 +167,70 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
 		                     i,
 		                     offset,
 		                     sample);
-#endif
+#  endif
+	}
+	else if(type == SHADER_EVAL_DISPLACE) {
+		kernel_displace_evaluate(kg, input, output, i);
 	}
 	else {
-		kernel_shader_evaluate(kg,
-		                       input,
-		                       output,
-		                       output_luma,
-		                       (ShaderEvalType)type,
-		                       i,
-		                       sample);
+		kernel_background_evaluate(kg, input, output, i);
 	}
+#endif /* KERNEL_STUB */
 }
 
+#else  /* __SPLIT_KERNEL__ */
+
+/* Split Kernel Path Tracing: each entry point forwards to kernel_<name>(kg); the _LOCALS variant also passes a ccl_local variable of the given type. */
+
+#ifdef KERNEL_STUB  /* stub build: both macro variants only assert, `type` is unused */
+#  define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+	{ \
+		STUB_ASSERT(KERNEL_ARCH, name); \
+	}
+
+#  define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
+	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+	{ \
+		STUB_ASSERT(KERNEL_ARCH, name); \
+	}
+#else  /* !KERNEL_STUB */
+#  define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+	{ \
+		kernel_##name(kg); \
+	}
+
+#  define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
+	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+	{ \
+		ccl_local type locals; \
+		kernel_##name(kg, &locals); \
+	}
+#endif /* KERNEL_STUB */
+
+DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
+DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
+DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
+DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
+#endif  /* __SPLIT_KERNEL__ */
+
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp
new file mode 100644
index 00000000000..ca750e5a00d
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU split kernel entry points (KERNEL_ARCH cpu). */
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
+ * one with SSE2 intrinsics.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+#  define __KERNEL_SSE2__
+#endif
+
+#define __SPLIT_KERNEL__  /* selects the split-kernel branch of kernel_cpu_impl.h */
+
+/* When building the kernel for the native machine, detect kernel features
+ * from the flags set by the compiler.
+ */
+#ifdef WITH_KERNEL_NATIVE
+#  ifdef __SSE2__
+#    ifndef __KERNEL_SSE2__  /* may already be defined by the x86-64 check above */
+#      define __KERNEL_SSE2__
+#    endif
+#  endif
+#  ifdef __SSE3__
+#    define __KERNEL_SSE3__
+#  endif
+#  ifdef __SSSE3__
+#    define __KERNEL_SSSE3__
+#  endif
+#  ifdef __SSE4_1__
+#    define __KERNEL_SSE41__
+#  endif
+#  ifdef __AVX__
+#    define __KERNEL_AVX__
+#  endif
+#  ifdef __AVX2__
+#    define __KERNEL_SSE__
+#    define __KERNEL_AVX2__
+#  endif
+#endif
+
+/* Quiet unused define warnings. */
+#if defined(__KERNEL_SSE2__)
+    /* do nothing */
+#endif
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu  /* kernel_cpu_impl.h #undefs KERNEL_ARCH again at its end */
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
new file mode 100644
index 00000000000..6ba3425a343
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+#define __SPLIT_KERNEL__  /* selects the split-kernel branch of kernel_cpu_impl.h */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+#  define KERNEL_STUB  /* kernel_cpu_impl.h then emits asserting stubs only */
+#else
+/* SSE optimization disabled for now on 32 bit GCC-compatible builds, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#    define __KERNEL_AVX__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx  /* kernel_cpu_impl.h #undefs KERNEL_ARCH again at its end */
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
new file mode 100644
index 00000000000..76b2d77ebb8
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+#define __SPLIT_KERNEL__  /* selects the split-kernel branch of kernel_cpu_impl.h */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+#  define KERNEL_STUB  /* kernel_cpu_impl.h then emits asserting stubs only */
+#else
+/* SSE optimization disabled for now on 32 bit GCC-compatible builds, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#    define __KERNEL_AVX__
+#    define __KERNEL_AVX2__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx2  /* kernel_cpu_impl.h #undefs KERNEL_ARCH again at its end */
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
new file mode 100644
index 00000000000..b468b6f44c8
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#define __SPLIT_KERNEL__
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
new file mode 100644
index 00000000000..3e5792d0b17
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#define __SPLIT_KERNEL__
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
new file mode 100644
index 00000000000..3629f21cd29
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#define __SPLIT_KERNEL__
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
index a5f2d6e7294..57530c88710 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
@@ -18,15 +18,17 @@
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE2__
-#endif
-
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-#  include "kernel.h"
-#  define KERNEL_ARCH cpu_sse2
-#  include "kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
index 86f9ce991f8..c607753bc4b 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
@@ -18,17 +18,19 @@
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE2__
-#  define __KERNEL_SSE3__
-#  define __KERNEL_SSSE3__
-#endif
-
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-#  include "kernel.h"
-#  define KERNEL_ARCH cpu_sse3
-#  include "kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
index c174406047d..a278554731c 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
@@ -18,18 +18,20 @@
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE2__
-#  define __KERNEL_SSE3__
-#  define __KERNEL_SSSE3__
-#  define __KERNEL_SSE41__
-#endif
-
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-#  include "kernel.h"
-#  define KERNEL_ARCH cpu_sse41
-#  include "kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
new file mode 100644
index 00000000000..035f0484488
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -0,0 +1,301 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA kernel entry points */
+
+#ifdef __CUDA_ARCH__
+
+#include "kernel_config.h"
+
+#include "kernel/kernel_compat_cuda.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+/* kernels */
+
+extern "C" __global__ void  /* Entry point: one thread per pixel, forwards to kernel_filter_divide_shadow(). */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_divide_shadow(int sample,
+                                 TilesInfo *tiles,
+                                 float *unfilteredA,
+                                 float *unfilteredB,
+                                 float *sampleVariance,
+                                 float *sampleVarianceV,
+                                 float *bufferVariance,
+                                 int4 prefilter_rect,
+                                 int buffer_pass_stride,
+                                 int buffer_denoising_offset)
+{
+	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;  /* this thread's column, offset by the rect's x */
+	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;  /* this thread's row, offset by the rect's y */
+	if(x < prefilter_rect.z && y < prefilter_rect.w) {  /* .z/.w are used as exclusive upper bounds; surplus threads do nothing */
+		kernel_filter_divide_shadow(sample,
+		                            tiles,
+		                            x, y,
+		                            unfilteredA,
+		                            unfilteredB,
+		                            sampleVariance,
+		                            sampleVarianceV,
+		                            bufferVariance,
+		                            prefilter_rect,
+		                            buffer_pass_stride,
+		                            buffer_denoising_offset);
+	}
+}
+
+extern "C" __global__ void  /* Entry point: one thread per pixel, forwards to kernel_filter_get_feature(). */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_get_feature(int sample,
+                               TilesInfo *tiles,
+                               int m_offset,
+                               int v_offset,
+                               float *mean,
+                               float *variance,
+                               int4 prefilter_rect,
+                               int buffer_pass_stride,
+                               int buffer_denoising_offset)
+{
+	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;  /* column, starting at the rect's x */
+	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;  /* row, starting at the rect's y */
+	if(x < prefilter_rect.z && y < prefilter_rect.w) {  /* threads at or beyond .z/.w fall outside the rect and skip the call */
+		kernel_filter_get_feature(sample,
+		                          tiles,
+		                          m_offset, v_offset,
+		                          x, y,
+		                          mean, variance,
+		                          prefilter_rect,
+		                          buffer_pass_stride,
+		                          buffer_denoising_offset);
+	}
+}
+
+extern "C" __global__ void  /* Entry point: one thread per pixel, forwards to kernel_filter_detect_outliers(). */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_detect_outliers(float *image,
+                                   float *variance,
+                                   float *depth,
+                                   float *output,
+                                   int4 prefilter_rect,
+                                   int pass_stride)
+{
+	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;  /* column, starting at the rect's x */
+	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;  /* row, starting at the rect's y */
+	if(x < prefilter_rect.z && y < prefilter_rect.w) {  /* only threads inside the rect call the filter */
+		kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
+	}
+}
+
+extern "C" __global__ void  /* Entry point: one thread per pixel, forwards to kernel_filter_combine_halves(). */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
+{
+	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;  /* column, starting at the rect's x */
+	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;  /* row, starting at the rect's y */
+	if(x < prefilter_rect.z && y < prefilter_rect.w) {  /* only threads inside the rect call the filter */
+		kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
+	}
+}
+
+extern "C" __global__ void  /* Entry point: one thread per filter_area pixel, forwards to kernel_filter_construct_transform(). */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_construct_transform(const float *ccl_restrict buffer,
+                                       float *transform, int *rank,
+                                       int4 filter_area, int4 rect,
+                                       int radius, float pca_threshold,
+                                       int pass_stride)
+{
+	int x = blockDim.x*blockIdx.x + threadIdx.x;  /* column relative to filter_area (origin added at the call below) */
+	int y = blockDim.y*blockIdx.y + threadIdx.y;  /* row relative to filter_area */
+	if(x < filter_area.z && y < filter_area.w) {  /* .z/.w are used as the area's width and height */
+		int *l_rank = rank + y*filter_area.z + x;  /* this pixel's slot, row-major over the filter area */
+		float *l_transform = transform + y*filter_area.z + x;  /* same slot offset into the transform storage */
+		kernel_filter_construct_transform(buffer,
+		                                  x + filter_area.x, y + filter_area.y,
+		                                  rect, pass_stride,
+		                                  l_transform, l_rank,
+		                                  radius, pca_threshold,
+		                                  filter_area.z*filter_area.w,  /* pixel count of the area; presumably the element stride - confirm in callee */
+		                                  threadIdx.y*blockDim.x + threadIdx.x);  /* linear index of this thread within its block */
+	}
+}
+
+extern "C" __global__ void  /* Entry point: forwards to kernel_filter_nlm_calc_difference() for the coordinates get_nlm_coords() yields. */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
+                                       const float *ccl_restrict variance_image,
+                                       float *difference_image,
+                                       int w,
+                                       int h,
+                                       int stride,
+                                       int shift_stride,
+                                       int r,
+                                       int channel_offset,
+                                       float a,
+                                       float k_2)
+{
+	int4 co, rect;  /* filled in by get_nlm_coords() below */
+	int ofs;  /* offset applied to difference_image for this thread */
+	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {  /* presumably false for threads with no work - helper not visible here */
+		kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
+		                                  weight_image,
+		                                  variance_image,
+		                                  difference_image + ofs,
+		                                  rect, stride,
+		                                  channel_offset, a, k_2);
+	}
+}
+
+extern "C" __global__ void  /* Entry point: forwards to kernel_filter_nlm_blur() for the coordinates get_nlm_coords() yields. */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image,
+                            float *out_image,
+                            int w,
+                            int h,
+                            int stride,
+                            int shift_stride,
+                            int r,
+                            int f)
+{
+	int4 co, rect;  /* filled in by get_nlm_coords() below */
+	int ofs;  /* common offset applied to both input and output images */
+	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {  /* presumably false for threads with no work - helper not visible here */
+		kernel_filter_nlm_blur(co.x, co.y,
+		                       difference_image + ofs,
+		                       out_image + ofs,
+		                       rect, stride, f);
+	}
+}
+
+extern "C" __global__ void  /* Entry point: forwards to kernel_filter_nlm_calc_weight() for the coordinates get_nlm_coords() yields. */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
+                                   float *out_image,
+                                   int w,
+                                   int h,
+                                   int stride,
+                                   int shift_stride,
+                                   int r,
+                                   int f)
+{
+	int4 co, rect;  /* filled in by get_nlm_coords() below */
+	int ofs;  /* common offset applied to both input and output images */
+	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {  /* presumably false for threads with no work - helper not visible here */
+		kernel_filter_nlm_calc_weight(co.x, co.y,
+		                              difference_image + ofs,
+		                              out_image + ofs,
+		                              rect, stride, f);
+	}
+}
+
+extern "C" __global__ void  /* Entry point: forwards to kernel_filter_nlm_update_output() for the coordinates get_nlm_coords() yields. */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
+                                     const float *ccl_restrict image,
+                                     float *out_image,
+                                     float *accum_image,
+                                     int w,
+                                     int h,
+                                     int stride,
+                                     int shift_stride,
+                                     int r,
+                                     int f)
+{
+	int4 co, rect;  /* filled in by get_nlm_coords() below */
+	int ofs;  /* applied to difference_image only; the other images are passed unshifted */
+	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {  /* presumably false for threads with no work - helper not visible here */
+		kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
+		                                difference_image + ofs,
+		                                image,
+		                                out_image,
+		                                accum_image,
+		                                rect, stride, f);
+	}
+}
+
+extern "C" __global__ void  /* Entry point: one thread per pixel of the w x h image, forwards to kernel_filter_nlm_normalize(). */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_normalize(float *out_image,
+                                 const float *ccl_restrict accum_image,
+                                 int w,
+                                 int h,
+                                 int stride)
+{
+	int x = blockDim.x*blockIdx.x + threadIdx.x;  /* column from the 2D launch index, no origin offset */
+	int y = blockDim.y*blockIdx.y + threadIdx.y;  /* row from the 2D launch index, no origin offset */
+	if(x < w && y < h) {  /* threads outside the image skip the call */
+		kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
+	}
+}
+
+extern "C" __global__ void  /* Entry point: forwards to kernel_filter_nlm_construct_gramian() for the coordinates get_nlm_coords_window() yields. */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_image,
+                                         const float *ccl_restrict buffer,
+                                         const float *ccl_restrict transform,
+                                         int *rank,
+                                         float *XtWX,
+                                         float3 *XtWY,
+                                         int4 filter_window,
+                                         int w,
+                                         int h,
+                                         int stride,
+                                         int shift_stride,
+                                         int r,
+                                         int f,
+                                         int pass_stride)
+{
+	int4 co, rect;  /* filled in by get_nlm_coords_window() below */
+	int ofs;  /* applied to difference_image only */
+	if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {  /* presumably false for threads with no work - helper not visible here */
+		kernel_filter_nlm_construct_gramian(co.x, co.y,
+		                                    co.z, co.w,
+		                                    difference_image + ofs,
+		                                    buffer,
+		                                    transform, rank,
+		                                    XtWX, XtWY,
+		                                    rect, filter_window,
+		                                    stride, f,
+		                                    pass_stride,
+		                                    threadIdx.y*blockDim.x + threadIdx.x);  /* linear index of this thread within its block */
+	}
+}
+
+extern "C" __global__ void  /* Entry point: one thread per filter_area pixel, forwards to kernel_filter_finalize(). */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_finalize(float *buffer,
+                            int *rank,
+                            float *XtWX,
+                            float3 *XtWY,
+                            int4 filter_area,
+                            int4 buffer_params,
+                            int sample)
+{
+	int x = blockDim.x*blockIdx.x + threadIdx.x;  /* column relative to filter_area; passed on without adding filter_area.x */
+	int y = blockDim.y*blockIdx.y + threadIdx.y;  /* row relative to filter_area; passed on without adding filter_area.y */
+	if(x < filter_area.z && y < filter_area.w) {  /* .z/.w are used as the area's width and height */
+		int storage_ofs = y*filter_area.z+x;  /* this pixel's slot, row-major over the filter area */
+		rank += storage_ofs;  /* advance all three arrays to this pixel's slot */
+		XtWX += storage_ofs;
+		XtWY += storage_ofs;
+		kernel_filter_finalize(x, y, buffer, rank,
+		                       filter_area.z*filter_area.w,  /* pixel count of the area; presumably the element stride - confirm in callee */
+		                       XtWX, XtWY,
+		                       buffer_params, sample);
+	}
+}
+
+#endif
+
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 090ab2c50c2..3c93e00ccf1 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -16,137 +16,52 @@
 
 /* CUDA kernel entry points */
 
-#include "../../kernel_compat_cuda.h"
-#include "../../kernel_math.h"
-#include "../../kernel_types.h"
-#include "../../kernel_globals.h"
-#include "../../kernel_film.h"
-#include "../../kernel_path.h"
-#include "../../kernel_path_branched.h"
-#include "../../kernel_bake.h"
-
-/* device data taken from CUDA occupancy calculator */
-
 #ifdef __CUDA_ARCH__
 
-/* 2.0 and 2.1 */
-#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 32
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
-
-/* 3.0 and 3.5 */
-#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 63
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.2 */
-#elif __CUDA_ARCH__ == 320
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 63
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.7 */
-#elif __CUDA_ARCH__ == 370
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 63
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 5.0, 5.2, 5.3, 6.0, 6.1 */
-#elif __CUDA_ARCH__ >= 500
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 48
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* unknown architecture */
-#else
-#  error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
-#endif
-
-/* compute number of threads per block and minimum blocks per multiprocessor
- * given the maximum number of registers per thread */
-
-#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
-	__launch_bounds__( \
-		threads_block_width*threads_block_width, \
-		CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
-		)
-
-/* sanity checks */
-
-#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
-#  error "Maximum number of threads per block exceeded"
-#endif
-
-#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
-#  error "Maximum number of blocks per multiprocessor exceeded"
-#endif
+#include "kernel/kernel_compat_cuda.h"
+#include "kernel_config.h"
 
-#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-#  error "Maximum number of registers per thread exceeded"
-#endif
+#include "util/util_atomic.h"
 
-#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-#  error "Maximum number of registers per thread exceeded"
-#endif
+#include "kernel/kernel_math.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernels/cuda/kernel_cuda_image.h"
+#include "kernel/kernel_film.h"
+#include "kernel/kernel_path.h"
+#include "kernel/kernel_path_branched.h"
+#include "kernel/kernel_bake.h"
+#include "kernel/kernel_work_stealing.h"
 
 /* kernels */
-
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+kernel_cuda_path_trace(WorkTile *tile, uint total_work_size)
 {
-	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+	uint work_index = ccl_global_id(0);  /* unsigned so the bound check below compares like with like */
+
+	if(work_index < total_work_size) {  /* threads past the last work item do nothing */
+		uint x, y, sample;  /* filled in by get_work_pixel() from the work index */
+		get_work_pixel(tile, work_index, &x, &y, &sample);
 
-	if(x < sx + sw && y < sy + sh) {
 		KernelGlobals kg;
-		kernel_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride);
+		kernel_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
 	}
 }
 
 #ifdef __BRANCHED_PATH__
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
-kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
 {
-	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+	uint work_index = ccl_global_id(0);  /* unsigned so the bound check below compares like with like */
+
+	if(work_index < total_work_size) {  /* threads past the last work item do nothing */
+		uint x, y, sample;  /* filled in by get_work_pixel() from the work index */
+		get_work_pixel(tile, work_index, &x, &y, &sample);
 
-	if(x < sx + sw && y < sy + sh) {
 		KernelGlobals kg;
-		kernel_branched_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride);
+		kernel_branched_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
 	}
 }
 #endif
@@ -177,26 +92,37 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_shader(uint4 *input,
-                   float4 *output,
-                   float *output_luma,
-                   int type,
-                   int sx,
-                   int sw,
-                   int offset,
-                   int sample)
+kernel_cuda_displace(uint4 *input,
+                     float4 *output,
+                     int type,  /* not read by this kernel body */
+                     int sx,
+                     int sw,
+                     int offset,  /* not read by this kernel body */
+                     int sample)  /* not read by this kernel body */
+{
+	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;  /* 1D element index, starting at sx */
+
+	if(x < sx + sw) {  /* sw elements starting at sx; surplus threads do nothing */
+		KernelGlobals kg;
+		kernel_displace_evaluate(&kg, input, output, x);
+	}
+}
+
+extern "C" __global__ void  /* Entry point: one thread per element, forwards to kernel_background_evaluate(). */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_background(uint4 *input,
+                       float4 *output,
+                       int type,  /* not read by this kernel body */
+                       int sx,
+                       int sw,
+                       int offset,  /* not read by this kernel body */
+                       int sample)  /* not read by this kernel body */
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
 	if(x < sx + sw) {
 		KernelGlobals kg;
-		kernel_shader_evaluate(&kg,
-		                       input,
-		                       output,
-		                       output_luma,
-		                       (ShaderEvalType)type, 
-		                       x,
-		                       sample);
+		kernel_background_evaluate(&kg, input, output, x);
 	}
 }
 
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h
new file mode 100644
index 00000000000..f3d0d721c5c
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* device data taken from CUDA occupancy calculator */
+
+/* 3.0 and 3.5 */
+#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 63
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 63
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 3.7 */
+#elif __CUDA_ARCH__ == 370
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 63
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 5.0 and newer (5.0, 5.2, 5.3, 6.0, 6.1, ...) */
+#elif __CUDA_ARCH__ >= 500
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of registers */
+#  if __CUDACC_VER_MAJOR__ == 9 && __CUDA_ARCH__ >= 600
+#    define CUDA_KERNEL_MAX_REGISTERS 64
+#  else
+#    define CUDA_KERNEL_MAX_REGISTERS 48
+#  endif
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+
+/* unknown architecture */
+#else
+#  error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
+#endif
+
+/* For split kernel using all registers seems fastest for now, but this
+ * is unlikely to be optimal once we resolve other bottlenecks. */
+
+#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS
+
+/* Launch bounds for a square block: width*width threads per block, and as many
+ * blocks per multiprocessor as the register file allows at the given per-thread budget. */
+
+#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
+	__launch_bounds__( \
+		(threads_block_width)*(threads_block_width), \
+		CUDA_MULTIPRESSOR_MAX_REGISTERS/((threads_block_width)*(threads_block_width)*(thread_num_registers)) \
+		)
+
+/* sanity checks */
+
+#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
+#  error "Maximum number of threads per block exceeded"
+#endif
+
+#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
+#  error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#  error "Maximum number of registers per thread exceeded"
+#endif
+
+#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#  error "Maximum number of registers per thread exceeded"
+#endif
+
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
new file mode 100644
index 00000000000..91ad289a858
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright 2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Cubic B-spline basis functions w0..w3, evaluated at the fractional
+ * position a. This is the first one: (-a^3 + 3a^2 - 3a + 1) / 6. */
+ccl_device float cubic_w0(float a)
+{
+	const float one_sixth = 1.0f/6.0f;
+	const float poly = a*(a*(-a + 3.0f) - 3.0f) + 1.0f;
+
+	return one_sixth*poly;
+}
+
+/* Second cubic B-spline basis function: (3a^3 - 6a^2 + 4) / 6. */
+ccl_device float cubic_w1(float a)
+{
+	const float one_sixth = 1.0f/6.0f;
+	const float poly = a*a*(3.0f*a - 6.0f) + 4.0f;
+
+	return one_sixth*poly;
+}
+
+/* Third cubic B-spline basis function: (-3a^3 + 3a^2 + 3a + 1) / 6. */
+ccl_device float cubic_w2(float a)
+{
+	const float one_sixth = 1.0f/6.0f;
+	const float poly = a*(a*(-3.0f*a + 3.0f) + 3.0f) + 1.0f;
+
+	return one_sixth*poly;
+}
+
+/* Fourth cubic B-spline basis function: a^3 / 6. */
+ccl_device float cubic_w3(float a)
+{
+	const float one_sixth = 1.0f/6.0f;
+	const float cube = a*a*a;
+
+	return one_sixth*cube;
+}
+
+/* Amplitude functions g0 and g1: each is the sum of a pair of basis weights.
+ * g0 combines the first two weights, w0 and w1. */
+ccl_device float cubic_g0(float a)
+{
+	const float w0 = cubic_w0(a);
+	const float w1 = cubic_w1(a);
+
+	return w0 + w1;
+}
+
+/* Amplitude function g1: sum of the last two basis weights, w2 and w3. */
+ccl_device float cubic_g1(float a)
+{
+	const float w2 = cubic_w2(a);
+	const float w3 = cubic_w3(a);
+
+	return w2 + w3;
+}
+
+/* Offset functions h0 and h1 give the texel offsets at which the linearly
+ * filtered lookups are taken. h0 is the offset for the w0/w1 pair. */
+ccl_device float cubic_h0(float a)
+{
+	const float w0 = cubic_w0(a);
+	const float w1 = cubic_w1(a);
+	const float ratio = w1 / (w0 + w1);
+
+	/* Note +0.5 offset to compensate for CUDA linear filtering convention. */
+	return -1.0f + ratio + 0.5f;
+}
+
+/* Offset function h1: offset for the w2/w3 pair, including the same +0.5
+ * shift that cubic_h0 applies. */
+ccl_device float cubic_h1(float a)
+{
+	const float w2 = cubic_w2(a);
+	const float w3 = cubic_w3(a);
+	const float ratio = w3 / (w2 + w3);
+
+	return 1.0f + ratio + 0.5f;
+}
+
+/* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples.
+ *
+ * x and y are scaled by info.width / info.height into texel space, split
+ * into an integer texel and a fractional part, and the four tex2D fetches
+ * are blended with the g0/g1 amplitudes at the h0/h1 offsets. The fetch
+ * coordinates are divided by the image size again before being passed on. */
+template<typename T>
+ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo& info, CUtexObject tex, float x, float y)
+{
+	/* To texel space, shifted by half a texel. */
+	x = (x * info.width) - 0.5f;
+	y = (y * info.height) - 0.5f;
+
+	/* Integer texel and fractional position inside it. */
+	const float texel_x = floor(x);
+	const float texel_y = floor(y);
+	const float frac_x = x - texel_x;
+	const float frac_y = y - texel_y;
+
+	/* Blend amplitudes per axis. */
+	const float g0x = cubic_g0(frac_x);
+	const float g1x = cubic_g1(frac_x);
+	const float g0y = cubic_g0(frac_y);
+	const float g1y = cubic_g1(frac_y);
+
+	/* Fetch coordinates. */
+	const float u0 = (texel_x + cubic_h0(frac_x)) / info.width;
+	const float u1 = (texel_x + cubic_h1(frac_x)) / info.width;
+	const float v0 = (texel_y + cubic_h0(frac_y)) / info.height;
+	const float v1 = (texel_y + cubic_h1(frac_y)) / info.height;
+
+	/* Blend along x for each of the two rows, then along y. */
+	const T row0 = g0x * tex2D<T>(tex, u0, v0) +
+	               g1x * tex2D<T>(tex, u1, v0);
+	const T row1 = g0x * tex2D<T>(tex, u0, v1) +
+	               g1x * tex2D<T>(tex, u1, v1);
+
+	return g0y * row0 + g1y * row1;
+}
+
+/* Fast tricubic texture lookup using 8 trilinear lookups.
+ *
+ * Same scheme as the 2D bicubic lookup with a third axis: coordinates are
+ * scaled by info.width / info.height / info.depth, and eight tex3D fetches
+ * are blended along x, then y, then z. */
+template<typename T>
+ccl_device T kernel_tex_image_interp_bicubic_3d(const TextureInfo& info, CUtexObject tex, float x, float y, float z)
+{
+	/* To texel space, shifted by half a texel. */
+	x = (x * info.width) - 0.5f;
+	y = (y * info.height) - 0.5f;
+	z = (z * info.depth) - 0.5f;
+
+	/* Integer texel and fractional position inside it. */
+	const float texel_x = floor(x);
+	const float texel_y = floor(y);
+	const float texel_z = floor(z);
+	const float frac_x = x - texel_x;
+	const float frac_y = y - texel_y;
+	const float frac_z = z - texel_z;
+
+	/* Blend amplitudes per axis. */
+	const float g0x = cubic_g0(frac_x);
+	const float g1x = cubic_g1(frac_x);
+	const float g0y = cubic_g0(frac_y);
+	const float g1y = cubic_g1(frac_y);
+	const float g0z = cubic_g0(frac_z);
+	const float g1z = cubic_g1(frac_z);
+
+	/* Fetch coordinates. */
+	const float u0 = (texel_x + cubic_h0(frac_x)) / info.width;
+	const float u1 = (texel_x + cubic_h1(frac_x)) / info.width;
+	const float v0 = (texel_y + cubic_h0(frac_y)) / info.height;
+	const float v1 = (texel_y + cubic_h1(frac_y)) / info.height;
+	const float s0 = (texel_z + cubic_h0(frac_z)) / info.depth;
+	const float s1 = (texel_z + cubic_h1(frac_z)) / info.depth;
+
+	/* Blend along x for every (y, z) pair. */
+	const T row_v0s0 = g0x * tex3D<T>(tex, u0, v0, s0) +
+	                   g1x * tex3D<T>(tex, u1, v0, s0);
+	const T row_v1s0 = g0x * tex3D<T>(tex, u0, v1, s0) +
+	                   g1x * tex3D<T>(tex, u1, v1, s0);
+	const T row_v0s1 = g0x * tex3D<T>(tex, u0, v0, s1) +
+	                   g1x * tex3D<T>(tex, u1, v0, s1);
+	const T row_v1s1 = g0x * tex3D<T>(tex, u0, v1, s1) +
+	                   g1x * tex3D<T>(tex, u1, v1, s1);
+
+	/* Blend along y within each z slice, then along z. */
+	const T slice0 = g0y * row_v0s0 + g1y * row_v1s0;
+	const T slice1 = g0y * row_v0s1 + g1y * row_v1s1;
+
+	return g0z * slice0 + g1z * slice1;
+}
+
+/* 2D image texture lookup for image slot id at coordinates (x, y).
+ *
+ * The texture object is taken from the __texture_info entry of the slot.
+ * Four-channel types (float4, byte4, half4) are fetched as float4; all
+ * other types are fetched as a single float that is replicated to RGB with
+ * alpha set to 1. Cubic interpolation uses the bicubic helper, everything
+ * else is a plain tex2D fetch. */
+ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
+{
+	const TextureInfo& info = kernel_tex_fetch(__texture_info, id);
+	CUtexObject tex = (CUtexObject)info.data;
+
+	const int texture_type = kernel_tex_type(id);
+	const bool is_four_channel = (texture_type == IMAGE_DATA_TYPE_FLOAT4 ||
+	                              texture_type == IMAGE_DATA_TYPE_BYTE4 ||
+	                              texture_type == IMAGE_DATA_TYPE_HALF4);
+	const bool use_cubic = (info.interpolation == INTERPOLATION_CUBIC);
+
+	/* float4, byte4 and half4 */
+	if(is_four_channel) {
+		if(use_cubic) {
+			return kernel_tex_image_interp_bicubic<float4>(info, tex, x, y);
+		}
+		return tex2D<float4>(tex, x, y);
+	}
+
+	/* float, byte and half */
+	float value;
+	if(use_cubic) {
+		value = kernel_tex_image_interp_bicubic<float>(info, tex, x, y);
+	}
+	else {
+		value = tex2D<float>(tex, x, y);
+	}
+
+	return make_float4(value, value, value, 1.0f);
+}
+
+/* 3D image texture lookup for image slot id at coordinates (x, y, z).
+ *
+ * interp overrides the interpolation stored with the texture unless it is
+ * INTERPOLATION_NONE, in which case info.interpolation is used. Channel
+ * handling matches the 2D lookup: four-channel types return the fetched
+ * float4, other types return the scalar replicated to RGB with alpha 1. */
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, InterpolationType interp)
+{
+	const TextureInfo& info = kernel_tex_fetch(__texture_info, id);
+	CUtexObject tex = (CUtexObject)info.data;
+	uint interpolation = (interp == INTERPOLATION_NONE)? info.interpolation: interp;
+
+	const int texture_type = kernel_tex_type(id);
+	const bool is_four_channel = (texture_type == IMAGE_DATA_TYPE_FLOAT4 ||
+	                              texture_type == IMAGE_DATA_TYPE_BYTE4 ||
+	                              texture_type == IMAGE_DATA_TYPE_HALF4);
+	const bool use_cubic = (interpolation == INTERPOLATION_CUBIC);
+
+	if(is_four_channel) {
+		if(use_cubic) {
+			return kernel_tex_image_interp_bicubic_3d<float4>(info, tex, x, y, z);
+		}
+		return tex3D<float4>(tex, x, y, z);
+	}
+
+	float value;
+	if(use_cubic) {
+		value = kernel_tex_image_interp_bicubic_3d<float>(info, tex, x, y, z);
+	}
+	else {
+		value = tex3D<float>(tex, x, y, z);
+	}
+
+	return make_float4(value, value, value, 1.0f);
+}
+
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
new file mode 100644
index 00000000000..43b3d0aa0e6
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA split kernel entry points */
+
+#ifdef __CUDA_ARCH__
+
+#define __SPLIT_KERNEL__
+
+#include "kernel/kernel_compat_cuda.h"
+#include "kernel_config.h"
+
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_data_init.h"
+#include "kernel/split/kernel_path_init.h"
+#include "kernel/split/kernel_scene_intersect.h"
+#include "kernel/split/kernel_lamp_emission.h"
+#include "kernel/split/kernel_do_volume.h"
+#include "kernel/split/kernel_queue_enqueue.h"
+#include "kernel/split/kernel_indirect_background.h"
+#include "kernel/split/kernel_shader_setup.h"
+#include "kernel/split/kernel_shader_sort.h"
+#include "kernel/split/kernel_shader_eval.h"
+#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#include "kernel/split/kernel_subsurface_scatter.h"
+#include "kernel/split/kernel_direct_lighting.h"
+#include "kernel/split/kernel_shadow_blocked_ao.h"
+#include "kernel/split/kernel_shadow_blocked_dl.h"
+#include "kernel/split/kernel_enqueue_inactive.h"
+#include "kernel/split/kernel_next_iteration_setup.h"
+#include "kernel/split/kernel_indirect_subsurface.h"
+#include "kernel/split/kernel_buffer_update.h"
+
+#include "kernel/kernel_film.h"
+
+/* kernels */
+/* Write the value returned by split_data_buffer_size() for num_threads
+ * threads to *size. */
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_state_buffer_size(uint num_threads, uint64_t *size)
+{
+	const uint64_t required_size = split_data_buffer_size(NULL, num_threads);
+
+	*size = required_size;
+}
+
+/* CUDA entry point for split kernel data initialization.
+ *
+ * Forwards every kernel argument unchanged to kernel_data_init(); the two
+ * leading arguments of that function are passed as NULL here. */
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_path_trace_data_init(
+        ccl_global void *split_data_buffer,
+        int num_elements,
+        ccl_global char *ray_state,
+        int start_sample,
+        int end_sample,
+        int sx, int sy, int sw, int sh, int offset, int stride,
+        ccl_global int *Queue_index,
+        int queuesize,
+        ccl_global char *use_queues_flag,
+        ccl_global unsigned int *work_pool_wgs,
+        unsigned int num_samples,
+        ccl_global float *buffer)
+{
+	kernel_data_init(
+	        NULL, NULL,
+	        split_data_buffer, num_elements, ray_state,
+	        start_sample, end_sample,
+	        sx, sy, sw, sh, offset, stride,
+	        Queue_index, queuesize,
+	        use_queues_flag, work_pool_wgs,
+	        num_samples,
+	        buffer);
+}
+
+/* Define the argument-less CUDA entry point kernel_cuda_<name>() for one
+ * split kernel step. The generated function uses the split kernel register
+ * budget for its launch bounds and calls kernel_<name>(NULL). */
+#define DEFINE_SPLIT_KERNEL_FUNCTION(name)                                            \
+	extern "C" __global__ void                                                        \
+	CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS)     \
+	kernel_cuda_##name()                                                              \
+	{                                                                                 \
+		kernel_##name(NULL);                                                          \
+	}
+
+/* Variant of DEFINE_SPLIT_KERNEL_FUNCTION for steps that take local storage:
+ * the generated kernel_cuda_<name>() declares a ccl_local variable of the
+ * given type and passes its address as the second argument of
+ * kernel_<name>(). */
+#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type)                               \
+	extern "C" __global__ void                                                        \
+	CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS)     \
+	kernel_cuda_##name()                                                              \
+	{                                                                                 \
+		ccl_local type locals;                                                        \
+		kernel_##name(NULL, &locals);                                                 \
+	}
+
+/* Instantiate one kernel_cuda_<name>() entry point per split kernel step.
+ * The _LOCALS variant is used for steps whose kernel_<name>() takes a
+ * pointer to local storage of the listed type as second argument. */
+DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
+DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
+DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
+DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
+
+/* Convert one pixel of the render buffer with kernel_film_convert_to_byte().
+ * Each thread handles the pixel at (sx, sy) plus its global 2D thread index;
+ * threads outside the sw x sh region starting at (sx, sy) do nothing. */
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	const int pixel_x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+	const int pixel_y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+
+	/* The launch grid may overhang the region; skip those threads. */
+	if(pixel_x >= sx + sw || pixel_y >= sy + sh) {
+		return;
+	}
+
+	kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, pixel_x, pixel_y, offset, stride);
+}
+
+/* Convert one pixel of the render buffer with
+ * kernel_film_convert_to_half_float(). Each thread handles the pixel at
+ * (sx, sy) plus its global 2D thread index; threads outside the sw x sh
+ * region starting at (sx, sy) do nothing. */
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	const int pixel_x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+	const int pixel_y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+
+	/* The launch grid may overhang the region; skip those threads. */
+	if(pixel_x >= sx + sw || pixel_y >= sy + sh) {
+		return;
+	}
+
+	kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, pixel_x, pixel_y, offset, stride);
+}
+
+#endif
+
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
new file mode 100644
index 00000000000..2b77807c38b
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -0,0 +1,302 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* OpenCL kernel entry points */
+
+#include "kernel/kernel_compat_opencl.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+/* kernels */
+
+/* OpenCL entry point for kernel_filter_divide_shadow().
+ * Each work-item handles the pixel at prefilter_rect.xy plus its global id;
+ * work-items at or beyond prefilter_rect.z / prefilter_rect.w do nothing. */
+__kernel void kernel_ocl_filter_divide_shadow(int sample,
+                                              ccl_global TilesInfo *tiles,
+                                              ccl_global float *unfilteredA,
+                                              ccl_global float *unfilteredB,
+                                              ccl_global float *sampleVariance,
+                                              ccl_global float *sampleVarianceV,
+                                              ccl_global float *bufferVariance,
+                                              int4 prefilter_rect,
+                                              int buffer_pass_stride,
+                                              int buffer_denoising_offset)
+{
+	const int pixel_x = prefilter_rect.x + get_global_id(0);
+	const int pixel_y = prefilter_rect.y + get_global_id(1);
+
+	if(pixel_x >= prefilter_rect.z || pixel_y >= prefilter_rect.w) {
+		return;
+	}
+
+	kernel_filter_divide_shadow(sample,
+	                            tiles,
+	                            pixel_x, pixel_y,
+	                            unfilteredA, unfilteredB,
+	                            sampleVariance, sampleVarianceV,
+	                            bufferVariance,
+	                            prefilter_rect,
+	                            buffer_pass_stride,
+	                            buffer_denoising_offset);
+}
+
+/* OpenCL entry point for kernel_filter_get_feature().
+ * Each work-item handles the pixel at prefilter_rect.xy plus its global id;
+ * work-items at or beyond prefilter_rect.z / prefilter_rect.w do nothing. */
+__kernel void kernel_ocl_filter_get_feature(int sample,
+                                            ccl_global TilesInfo *tiles,
+                                            int m_offset,
+                                            int v_offset,
+                                            ccl_global float *mean,
+                                            ccl_global float *variance,
+                                            int4 prefilter_rect,
+                                            int buffer_pass_stride,
+                                            int buffer_denoising_offset)
+{
+	const int pixel_x = prefilter_rect.x + get_global_id(0);
+	const int pixel_y = prefilter_rect.y + get_global_id(1);
+
+	if(pixel_x >= prefilter_rect.z || pixel_y >= prefilter_rect.w) {
+		return;
+	}
+
+	kernel_filter_get_feature(sample,
+	                          tiles,
+	                          m_offset, v_offset,
+	                          pixel_x, pixel_y,
+	                          mean, variance,
+	                          prefilter_rect,
+	                          buffer_pass_stride,
+	                          buffer_denoising_offset);
+}
+
+/* OpenCL entry point for kernel_filter_detect_outliers().
+ * Each work-item handles the pixel at prefilter_rect.xy plus its global id;
+ * work-items at or beyond prefilter_rect.z / prefilter_rect.w do nothing. */
+__kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image,
+                                                ccl_global float *variance,
+                                                ccl_global float *depth,
+                                                ccl_global float *output,
+                                                int4 prefilter_rect,
+                                                int pass_stride)
+{
+	const int pixel_x = prefilter_rect.x + get_global_id(0);
+	const int pixel_y = prefilter_rect.y + get_global_id(1);
+
+	if(pixel_x >= prefilter_rect.z || pixel_y >= prefilter_rect.w) {
+		return;
+	}
+
+	kernel_filter_detect_outliers(pixel_x, pixel_y,
+	                              image, variance, depth, output,
+	                              prefilter_rect, pass_stride);
+}
+
+/* OpenCL entry point for kernel_filter_combine_halves().
+ * Each work-item handles the pixel at prefilter_rect.xy plus its global id;
+ * work-items at or beyond prefilter_rect.z / prefilter_rect.w do nothing. */
+__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
+                                               ccl_global float *variance,
+                                               ccl_global float *a,
+                                               ccl_global float *b,
+                                               int4 prefilter_rect,
+                                               int r)
+{
+	const int pixel_x = prefilter_rect.x + get_global_id(0);
+	const int pixel_y = prefilter_rect.y + get_global_id(1);
+
+	if(pixel_x >= prefilter_rect.z || pixel_y >= prefilter_rect.w) {
+		return;
+	}
+
+	kernel_filter_combine_halves(pixel_x, pixel_y,
+	                             mean, variance, a, b,
+	                             prefilter_rect, r);
+}
+
+/* OpenCL entry point for kernel_filter_construct_transform().
+ * Work-items are indexed relative to the filter area: filter_area.zw bound
+ * the global id, and filter_area.xy is added to it to form the coordinate
+ * passed on. transform and rank are advanced to the work-item's own element
+ * (row-major, row length filter_area.z); the area size filter_area.z *
+ * filter_area.w and the flattened local id are passed as trailing arguments. */
+__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
+                                                    ccl_global float *transform,
+                                                    ccl_global int *rank,
+                                                    int4 filter_area,
+                                                    int4 rect,
+                                                    int pass_stride,
+                                                    int radius,
+                                                    float pca_threshold)
+{
+	const int area_x = get_global_id(0);
+	const int area_y = get_global_id(1);
+
+	if(area_x >= filter_area.z || area_y >= filter_area.w) {
+		return;
+	}
+
+	/* Per-pixel storage slots of this work-item. */
+	ccl_global int *pixel_rank = rank + area_y*filter_area.z + area_x;
+	ccl_global float *pixel_transform = transform + area_y*filter_area.z + area_x;
+
+	kernel_filter_construct_transform(buffer,
+	                                  area_x + filter_area.x, area_y + filter_area.y,
+	                                  rect, pass_stride,
+	                                  pixel_transform, pixel_rank,
+	                                  radius, pca_threshold,
+	                                  filter_area.z*filter_area.w,
+	                                  get_local_id(1)*get_local_size(0) + get_local_id(0));
+}
+
+/* OpenCL entry point for kernel_filter_nlm_calc_difference().
+ * get_nlm_coords() fills in the rectangle, the four coordinate components
+ * and the buffer offset for this work-item; nothing is done when it returns
+ * false. The offset is applied to difference_image only. */
+__kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_restrict weight_image,
+                                                    const ccl_global float *ccl_restrict variance_image,
+                                                    ccl_global float *difference_image,
+                                                    int w,
+                                                    int h,
+                                                    int stride,
+                                                    int shift_stride,
+                                                    int r,
+                                                    int channel_offset,
+                                                    float a,
+                                                    float k_2)
+{
+	int4 coords, area;
+	int shift_ofs;
+
+	if(!get_nlm_coords(w, h, r, shift_stride, &area, &coords, &shift_ofs)) {
+		return;
+	}
+
+	kernel_filter_nlm_calc_difference(coords.x, coords.y, coords.z, coords.w,
+	                                  weight_image,
+	                                  variance_image,
+	                                  difference_image + shift_ofs,
+	                                  area, stride,
+	                                  channel_offset, a, k_2);
+}
+
+/* OpenCL entry point for kernel_filter_nlm_blur().
+ * get_nlm_coords() fills in the rectangle, coordinates and buffer offset for
+ * this work-item; nothing is done when it returns false. The offset is
+ * applied to both difference_image and out_image. */
+__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image,
+                                         ccl_global float *out_image,
+                                         int w,
+                                         int h,
+                                         int stride,
+                                         int shift_stride,
+                                         int r,
+                                         int f)
+{
+	int4 coords, area;
+	int shift_ofs;
+
+	if(!get_nlm_coords(w, h, r, shift_stride, &area, &coords, &shift_ofs)) {
+		return;
+	}
+
+	kernel_filter_nlm_blur(coords.x, coords.y,
+	                       difference_image + shift_ofs,
+	                       out_image + shift_ofs,
+	                       area, stride, f);
+}
+
+/* OpenCL entry point for kernel_filter_nlm_calc_weight().
+ * get_nlm_coords() fills in the rectangle, coordinates and buffer offset for
+ * this work-item; nothing is done when it returns false. The offset is
+ * applied to both difference_image and out_image. */
+__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image,
+                                                ccl_global float *out_image,
+                                                int w,
+                                                int h,
+                                                int stride,
+                                                int shift_stride,
+                                                int r,
+                                                int f)
+{
+	int4 coords, area;
+	int shift_ofs;
+
+	if(!get_nlm_coords(w, h, r, shift_stride, &area, &coords, &shift_ofs)) {
+		return;
+	}
+
+	kernel_filter_nlm_calc_weight(coords.x, coords.y,
+	                              difference_image + shift_ofs,
+	                              out_image + shift_ofs,
+	                              area, stride, f);
+}
+
+/* OpenCL entry point for kernel_filter_nlm_update_output().
+ * get_nlm_coords() fills in the rectangle, the four coordinate components
+ * and the buffer offset for this work-item; nothing is done when it returns
+ * false. Only difference_image is offset; image, out_image and accum_image
+ * are passed through as given. */
+__kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_restrict difference_image,
+                                                  const ccl_global float *ccl_restrict image,
+                                                  ccl_global float *out_image,
+                                                  ccl_global float *accum_image,
+                                                  int w,
+                                                  int h,
+                                                  int stride,
+                                                  int shift_stride,
+                                                  int r,
+                                                  int f)
+{
+	int4 coords, area;
+	int shift_ofs;
+
+	if(!get_nlm_coords(w, h, r, shift_stride, &area, &coords, &shift_ofs)) {
+		return;
+	}
+
+	kernel_filter_nlm_update_output(coords.x, coords.y, coords.z, coords.w,
+	                                difference_image + shift_ofs,
+	                                image,
+	                                out_image,
+	                                accum_image,
+	                                area, stride, f);
+}
+
+/* OpenCL entry point for kernel_filter_nlm_normalize().
+ * Each work-item handles the pixel given by its global id; work-items at or
+ * beyond w / h do nothing. */
+__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image,
+                                              const ccl_global float *ccl_restrict accum_image,
+                                              int w,
+                                              int h,
+                                              int stride)
+{
+	const int pixel_x = get_global_id(0);
+	const int pixel_y = get_global_id(1);
+
+	if(pixel_x >= w || pixel_y >= h) {
+		return;
+	}
+
+	kernel_filter_nlm_normalize(pixel_x, pixel_y, out_image, accum_image, stride);
+}
+
+/* OpenCL entry point for kernel_filter_nlm_construct_gramian().
+ * get_nlm_coords_window() fills in the rectangle, the four coordinate
+ * components and the buffer offset for this work-item, taking filter_window
+ * into account; nothing is done when it returns false. Only
+ * difference_image is offset. The flattened local id is passed as the last
+ * argument. */
+__kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *ccl_restrict difference_image,
+                                                      const ccl_global float *ccl_restrict buffer,
+                                                      const ccl_global float *ccl_restrict transform,
+                                                      ccl_global int *rank,
+                                                      ccl_global float *XtWX,
+                                                      ccl_global float3 *XtWY,
+                                                      int4 filter_window,
+                                                      int w,
+                                                      int h,
+                                                      int stride,
+                                                      int shift_stride,
+                                                      int r,
+                                                      int f,
+                                                      int pass_stride)
+{
+	int4 coords, area;
+	int shift_ofs;
+
+	if(!get_nlm_coords_window(w, h, r, shift_stride, &area, &coords, &shift_ofs, filter_window)) {
+		return;
+	}
+
+	kernel_filter_nlm_construct_gramian(coords.x, coords.y,
+	                                    coords.z, coords.w,
+	                                    difference_image + shift_ofs,
+	                                    buffer,
+	                                    transform, rank,
+	                                    XtWX, XtWY,
+	                                    area, filter_window,
+	                                    stride, f,
+	                                    pass_stride,
+	                                    get_local_id(1)*get_local_size(0) + get_local_id(0));
+}
+
+/* OpenCL entry point for kernel_filter_finalize().
+ * Each work-item handles the position given by its global id, bounded by
+ * filter_area.zw. rank, XtWX and XtWY are advanced to the work-item's own
+ * element (row-major, row length filter_area.z) before being passed on,
+ * together with the area size filter_area.z * filter_area.w. */
+__kernel void kernel_ocl_filter_finalize(ccl_global float *buffer,
+                                         ccl_global int *rank,
+                                         ccl_global float *XtWX,
+                                         ccl_global float3 *XtWY,
+                                         int4 filter_area,
+                                         int4 buffer_params,
+                                         int sample)
+{
+	const int area_x = get_global_id(0);
+	const int area_y = get_global_id(1);
+
+	if(area_x >= filter_area.z || area_y >= filter_area.w) {
+		return;
+	}
+
+	const int storage_ofs = area_y*filter_area.z+area_x;
+
+	kernel_filter_finalize(area_x, area_y, buffer,
+	                       rank + storage_ofs,
+	                       filter_area.z*filter_area.w,
+	                       XtWX + storage_ofs,
+	                       XtWY + storage_ofs,
+	                       buffer_params, sample);
+}
+
+/* Store the nine buffer pointers in tiles->buffers[0..8].
+ * Only the work-item with global id (0, 0) performs the writes; every other
+ * work-item returns immediately. */
+__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles,
+                                          ccl_global float *buffer_1,
+                                          ccl_global float *buffer_2,
+                                          ccl_global float *buffer_3,
+                                          ccl_global float *buffer_4,
+                                          ccl_global float *buffer_5,
+                                          ccl_global float *buffer_6,
+                                          ccl_global float *buffer_7,
+                                          ccl_global float *buffer_8,
+                                          ccl_global float *buffer_9)
+{
+	if((get_global_id(0) != 0) || (get_global_id(1) != 0)) {
+		return;
+	}
+
+	/* buffer_N goes to slot N-1. */
+	tiles->buffers[0] = buffer_1;
+	tiles->buffers[1] = buffer_2;
+	tiles->buffers[2] = buffer_3;
+	tiles->buffers[3] = buffer_4;
+	tiles->buffers[4] = buffer_5;
+	tiles->buffers[5] = buffer_6;
+	tiles->buffers[6] = buffer_7;
+	tiles->buffers[7] = buffer_8;
+	tiles->buffers[8] = buffer_9;
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index a68f97857b6..9d5d784e140 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -16,45 +16,42 @@
 
 /* OpenCL kernel entry points - unfinished */
 
-#include "../../kernel_compat_opencl.h"
-#include "../../kernel_math.h"
-#include "../../kernel_types.h"
-#include "../../kernel_globals.h"
-#include "../../kernel_image_opencl.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/kernel_math.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernels/opencl/kernel_opencl_image.h"
 
-#include "../../kernel_film.h"
+#include "kernel/kernel_film.h"
 
 #if defined(__COMPILE_ONLY_MEGAKERNEL__) || !defined(__NO_BAKING__)
-#  include "../../kernel_path.h"
-#  include "../../kernel_path_branched.h"
+#  include "kernel/kernel_path.h"
+#  include "kernel/kernel_path_branched.h"
 #else  /* __COMPILE_ONLY_MEGAKERNEL__ */
 /* Include only actually used headers for the case
  * when path tracing kernels are not needed.
  */
-#  include "../../kernel_random.h"
-#  include "../../kernel_differential.h"
-#  include "../../kernel_montecarlo.h"
-#  include "../../kernel_projection.h"
-#  include "../../geom/geom.h"
-#  include "../../bvh/bvh.h"
-
-#  include "../../kernel_accumulate.h"
-#  include "../../kernel_camera.h"
-#  include "../../kernel_shader.h"
+#  include "kernel/kernel_random.h"
+#  include "kernel/kernel_differential.h"
+#  include "kernel/kernel_montecarlo.h"
+#  include "kernel/kernel_projection.h"
+#  include "kernel/geom/geom.h"
+#  include "kernel/bvh/bvh.h"
+
+#  include "kernel/kernel_accumulate.h"
+#  include "kernel/kernel_camera.h"
+#  include "kernel/kernel_shader.h"
 #endif  /* defined(__COMPILE_ONLY_MEGAKERNEL__) || !defined(__NO_BAKING__) */
 
-#include "../../kernel_bake.h"
+#include "kernel/kernel_bake.h"
 
 #ifdef __COMPILE_ONLY_MEGAKERNEL__
 
 __kernel void kernel_ocl_path_trace(
 	ccl_constant KernelData *data,
 	ccl_global float *buffer,
-	ccl_global uint *rng_state,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "../../kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	int sample,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -63,28 +60,24 @@ __kernel void kernel_ocl_path_trace(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "../../kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
-	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
+	int x = sx + ccl_global_id(0);
+	int y = sy + ccl_global_id(1);
 
 	if(x < sx + sw && y < sy + sh)
-		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+		kernel_path_trace(kg, buffer, sample, x, y, offset, stride);
 }
 
 #else  /* __COMPILE_ONLY_MEGAKERNEL__ */
 
-__kernel void kernel_ocl_shader(
+__kernel void kernel_ocl_displace(
 	ccl_constant KernelData *data,
 	ccl_global uint4 *input,
 	ccl_global float4 *output,
-	ccl_global float *output_luma,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "../../kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	int type, int sx, int sw, int offset, int sample)
 {
@@ -92,20 +85,35 @@ __kernel void kernel_ocl_shader(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "../../kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
-	int x = sx + get_global_id(0);
+	int x = sx + ccl_global_id(0);
 
 	if(x < sx + sw) {
-		kernel_shader_evaluate(kg,
-		                       input,
-		                       output,
-		                       output_luma,
-		                       (ShaderEvalType)type,
-		                       x,
-		                       sample);
+		kernel_displace_evaluate(kg, input, output, x);
+	}
+}
+__kernel void kernel_ocl_background(
+	ccl_constant KernelData *data,
+	ccl_global uint4 *input,
+	ccl_global float4 *output,
+
+	KERNEL_BUFFER_PARAMS,
+
+	int type, int sx, int sw, int offset, int sample)
+{
+	KernelGlobals kglobals, *kg = &kglobals;
+
+	kg->data = data;
+
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
+
+	int x = sx + ccl_global_id(0);
+
+	if(x < sx + sw) {
+		kernel_background_evaluate(kg, input, output, x);
 	}
 }
 
@@ -114,9 +122,7 @@ __kernel void kernel_ocl_bake(
 	ccl_global uint4 *input,
 	ccl_global float4 *output,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "../../kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	int type, int filter, int sx, int sw, int offset, int sample)
 {
@@ -124,11 +130,10 @@ __kernel void kernel_ocl_bake(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "../../kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
-	int x = sx + get_global_id(0);
+	int x = sx + ccl_global_id(0);
 
 	if(x < sx + sw) {
 #ifdef __NO_BAKING__
@@ -144,9 +149,7 @@ __kernel void kernel_ocl_convert_to_byte(
 	ccl_global uchar4 *rgba,
 	ccl_global float *buffer,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "../../kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	float sample_scale,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -155,12 +158,11 @@ __kernel void kernel_ocl_convert_to_byte(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "../../kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
-	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
+	int x = sx + ccl_global_id(0);
+	int y = sy + ccl_global_id(1);
 
 	if(x < sx + sw && y < sy + sh)
 		kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
@@ -171,9 +173,7 @@ __kernel void kernel_ocl_convert_to_half_float(
 	ccl_global uchar4 *rgba,
 	ccl_global float *buffer,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "../../kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	float sample_scale,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -182,15 +182,30 @@ __kernel void kernel_ocl_convert_to_half_float(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "../../kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
-	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
+	int x = sx + ccl_global_id(0);
+	int y = sy + ccl_global_id(1);
 
 	if(x < sx + sw && y < sy + sh)
 		kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
+__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, uint64_t size, uint64_t offset)
+{
+	/* Zero `size` bytes starting `offset` bytes into `buffer`; offset is divided by sizeof(float4), so it is rounded down to a whole float4. */
+	const size_t index = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
+	const uint64_t num_vectors = size / sizeof(float4);
+	if(index < num_vectors) {  /* one whole float4 per work item */
+		buffer[index + offset / sizeof(float4)] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+	}
+	else if(index == num_vectors) {  /* exactly one work item clears the size % sizeof(float4) leftover bytes */
+		ccl_global uchar *tail = (ccl_global uchar*)&buffer[index + offset / sizeof(float4)];
+		for(size_t k = 0; k < size % sizeof(float4); k++) {
+			tail[k] = 0;
+		}
+	}
+}
+
 #endif  /* __COMPILE_ONLY_MEGAKERNEL__ */
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
deleted file mode 100644
index 1914d241eb1..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "split/kernel_background_buffer_update.h"
-
-__kernel void kernel_ocl_path_trace_background_buffer_update(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_state,
-        ccl_global uint *rng_coop,             /* Required for buffer Update */
-        ccl_global float3 *throughput_coop,    /* Required for background hit processing */
-        PathRadiance *PathRadiance_coop,       /* Required for background hit processing and buffer Update */
-        ccl_global Ray *Ray_coop,              /* Required for background hit processing */
-        ccl_global PathState *PathState_coop,  /* Required for background hit processing */
-        ccl_global float *L_transparent_coop,  /* Required for background hit processing and buffer Update */
-        ccl_global char *ray_state,            /* Stores information on the current state of a ray */
-        int sw, int sh, int sx, int sy, int stride,
-        int rng_state_offset_x,
-        int rng_state_offset_y,
-        int rng_state_stride,
-        ccl_global unsigned int *work_array,   /* Denotes work of each ray */
-        ccl_global int *Queue_data,            /* Queues memory */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
-        int queuesize,                         /* Size (capacity) of each queue */
-        int end_sample,
-        int start_sample,
-#ifdef __WORK_STEALING__
-        ccl_global unsigned int *work_pool_wgs,
-        unsigned int num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples)                  /* Number of samples to be processed in parallel */
-{
-	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	if(ray_index == 0) {
-		/* We will empty this queue in this kernel. */
-		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
-	}
-	char enqueue_flag = 0;
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          1);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not
-	 * required.
-	 *
-	 * If we are executing on a CPU device, then we need to keep all threads
-	 * active since we have barrier() calls later in the kernel. CPU devices,
-	 * expect all threads to execute barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-		enqueue_flag =
-			kernel_background_buffer_update((KernelGlobals *)kg,
-			                                per_sample_output_buffers,
-			                                rng_state,
-			                                rng_coop,
-			                                throughput_coop,
-			                                PathRadiance_coop,
-			                                Ray_coop,
-			                                PathState_coop,
-			                                L_transparent_coop,
-			                                ray_state,
-			                                sw, sh, sx, sy, stride,
-			                                rng_state_offset_x,
-			                                rng_state_offset_y,
-			                                rng_state_stride,
-			                                work_array,
-			                                end_sample,
-			                                start_sample,
-#ifdef __WORK_STEALING__
-			                                work_pool_wgs,
-			                                num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
-			                                debugdata_coop,
-#endif
-			                                parallel_samples,
-			                                ray_index);
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif
-
-	/* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
-	 * These rays will be made active during next SceneIntersectkernel.
-	 */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics,
-	                        Queue_data,
-	                        Queue_index);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
new file mode 100644
index 00000000000..dcea2630aef
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_buffer_update.h"
+
+#define KERNEL_NAME buffer_update  /* NOTE(review): presumably names the split kernel that the include below instantiates; confirm in kernel_split_function.h */
+#define LOCALS_TYPE unsigned int  /* NOTE(review): presumably the type of the kernel's work-group-local data; confirm in kernel_split_function.h */
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
index 18139687eab..7125348a49f 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
@@ -14,77 +14,40 @@
  * limitations under the License.
  */
 
-#include "split/kernel_data_init.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_data_init.h"
 
 __kernel void kernel_ocl_path_trace_data_init(
-        ccl_global char *globals,
-        ccl_global char *sd_DL_shadow,
+        ccl_global char *kg,                         /* reinterpreted as KernelGlobals* in the call below */
         ccl_constant KernelData *data,
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_state,
-        ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
-        ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
-        ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
-        PathRadiance *PathRadiance_coop,             /* PathRadiance array to store PathRadiance values for all rays */
-        ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
-        ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
-        Intersection *Intersection_coop_shadow,
-        ccl_global char *ray_state,                  /* Stores information on current state of a ray */
-
-#define KERNEL_TEX(type, ttype, name)                                   \
-        ccl_global type *name,
-#include "../../kernel_textures.h"
-
-        int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
-        int rng_state_offset_x,
-        int rng_state_offset_y,
-        int rng_state_stride,
-        ccl_global int *Queue_data,                  /* Memory for queues */
+        ccl_global void *split_data_buffer,          /* NOTE(review): presumably replaces the removed per-ray *_coop arrays; confirm in split/kernel_data_init.h */
+        int num_elements,
+        ccl_global char *ray_state,                  /* Stores information on current state of a ray */
+        KERNEL_BUFFER_PARAMS,                        /* forwarded below as KERNEL_BUFFER_ARGS */
+        int start_sample,
+        int end_sample,
+        int sx, int sy, int sw, int sh, int offset, int stride,
         ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
         int queuesize,                               /* size (capacity) of the queue */
         ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
-        ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
-#ifdef __WORK_STEALING__
         ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
         unsigned int num_samples,                    /* Total number of samples per pixel */
-#endif
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples)                        /* Number of samples to be processed in parallel */
+        ccl_global float *buffer)
 {
-	kernel_data_init((KernelGlobals *)globals,
-	                 (ShaderData *)sd_DL_shadow,
+	kernel_data_init((KernelGlobals*)kg,
 	                 data,
-	                 per_sample_output_buffers,
-	                 rng_state,
-	                 rng_coop,
-	                 throughput_coop,
-	                 L_transparent_coop,
-	                 PathRadiance_coop,
-	                 Ray_coop,
-	                 PathState_coop,
-	                 Intersection_coop_shadow,
+	                 split_data_buffer,
+	                 num_elements,
 	                 ray_state,
-
-#define KERNEL_TEX(type, ttype, name) name,
-#include "../../kernel_textures.h"
-
-	                 start_sample, sx, sy, sw, sh, offset, stride,
-	                 rng_state_offset_x,
-	                 rng_state_offset_y,
-	                 rng_state_stride,
-	                 Queue_data,
+	                 KERNEL_BUFFER_ARGS,
+	                 start_sample,
+	                 end_sample,
+	                 sx, sy, sw, sh, offset, stride,
 	                 Queue_index,
 	                 queuesize,
 	                 use_queues_flag,
-	                 work_array,
-#ifdef __WORK_STEALING__
 	                 work_pool_wgs,
 	                 num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
-	                 debugdata_coop,
-#endif
-	                 parallel_samples);
+	                 buffer);
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
index c6a2c8d050c..ed64ae01aae 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
@@ -14,74 +14,13 @@
  * limitations under the License.
  */
 
-#include "split/kernel_direct_lighting.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_direct_lighting.h"
 
-__kernel void kernel_ocl_path_trace_direct_lighting(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global char *sd,                    /* Required for direct lighting */
-        ccl_global uint *rng_coop,              /* Required for direct lighting */
-        ccl_global PathState *PathState_coop,   /* Required for direct lighting */
-        ccl_global int *ISLamp_coop,            /* Required for direct lighting */
-        ccl_global Ray *LightRay_coop,          /* Required for direct lighting */
-        ccl_global BsdfEval *BSDFEval_coop,     /* Required for direct lighting */
-        ccl_global char *ray_state,             /* Denotes the state of each ray */
-        ccl_global int *Queue_data,             /* Queue memory */
-        ccl_global int *Queue_index,            /* Tracks the number of elements in each queue */
-        int queuesize)                          /* Size (capacity) of each queue */
-{
-	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
+#define KERNEL_NAME direct_lighting  /* NOTE(review): presumably names the split kernel that the include below instantiates; confirm in kernel_split_function.h */
+#define LOCALS_TYPE unsigned int  /* NOTE(review): presumably the type of the kernel's work-group-local data; confirm in kernel_split_function.h */
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
 
-	char enqueue_flag = 0;
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not
-	 * required.
-	 *
-	 * If we are executing on a CPU device, then we need to keep all threads
-	 * active since we have barrier() calls later in the kernel. CPU devices,
-	 * expect all threads to execute barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-		enqueue_flag = kernel_direct_lighting((KernelGlobals *)kg,
-		                                      (ShaderData *)sd,
-		                                      rng_coop,
-		                                      PathState_coop,
-		                                      ISLamp_coop,
-		                                      LightRay_coop,
-		                                      BSDFEval_coop,
-		                                      ray_state,
-		                                      ray_index);
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif
-
-#ifdef __EMISSION__
-	/* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_SHADOW_RAY_CAST_DL_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics,
-	                        Queue_data,
-	                        Queue_index);
-#endif
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
new file mode 100644
index 00000000000..8afaa686e28
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_do_volume.h"
+
+#define KERNEL_NAME do_volume  /* NOTE(review): presumably names the split kernel that the include below instantiates; confirm in kernel_split_function.h */
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
new file mode 100644
index 00000000000..e68d4104a91
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_enqueue_inactive.h"
+
+#define KERNEL_NAME enqueue_inactive  /* NOTE(review): presumably names the split kernel that the include below instantiates; confirm in kernel_split_function.h */
+#define LOCALS_TYPE unsigned int  /* NOTE(review): presumably the type of the kernel's work-group-local data; confirm in kernel_split_function.h */
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
index e063614da1a..9e1e57beba6 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
@@ -14,110 +14,13 @@
  * limitations under the License.
  */
 
-#include "split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
 
-__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global char *sd,                   /* Required throughout the kernel except probabilistic path termination and AO */
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_coop,             /* Required for "kernel_write_data_passes" and AO */
-        ccl_global float3 *throughput_coop,    /* Required for handling holdout material and AO */
-        ccl_global float *L_transparent_coop,  /* Required for handling holdout material */
-        PathRadiance *PathRadiance_coop,       /* Required for "kernel_write_data_passes" and indirect primitive emission */
-        ccl_global PathState *PathState_coop,  /* Required throughout the kernel and AO */
-        Intersection *Intersection_coop,       /* Required for indirect primitive emission */
-        ccl_global float3 *AOAlpha_coop,       /* Required for AO */
-        ccl_global float3 *AOBSDF_coop,        /* Required for AO */
-        ccl_global Ray *AOLightRay_coop,       /* Required for AO */
-        int sw, int sh, int sx, int sy, int stride,
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        ccl_global unsigned int *work_array,   /* Denotes the work that each ray belongs to */
-        ccl_global int *Queue_data,            /* Queue memory */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
-        int queuesize,                         /* Size (capacity) of each queue */
-#ifdef __WORK_STEALING__
-        unsigned int start_sample,
-#endif
-        int parallel_samples)                  /* Number of samples to be processed in parallel */
-{
-	ccl_local unsigned int local_queue_atomics_bg;
-	ccl_local unsigned int local_queue_atomics_ao;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics_bg = 0;
-		local_queue_atomics_ao = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
+#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao  /* NOTE(review): presumably names the split kernel that the include below instantiates; confirm in kernel_split_function.h */
+#define LOCALS_TYPE BackgroundAOLocals  /* NOTE(review): presumably the type of the kernel's work-group-local data; confirm in kernel_split_function.h */
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
 
-	char enqueue_flag = 0;
-	char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not
-	 * required.
-	 *
-	 * If we are executing on a CPU device, then we need to keep all threads
-	 * active since we have barrier() calls later in the kernel. CPU devices,
-	 * expect all threads to execute barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-#endif  /* __COMPUTE_DEVICE_GPU__ */
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-		kernel_holdout_emission_blurring_pathtermination_ao(
-		        (KernelGlobals *)kg,
-		        (ShaderData *)sd,
-		        per_sample_output_buffers,
-		        rng_coop,
-		        throughput_coop,
-		        L_transparent_coop,
-		        PathRadiance_coop,
-		        PathState_coop,
-		        Intersection_coop,
-		        AOAlpha_coop,
-		        AOBSDF_coop,
-		        AOLightRay_coop,
-		        sw, sh, sx, sy, stride,
-		        ray_state,
-		        work_array,
-#ifdef __WORK_STEALING__
-		        start_sample,
-#endif
-		        parallel_samples,
-		        ray_index,
-		        &enqueue_flag,
-		        &enqueue_flag_AO_SHADOW_RAY_CAST);
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif
-
-	/* Enqueue RAY_UPDATE_BUFFER rays. */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics_bg,
-	                        Queue_data,
-	                        Queue_index);
-
-#ifdef __AO__
-	/* Enqueue to-shadow-ray-cast rays. */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_SHADOW_RAY_CAST_AO_RAYS,
-	                        enqueue_flag_AO_SHADOW_RAY_CAST,
-	                        queuesize,
-	                        &local_queue_atomics_ao,
-	                        Queue_data,
-	                        Queue_index);
-#endif
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
new file mode 100644
index 00000000000..192d01444ba
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_indirect_background.h"
+
+#define KERNEL_NAME indirect_background  /* NOTE(review): presumably names the split kernel that the include below instantiates; confirm in kernel_split_function.h */
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
new file mode 100644
index 00000000000..84938b889e5
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_indirect_subsurface.h"
+
+#define KERNEL_NAME indirect_subsurface  /* NOTE(review): presumably names the split kernel that the include below instantiates; confirm in kernel_split_function.h */
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
index 267bddc2ffc..c314dc96c33 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
@@ -14,67 +14,11 @@
  * limitations under the License.
  */
 
-#include "split/kernel_lamp_emission.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_lamp_emission.h"
 
-__kernel void kernel_ocl_path_trace_lamp_emission(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global float3 *throughput_coop,    /* Required for lamp emission */
-        PathRadiance *PathRadiance_coop,       /* Required for lamp emission */
-        ccl_global Ray *Ray_coop,              /* Required for lamp emission */
-        ccl_global PathState *PathState_coop,  /* Required for lamp emission */
-        Intersection *Intersection_coop,       /* Required for lamp emission */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int sw, int sh,
-        ccl_global int *Queue_data,            /* Memory for queues */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in queues */
-        int queuesize,                         /* Size (capacity) of queues */
-        ccl_global char *use_queues_flag,      /* Used to decide if this kernel should use
-                                                * queues to fetch ray index
-                                                */
-        int parallel_samples)                  /* Number of samples to be processed in parallel */
-{
-	int x = get_global_id(0);
-	int y = get_global_id(1);
+#define KERNEL_NAME lamp_emission  /* NOTE(review): presumably names the split kernel that the include below instantiates; confirm in kernel_split_function.h */
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
 
-	/* We will empty this queue in this kernel. */
-	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
-		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
-	}
-	/* Fetch use_queues_flag. */
-	ccl_local char local_use_queues_flag;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_use_queues_flag = use_queues_flag[0];
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int ray_index;
-	if(local_use_queues_flag) {
-		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-		ray_index = get_ray_index(thread_index,
-		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-		                          Queue_data,
-		                          queuesize,
-		                          1);
-		if(ray_index == QUEUE_EMPTY_SLOT) {
-			return;
-		}
-	} else {
-		if(x < (sw * parallel_samples) && y < sh) {
-			ray_index = x + y * (sw * parallel_samples);
-		} else {
-			return;
-		}
-	}
-
-	kernel_lamp_emission((KernelGlobals *)kg,
-	                     throughput_coop,
-	                     PathRadiance_coop,
-	                     Ray_coop,
-	                     PathState_coop,
-	                     Intersection_coop,
-	                     ray_state,
-	                     sw, sh,
-	                     use_queues_flag,
-	                     ray_index);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
index 6d49b6294a8..8b1332bf013 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
@@ -14,101 +14,13 @@
  * limitations under the License.
  */
 
-#include "split/kernel_next_iteration_setup.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_next_iteration_setup.h"
 
-__kernel void kernel_ocl_path_trace_next_iteration_setup(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global char *sd,                  /* Required for setting up ray for next iteration */
-        ccl_global uint *rng_coop,            /* Required for setting up ray for next iteration */
-        ccl_global float3 *throughput_coop,   /* Required for setting up ray for next iteration */
-        PathRadiance *PathRadiance_coop,      /* Required for setting up ray for next iteration */
-        ccl_global Ray *Ray_coop,             /* Required for setting up ray for next iteration */
-        ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
-        ccl_global Ray *LightRay_dl_coop,     /* Required for radiance update - direct lighting */
-        ccl_global int *ISLamp_coop,          /* Required for radiance update - direct lighting */
-        ccl_global BsdfEval *BSDFEval_coop,   /* Required for radiance update - direct lighting */
-        ccl_global Ray *LightRay_ao_coop,     /* Required for radiance update - AO */
-        ccl_global float3 *AOBSDF_coop,       /* Required for radiance update - AO */
-        ccl_global float3 *AOAlpha_coop,      /* Required for radiance update - AO */
-        ccl_global char *ray_state,           /* Denotes the state of each ray */
-        ccl_global int *Queue_data,           /* Queue memory */
-        ccl_global int *Queue_index,          /* Tracks the number of elements in each queue */
-        int queuesize,                        /* Size (capacity) of each queue */
-        ccl_global char *use_queues_flag)     /* flag to decide if scene_intersect kernel should
-                                               * use queues to fetch ray index */
-{
-	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
+#define KERNEL_NAME next_iteration_setup
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
 
-	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
-		/* If we are here, then it means that scene-intersect kernel
-		* has already been executed atleast once. From the next time,
-		* scene-intersect kernel may operate on queues to fetch ray index
-		*/
-		use_queues_flag[0] = 1;
-
-		/* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and
-		 * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the
-		 * previous kernel.
-		 */
-		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
-		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
-	}
-
-	char enqueue_flag = 0;
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not
-	 * required.
-	 *
-	 * If we are executing on a CPU device, then we need to keep all threads
-	 * active since we have barrier() calls later in the kernel. CPU devices,
-	 * expect all threads to execute barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-		enqueue_flag = kernel_next_iteration_setup((KernelGlobals *)kg,
-		                                           (ShaderData *)sd,
-		                                           rng_coop,
-		                                           throughput_coop,
-		                                           PathRadiance_coop,
-		                                           Ray_coop,
-		                                           PathState_coop,
-		                                           LightRay_dl_coop,
-		                                           ISLamp_coop,
-		                                           BSDFEval_coop,
-		                                           LightRay_ao_coop,
-		                                           AOBSDF_coop,
-		                                           AOAlpha_coop,
-		                                           ray_state,
-		                                           use_queues_flag,
-		                                           ray_index);
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif
-
-	/* Enqueue RAY_UPDATE_BUFFER rays. */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics,
-	                        Queue_data,
-	                        Queue_index);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
new file mode 100644
index 00000000000..faa9dd66d0e
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
@@ -0,0 +1,246 @@
+/*
+ * Copyright 2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* For OpenCL we do manual lookup and interpolation. */
+
+ccl_device_inline ccl_global TextureInfo* kernel_tex_info(KernelGlobals *kg, uint id) {
+	const uint tex_offset = id /* every KERNEL_TEX() expanded from the include below appends "+ 1" to this expression */
+#define KERNEL_TEX(type, name) + 1
+#include "kernel/kernel_textures.h"
+	;
+	/* Buffer 0 is indexed as a TextureInfo array. NOTE(review): presumably one slot per KERNEL_TEX entry precedes the image infos - confirm on the host side. */
+	return &((ccl_global TextureInfo*)kg->buffers[0])[tex_offset];
+}
+/* Index, as an array of `type`, the data starting info->data past the start of buffer info->cl_buffer. Relies on `kg` being in scope at the call site. */
+#define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[(info)->cl_buffer] + (info)->data))[(index)]
+
+/* Wrap x into the range [0, width) for a positive width. */
+ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
+{
+	/* The remainder keeps the sign of x, so a negative result is shifted up by one period. */
+	const int rem = x % width;
+	return (rem < 0)? rem + width: rem;
+}
+
+ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
+{
+	return clamp(x, 0, width-1); /* pin x to the index range [0, width-1] */
+}
+
+ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, const ccl_global TextureInfo *info, int id, int offset)
+{
+	/* Fetch the texel at `offset` and widen it to float4. Byte channels are
+	 * scaled by 1/255; single-channel texels become (v, v, v, 1). */
+	const float byte_scale = 1.0f/255.0f;
+
+	switch(kernel_tex_type(id)) {
+		case IMAGE_DATA_TYPE_FLOAT4:
+			return tex_fetch(float4, info, offset);
+		case IMAGE_DATA_TYPE_BYTE4: {
+			uchar4 texel = tex_fetch(uchar4, info, offset);
+			return make_float4(texel.x*byte_scale, texel.y*byte_scale,
+			                   texel.z*byte_scale, texel.w*byte_scale);
+		}
+		case IMAGE_DATA_TYPE_FLOAT: {
+			float value = tex_fetch(float, info, offset);
+			return make_float4(value, value, value, 1.0f);
+		}
+		default: {
+			/* Any other type is read as one byte channel. */
+			uchar texel = tex_fetch(uchar, info, offset);
+			float value = texel * byte_scale;
+			return make_float4(value, value, value, 1.0f);
+		}
+	}
+}
+
+ccl_device_inline float4 svm_image_texture_read_2d(KernelGlobals *kg, int id, int x, int y)
+{
+	/* Read the texel at integer coordinates (x, y). EXTENSION_REPEAT tiles the
+	 * image; every other extension mode clamps to the edge texels. */
+
+	const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
+	const int tiled = (info->extension == EXTENSION_REPEAT);
+
+	const int col = tiled? svm_image_texture_wrap_periodic(x, info->width)
+	                     : svm_image_texture_wrap_clamp(x, info->width);
+	const int row = tiled? svm_image_texture_wrap_periodic(y, info->height)
+	                     : svm_image_texture_wrap_clamp(y, info->height);
+
+	/* Row-major addressing: each row holds info->width texels. */
+	int offset = col + info->width * row;
+	return svm_image_texture_read(kg, info, id, offset);
+}
+
+ccl_device_inline float4 svm_image_texture_read_3d(KernelGlobals *kg, int id, int x, int y, int z)
+{
+	/* Read the texel at integer coordinates (x, y, z). EXTENSION_REPEAT tiles
+	 * the volume; every other extension mode clamps to the edge texels. */
+
+	const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
+	const int tiled = (info->extension == EXTENSION_REPEAT);
+
+	const int col = tiled? svm_image_texture_wrap_periodic(x, info->width)
+	                     : svm_image_texture_wrap_clamp(x, info->width);
+	const int row = tiled? svm_image_texture_wrap_periodic(y, info->height)
+	                     : svm_image_texture_wrap_clamp(y, info->height);
+	const int slice = tiled? svm_image_texture_wrap_periodic(z, info->depth)
+	                       : svm_image_texture_wrap_clamp(z, info->depth);
+
+	/* Slices of width*height texels are stored one after another, rows within a slice likewise. */
+	int offset = col + info->width * row + info->width * info->height * slice;
+	return svm_image_texture_read(kg, info, id, offset);
+}
+
+/* Split x: *ix receives float_to_int(x), lowered by one when x is negative; returns x minus that integer. */
+ccl_device_inline float svm_image_texture_frac(float x, int *ix)
+{
+	const int whole = (x < 0.0f)? float_to_int(x) - 1: float_to_int(x);
+	*ix = whole;
+	return x - (float)whole;
+}
+/* Store the four uniform cubic B-spline weights for parameter t in u[0..3]. The trailing (void)0 makes an invocation require a terminating semicolon. */
+#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
+	{ \
+		u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
+		u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
+		u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
+		u[3] = (1.0f / 6.0f) * t * t * t; \
+	} (void)0
+/* Sample 2D image `id` at (x, y); the coordinates are scaled by the image width and height, and the interpolation mode is read from the image's TextureInfo. */
+ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
+{
+	const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
+	/* With EXTENSION_CLIP, any coordinate outside [0, 1] yields an all-zero result. */
+	if(info->extension == EXTENSION_CLIP) {
+		if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		}
+	}
+
+	if(info->interpolation == INTERPOLATION_CLOSEST) {
+		/* Closest interpolation: only the integer texel indices are used, the fractions are discarded. */
+		int ix, iy;
+		svm_image_texture_frac(x*info->width, &ix);
+		svm_image_texture_frac(y*info->height, &iy);
+
+		return svm_image_texture_read_2d(kg, id, ix, iy);
+	}
+	else if(info->interpolation == INTERPOLATION_LINEAR) {
+		/* Bilinear interpolation: blend the 2x2 texels around the sample; the 0.5 shift puts texel centers on integer coordinates. */
+		int ix, iy;
+		float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix);
+		float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy);
+
+		float4 r;
+		r =  (1.0f - ty)*(1.0f - tx)*svm_image_texture_read_2d(kg, id, ix, iy);
+		r += (1.0f - ty)*tx*svm_image_texture_read_2d(kg, id, ix+1, iy);
+		r += ty*(1.0f - tx)*svm_image_texture_read_2d(kg, id, ix, iy+1);
+		r += ty*tx*svm_image_texture_read_2d(kg, id, ix+1, iy+1);
+		return r;
+	}
+	else {
+		/* Bicubic interpolation (used for every remaining mode): weighted sum over the 4x4 texels from (ix-1, iy-1) to (ix+2, iy+2). */
+		int ix, iy;
+		float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix);
+		float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy);
+
+		float u[4], v[4];
+		SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+		SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+
+		float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		/* The loop counters x and y below shadow the float parameters of the same name. */
+		for(int y = 0; y < 4; y++) {
+			for(int x = 0; x < 4; x++) {
+				float weight = u[x]*v[y];
+				r += weight*svm_image_texture_read_2d(kg, id, ix+x-1, iy+y-1);
+			}
+		}
+		return r;
+	}
+}
+
+/* Sample 3D image `id` at (x, y, z); `interp` selects the interpolation mode, and INTERPOLATION_NONE falls back to the mode stored in the image's TextureInfo. */
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, int interp)
+{
+	const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
+	/* With EXTENSION_CLIP, any coordinate outside [0, 1] yields an all-zero result. */
+	if(info->extension == EXTENSION_CLIP) {
+		if(x < 0.0f || y < 0.0f || z < 0.0f ||
+		   x > 1.0f || y > 1.0f || z > 1.0f)
+		{
+			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		}
+	}
+
+	uint interpolation = (interp == INTERPOLATION_NONE)? info->interpolation: interp;
+
+	if(interpolation == INTERPOLATION_CLOSEST) {
+		/* Closest interpolation: only the integer texel indices are used, the fractions are discarded. */
+		int ix, iy, iz;
+		svm_image_texture_frac(x*info->width, &ix);
+		svm_image_texture_frac(y*info->height, &iy);
+		svm_image_texture_frac(z*info->depth, &iz);
+
+		return svm_image_texture_read_3d(kg, id, ix, iy, iz);
+	}
+	else if(interpolation == INTERPOLATION_LINEAR) {
+		/* Trilinear interpolation: blend the 2x2x2 texels around the sample. */
+		int ix, iy, iz;
+		float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix);
+		float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy);
+		float tz = svm_image_texture_frac(z*info->depth - 0.5f, &iz);
+
+		float4 r;
+		r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy, iz);
+		r += (1.0f - tz)*(1.0f - ty)*tx*svm_image_texture_read_3d(kg, id, ix+1, iy, iz);
+		r += (1.0f - tz)*ty*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy+1, iz);
+		r += (1.0f - tz)*ty*tx*svm_image_texture_read_3d(kg, id, ix+1, iy+1, iz);
+
+		r += tz*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy, iz+1);
+		r += tz*(1.0f - ty)*tx*svm_image_texture_read_3d(kg, id, ix+1, iy, iz+1);
+		r += tz*ty*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy+1, iz+1);
+		r += tz*ty*tx*svm_image_texture_read_3d(kg, id, ix+1, iy+1, iz+1);
+		return r;
+	}
+	else {
+		/* Tricubic interpolation (used for every remaining mode): weighted sum over the surrounding 4x4x4 texels. */
+		int ix, iy, iz;
+		float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix);
+		float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy);
+		float tz = svm_image_texture_frac(z*info->depth - 0.5f, &iz);
+
+		float u[4], v[4], w[4];
+		SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+		SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+		SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+		float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		/* The loop counters x, y and z below shadow the float parameters of the same name. */
+		for(int z = 0; z < 4; z++) {
+			for(int y = 0; y < 4; y++) {
+				for(int x = 0; x < 4; x++) {
+					float weight = u[x]*v[y]*w[z];
+					r += weight*svm_image_texture_read_3d(kg, id, ix+x-1, iy+y-1, iz+z-1);
+				}
+			}
+		}
+		return r;
+	}
+}
+
+#undef SET_CUBIC_SPLINE_WEIGHTS
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
new file mode 100644
index 00000000000..fa210e747c0
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_path_init.h"
+
+#define KERNEL_NAME path_init
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
index 3156dc255fb..68ee6f1d536 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
@@ -14,93 +14,13 @@
  * limitations under the License.
  */
 
-#include "../../kernel_compat_opencl.h"
-#include "../../kernel_math.h"
-#include "../../kernel_types.h"
-#include "../../kernel_globals.h"
-#include "../../kernel_queues.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_queue_enqueue.h"
 
-/*
- * The kernel "kernel_queue_enqueue" enqueues rays of
- * different ray state into their appropriate Queues;
- * 1. Rays that have been determined to hit the background from the
- * "kernel_scene_intersect" kernel
- * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
- * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * The input and output of the kernel is as follows,
- *
- * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                           |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                           |
- * queuesize -------------------------------------------|                           |
- *
- * Note on Queues :
- * State of queues during the first time this kernel is called :
- * At entry,
- * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
- * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays.
- *
- * State of queue during other times this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
- */
-__kernel void kernel_ocl_path_trace_queue_enqueue(
-        ccl_global int *Queue_data,   /* Queue memory */
-        ccl_global int *Queue_index,  /* Tracks the number of elements in each queue */
-        ccl_global char *ray_state,   /* Denotes the state of each ray */
-        int queuesize)                /* Size (capacity) of each queue */
-{
-	/* We have only 2 cases (Hit/Not-Hit) */
-	ccl_local unsigned int local_queue_atomics[2];
-
-	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-
-	if(lidx < 2 ) {
-		local_queue_atomics[lidx] = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int queue_number = -1;
-
-	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-		queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
-	}
-	else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
-	}
-
-	unsigned int my_lqidx;
-	if(queue_number != -1) {
-		my_lqidx = get_local_queue_index(queue_number, local_queue_atomics);
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	if(lidx == 0) {
-		local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] =
-		        get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-		                                    local_queue_atomics,
-		                                    Queue_index);
-		local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] =
-		        get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-		                                    local_queue_atomics,
-		                                    Queue_index);
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
+#define KERNEL_NAME queue_enqueue
+#define LOCALS_TYPE QueueEnqueueLocals
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
 
-	unsigned int my_gqidx;
-	if(queue_number != -1) {
-		my_gqidx = get_global_queue_index(queue_number,
-		                                  queuesize,
-		                                  my_lqidx,
-		                                  local_queue_atomics);
-		Queue_data[my_gqidx] = ray_index;
-	}
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
index 7f3f433c7a6..10d09377ba9 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
@@ -14,67 +14,11 @@
  * limitations under the License.
  */
 
-#include "split/kernel_scene_intersect.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_scene_intersect.h"
 
-__kernel void kernel_ocl_path_trace_scene_intersect(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global uint *rng_coop,
-        ccl_global Ray *Ray_coop,              /* Required for scene_intersect */
-        ccl_global PathState *PathState_coop,  /* Required for scene_intersect */
-        Intersection *Intersection_coop,       /* Required for scene_intersect */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int sw, int sh,
-        ccl_global int *Queue_data,            /* Memory for queues */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in queues */
-        int queuesize,                         /* Size (capacity) of queues */
-        ccl_global char *use_queues_flag,      /* used to decide if this kernel should use
-                                                * queues to fetch ray index */
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples)                  /* Number of samples to be processed in parallel */
-{
-	int x = get_global_id(0);
-	int y = get_global_id(1);
+#define KERNEL_NAME scene_intersect
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
 
-	/* Fetch use_queues_flag */
-	ccl_local char local_use_queues_flag;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_use_queues_flag = use_queues_flag[0];
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int ray_index;
-	if(local_use_queues_flag) {
-		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-		ray_index = get_ray_index(thread_index,
-		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-		                          Queue_data,
-		                          queuesize,
-		                          0);
-
-		if(ray_index == QUEUE_EMPTY_SLOT) {
-			return;
-		}
-	} else {
-		if(x < (sw * parallel_samples) && y < sh) {
-			ray_index = x + y * (sw * parallel_samples);
-		} else {
-			return;
-		}
-	}
-
-	kernel_scene_intersect((KernelGlobals *)kg,
-	                       rng_coop,
-	                       Ray_coop,
-	                       PathState_coop,
-	                       Intersection_coop,
-	                       ray_state,
-	                       sw, sh,
-	                       use_queues_flag,
-#ifdef __KERNEL_DEBUG__
-	                       debugdata_coop,
-#endif
-	                       ray_index);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
index c37856c8f30..40eaa561863 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
@@ -14,55 +14,11 @@
  * limitations under the License.
  */
 
-#include "split/kernel_shader_eval.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shader_eval.h"
 
-__kernel void kernel_ocl_path_trace_shader_eval(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global char *sd,                   /* Output ShaderData structure to be filled */
-        ccl_global uint *rng_coop,             /* Required for rbsdf calculation */
-        ccl_global Ray *Ray_coop,              /* Required for setting up shader from ray */
-        ccl_global PathState *PathState_coop,  /* Required for all functions in this kernel */
-        Intersection *Intersection_coop,       /* Required for setting up shader from ray */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        ccl_global int *Queue_data,            /* queue memory */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
-        int queuesize)                         /* Size (capacity) of each queue */
-{
-	/* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
-	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
+#define KERNEL_NAME shader_eval
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
 
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          0);
-
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-
-	char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics,
-	                        Queue_data,
-	                        Queue_index);
-
-	/* Continue on with shader evaluation. */
-	kernel_shader_eval((KernelGlobals *)kg,
-	                   (ShaderData *)sd,
-	                   rng_coop,
-	                   Ray_coop,
-	                   PathState_coop,
-	                   Intersection_coop,
-	                   ray_state,
-	                   ray_index);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
new file mode 100644
index 00000000000..8c36100f762
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shader_setup.h"
+
+#define KERNEL_NAME shader_setup
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
new file mode 100644
index 00000000000..bcacaa4a054
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shader_sort.h"
+
+__attribute__((reqd_work_group_size(64, 1, 1)))
+#define KERNEL_NAME shader_sort
+#define LOCALS_TYPE ShaderSortLocals
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
deleted file mode 100644
index edf76fba714..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "split/kernel_shadow_blocked.h"
-
-__kernel void kernel_ocl_path_trace_shadow_blocked(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global PathState *PathState_coop,  /* Required for shadow blocked */
-        ccl_global Ray *LightRay_dl_coop,      /* Required for direct lighting's shadow blocked */
-        ccl_global Ray *LightRay_ao_coop,      /* Required for AO's shadow blocked */
-        ccl_global char *ray_state,
-        ccl_global int *Queue_data,            /* Queue memory */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
-        int queuesize)                         /* Size (capacity) of each queue */
-{
-	int lidx = get_local_id(1) * get_local_id(0) + get_local_id(0);
-
-	ccl_local unsigned int ao_queue_length;
-	ccl_local unsigned int dl_queue_length;
-	if(lidx == 0) {
-		ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
-		dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	/* flag determining if the current ray is to process shadow ray for AO or DL */
-	char shadow_blocked_type = -1;
-
-	int ray_index = QUEUE_EMPTY_SLOT;
-	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	if(thread_index < ao_queue_length + dl_queue_length) {
-		if(thread_index < ao_queue_length) {
-			ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1);
-			shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO;
-		} else {
-			ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1);
-			shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL;
-		}
-	}
-
-	if(ray_index == QUEUE_EMPTY_SLOT)
-		return;
-
-	kernel_shadow_blocked((KernelGlobals *)kg,
-	                      PathState_coop,
-	                      LightRay_dl_coop,
-	                      LightRay_ao_coop,
-	                      ray_state,
-	                      shadow_blocked_type,
-	                      ray_index);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
new file mode 100644
index 00000000000..8de250a375c
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shadow_blocked_ao.h"
+
+#define KERNEL_NAME shadow_blocked_ao
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
new file mode 100644
index 00000000000..29da77022ed
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shadow_blocked_dl.h"
+
+#define KERNEL_NAME shadow_blocked_dl
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
new file mode 100644
index 00000000000..4cbda1bc2e7
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"  // PRECOMPILED
+#include "kernel/split/kernel_split_common.h"  // PRECOMPILED
+
+#include "kernel/kernels/opencl/kernel_state_buffer_size.cl"
+#include "kernel/kernels/opencl/kernel_data_init.cl"
+#include "kernel/kernels/opencl/kernel_path_init.cl"
+
+#include "kernel/kernels/opencl/kernel_scene_intersect.cl"
+#include "kernel/kernels/opencl/kernel_lamp_emission.cl"
+#include "kernel/kernels/opencl/kernel_do_volume.cl"
+#include "kernel/kernels/opencl/kernel_indirect_background.cl"
+#include "kernel/kernels/opencl/kernel_queue_enqueue.cl"
+#include "kernel/kernels/opencl/kernel_shader_setup.cl"
+#include "kernel/kernels/opencl/kernel_shader_sort.cl"
+#include "kernel/kernels/opencl/kernel_shader_eval.cl"
+#include "kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl"
+#include "kernel/kernels/opencl/kernel_subsurface_scatter.cl"
+#include "kernel/kernels/opencl/kernel_direct_lighting.cl"
+#include "kernel/kernels/opencl/kernel_shadow_blocked_ao.cl"
+#include "kernel/kernels/opencl/kernel_shadow_blocked_dl.cl"
+#include "kernel/kernels/opencl/kernel_enqueue_inactive.cl"
+#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl"
+#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl"
+#include "kernel/kernels/opencl/kernel_buffer_update.cl"
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
new file mode 100644
index 00000000000..6aa7681cbed
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define KERNEL_NAME_JOIN(a, b) a ## _ ## b  /* pastes the two tokens into a_b */
+#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b)  /* extra level so macro arguments such as KERNEL_NAME are expanded before pasting */
+/* Entry-point template: the includer defines KERNEL_NAME (optionally LOCALS_TYPE); this emits kernel_ocl_path_trace_<KERNEL_NAME>, which forwards to kernel_<KERNEL_NAME>. */
+__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)(
+		ccl_global char *kg_global,  /* cast to KernelGlobals* below */
+		ccl_constant KernelData *data,  /* stored into kg->data */
+
+		ccl_global void *split_data_buffer,  /* forwarded to split_data_init() */
+		ccl_global char *ray_state,  /* forwarded to split_data_init() */
+
+		KERNEL_BUFFER_PARAMS,  /* project macro; presumably the parameter list matching KERNEL_BUFFER_ARGS used below */
+
+		ccl_global int *queue_index,  /* the four pointers below are recorded in kernel_split_params */
+		ccl_global char *use_queues_flag,
+		ccl_global unsigned int *work_pools,
+		ccl_global float *buffer
+	)
+{
+#ifdef LOCALS_TYPE
+	ccl_local LOCALS_TYPE locals;  /* only declared when the includer defines LOCALS_TYPE; passed by address to the kernel function */
+#endif
+
+	KernelGlobals *kg = (KernelGlobals*)kg_global;
+
+	if(ccl_local_id(0) + ccl_local_id(1) == 0) {  /* sum is 0 only when both local ids are 0 (assuming ids are non-negative) */
+		kg->data = data;
+
+		kernel_split_params.queue_index = queue_index;
+		kernel_split_params.use_queues_flag = use_queues_flag;
+		kernel_split_params.work_pools = work_pools;
+		kernel_split_params.tile.buffer = buffer;
+
+		split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state);  /* element count passed is the product of the two global sizes */
+
+	}  /* NOTE(review): no barrier is visible here after the init above - confirm other work-items cannot read the state first */
+
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);  /* called by every work-item, not only the one that ran the init above */
+
+	KERNEL_NAME_EVAL(kernel, KERNEL_NAME)(  /* expands to kernel_<KERNEL_NAME>( */
+			kg
+#ifdef LOCALS_TYPE
+			, &locals
+#endif
+		);
+}
+
+#undef KERNEL_NAME_JOIN  /* helper macros are dropped at the end of this header */
+#undef KERNEL_NAME_EVAL
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl
new file mode 100644
index 00000000000..c10ecc426c6
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+
+__kernel void kernel_ocl_path_trace_state_buffer_size(  /* writes the result of split_data_buffer_size() for num_threads into *size */
+        ccl_global char *kg,  /* reinterpreted as KernelGlobals* below */
+        ccl_constant KernelData *data,  /* stored into the globals before the size query */
+        uint num_threads,  /* forwarded unchanged to split_data_buffer_size() */
+        ccl_global uint64_t *size)  /* output location */
+{
+	((KernelGlobals*)kg)->data = data;  /* set first; presumably split_data_buffer_size() reads kg->data - TODO confirm */
+	*size = split_data_buffer_size((KernelGlobals*)kg, num_threads);
+}
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
new file mode 100644
index 00000000000..2b3be38df84
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_subsurface_scatter.h"
+
+#define KERNEL_NAME subsurface_scatter
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
deleted file mode 100644
index 88a1ed830af..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "split/kernel_sum_all_radiance.h"
-
-__kernel void kernel_ocl_path_trace_sum_all_radiance(
-        ccl_constant KernelData *data,               /* To get pass_stride to offet into buffer */
-        ccl_global float *buffer,                    /* Output buffer of RenderTile */
-        ccl_global float *per_sample_output_buffer,  /* Radiance contributed by all samples */
-        int parallel_samples, int sw, int sh, int stride,
-        int buffer_offset_x,
-        int buffer_offset_y,
-        int buffer_stride,
-        int start_sample)
-{
-	kernel_sum_all_radiance(data,
-	                        buffer,
-	                        per_sample_output_buffer,
-	                        parallel_samples,
-	                        sw, sh, stride,
-	                        buffer_offset_x,
-	                        buffer_offset_y,
-	                        buffer_stride,
-	                        start_sample);
-}
diff --git a/intern/cycles/kernel/osl/CMakeLists.txt b/intern/cycles/kernel/osl/CMakeLists.txt
index 98de40e5a8a..159de63a044 100644
--- a/intern/cycles/kernel/osl/CMakeLists.txt
+++ b/intern/cycles/kernel/osl/CMakeLists.txt
@@ -1,12 +1,6 @@
 
 set(INC
-	.
-	..
-	../svm
-	../../graph
-	../../render
-	../../util
-	../../device
+	../..
 )
 
 set(INC_SYS
@@ -36,5 +30,5 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${RTTI_DISABLE_FLAGS}")
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
-add_library(cycles_kernel_osl ${SRC} ${HEADER_SRC})
+cycles_add_library(cycles_kernel_osl ${SRC} ${HEADER_SRC})
 
diff --git a/intern/cycles/kernel/osl/background.cpp b/intern/cycles/kernel/osl/background.cpp
index d835f9be45c..8fff19407d9 100644
--- a/intern/cycles/kernel/osl/background.cpp
+++ b/intern/cycles/kernel/osl/background.cpp
@@ -34,10 +34,11 @@
 
 #include <OSL/genclosure.h>
 
-#include "osl_closures.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_compat_cpu.h"
-#include "closure/alloc.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/emissive.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -53,7 +54,7 @@ class GenericBackgroundClosure : public CClosurePrimitive {
 public:
 	void setup(ShaderData *sd, int /* path_flag */, float3 weight)
 	{
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, weight);
+		background_setup(sd, weight);
 	}
 };
 
diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
index bc26f42b559..ea18f2c8c86 100644
--- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
@@ -34,13 +34,13 @@
 
 #include <OSL/genclosure.h>
 
-#include "kernel_compat_cpu.h"
-#include "osl_closures.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_types.h"
-#include "kernel_montecarlo.h"
-#include "closure/alloc.h"
-#include "closure/bsdf_diffuse_ramp.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_diffuse_ramp.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
index 14c7644936e..a26671eb09e 100644
--- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
@@ -34,12 +34,12 @@
 
 #include <OSL/genclosure.h>
 
-#include "kernel_compat_cpu.h"
-#include "osl_closures.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_types.h"
-#include "closure/alloc.h"
-#include "closure/bsdf_phong_ramp.h"
+#include "kernel/kernel_types.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_phong_ramp.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/emissive.cpp b/intern/cycles/kernel/osl/emissive.cpp
index 3f13e08b302..6162786b527 100644
--- a/intern/cycles/kernel/osl/emissive.cpp
+++ b/intern/cycles/kernel/osl/emissive.cpp
@@ -34,12 +34,12 @@
 
 #include <OSL/genclosure.h>
 
-#include "osl_closures.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_compat_cpu.h"
-#include "kernel_types.h"
-#include "closure/alloc.h"
-#include "closure/emissive.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/kernel_types.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/emissive.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -56,8 +56,7 @@ class GenericEmissiveClosure : public CClosurePrimitive {
 public:
 	void setup(ShaderData *sd, int /* path_flag */, float3 weight)
 	{
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, weight);
-		sd->flag |= SD_EMISSION;
+		emission_setup(sd, weight);
 	}
 };
 
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index 3614717e28c..da7368bbc61 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -32,153 +32,104 @@
 
 #include <OSL/genclosure.h>
 
-#include "kernel_compat_cpu.h"
-#include "osl_closures.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_types.h"
-#include "kernel_montecarlo.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_montecarlo.h"
 
-#include "closure/alloc.h"
-#include "closure/bsdf_diffuse.h"
-#include "closure/bssrdf.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_util.h"
+#include "kernel/closure/bsdf_diffuse.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bssrdf.h"
 
 CCL_NAMESPACE_BEGIN
 
 using namespace OSL;
 
+static ustring u_cubic("cubic");
+static ustring u_gaussian("gaussian");
+static ustring u_burley("burley");
+static ustring u_principled("principled");
+static ustring u_random_walk("random_walk");
+static ustring u_principled_random_walk("principled_random_walk");
+
 class CBSSRDFClosure : public CClosurePrimitive {
 public:
 	Bssrdf params;
-	float3 radius;
-	float3 albedo;
+	ustring method;
 
-	void alloc(ShaderData *sd, int path_flag, float3 weight, ClosureType type)
+	CBSSRDFClosure()
 	{
-		float sample_weight = fabsf(average(weight));
-
-		/* disable in case of diffuse ancestor, can't see it well then and
-		 * adds considerably noise due to probabilities of continuing path
-		 * getting lower and lower */
-		if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR) {
-			radius = make_float3(0.0f, 0.0f, 0.0f);
-		}
-
-		if(sample_weight > CLOSURE_WEIGHT_CUTOFF) {
-			/* sharpness */
-			float sharpness = params.sharpness;
-			/* texture color blur */
-			float texture_blur = params.texture_blur;
-
-			/* create one closure per color channel */
-			Bssrdf *bssrdf = bssrdf_alloc(sd, make_float3(weight.x, 0.0f, 0.0f));
-			if(bssrdf) {
-				bssrdf->sample_weight = sample_weight;
-				bssrdf->radius = radius.x;
-				bssrdf->texture_blur = texture_blur;
-				bssrdf->albedo = albedo.x;
-				bssrdf->sharpness = sharpness;
-				bssrdf->N = params.N;
-				ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
-			}
-
-			bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f));
-			if(bssrdf) {
-				bssrdf->sample_weight = sample_weight;
-				bssrdf->radius = radius.y;
-				bssrdf->texture_blur = texture_blur;
-				bssrdf->albedo = albedo.y;
-				bssrdf->sharpness = sharpness;
-				bssrdf->N = params.N;
-				ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
-			}
-
-			bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z));
-			if(bssrdf) {
-				bssrdf->sample_weight = sample_weight;
-				bssrdf->radius = radius.z;
-				bssrdf->texture_blur = texture_blur;
-				bssrdf->albedo = albedo.z;
-				bssrdf->sharpness = sharpness;
-				bssrdf->N = params.N;
-				ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
-			}
-		}
+		params.texture_blur = 0.0f;
+		params.sharpness = 0.0f;
+		params.roughness = 0.0f;
 	}
-};
-
-/* Cubic */
 
-class CubicBSSRDFClosure : public CBSSRDFClosure {
-public:
 	void setup(ShaderData *sd, int path_flag, float3 weight)
 	{
-		alloc(sd, path_flag, weight, CLOSURE_BSSRDF_CUBIC_ID);
+		if (method == u_cubic) {
+			alloc(sd, path_flag, weight, CLOSURE_BSSRDF_CUBIC_ID);
+		}
+		else if (method == u_gaussian) {
+			alloc(sd, path_flag, weight, CLOSURE_BSSRDF_GAUSSIAN_ID);
+		}
+		else if (method == u_burley) {
+			alloc(sd, path_flag, weight, CLOSURE_BSSRDF_BURLEY_ID);
+		}
+		else if (method == u_principled) {
+			alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_ID);
+		}
+		else if (method == u_random_walk) {
+			alloc(sd, path_flag, weight, CLOSURE_BSSRDF_RANDOM_WALK_ID);
+		}
+		else if (method == u_principled_random_walk) {
+			alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID);
+		}
 	}
-};
-
-ClosureParam *closure_bssrdf_cubic_params()
-{
-	static ClosureParam params[] = {
-		CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, params.N),
-		CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, radius),
-		CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, params.texture_blur),
-		CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, params.sharpness),
-		CLOSURE_STRING_KEYPARAM(CubicBSSRDFClosure, label, "label"),
-		CLOSURE_FINISH_PARAM(CubicBSSRDFClosure)
-	};
-	return params;
-}
 
-CCLOSURE_PREPARE(closure_bssrdf_cubic_prepare, CubicBSSRDFClosure)
-
-/* Gaussian */
-
-class GaussianBSSRDFClosure : public CBSSRDFClosure {
-public:
-	void setup(ShaderData *sd, int path_flag, float3 weight)
+	void alloc(ShaderData *sd, int path_flag, float3 weight, ClosureType type)
 	{
-		alloc(sd, path_flag, weight, CLOSURE_BSSRDF_GAUSSIAN_ID);
-	}
-};
-
-ClosureParam *closure_bssrdf_gaussian_params()
-{
-	static ClosureParam params[] = {
-		CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, params.N),
-		CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, radius),
-		CLOSURE_FLOAT_PARAM(GaussianBSSRDFClosure, params.texture_blur),
-		CLOSURE_STRING_KEYPARAM(GaussianBSSRDFClosure, label, "label"),
-		CLOSURE_FINISH_PARAM(GaussianBSSRDFClosure)
-	};
-	return params;
-}
-
-CCLOSURE_PREPARE(closure_bssrdf_gaussian_prepare, GaussianBSSRDFClosure)
-
-/* Burley */
+		Bssrdf *bssrdf = bssrdf_alloc(sd, weight);
+
+		if(bssrdf) {
+			/* disable in case of diffuse ancestor, can't see it well then and
+			 * adds considerable noise due to probabilities of continuing path
+			 * getting lower and lower */
+			if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR) {
+				params.radius = make_float3(0.0f, 0.0f, 0.0f);
+			}
 
-class BurleyBSSRDFClosure : public CBSSRDFClosure {
-public:
-	void setup(ShaderData *sd, int path_flag, float3 weight)
-	{
-		alloc(sd, path_flag, weight, CLOSURE_BSSRDF_BURLEY_ID);
+			/* copy the closure parameters into the single allocated BSSRDF */
+			bssrdf->radius = params.radius;
+			bssrdf->albedo = params.albedo;
+			bssrdf->texture_blur = params.texture_blur;
+			bssrdf->sharpness = params.sharpness;
+			bssrdf->N = params.N;
+			bssrdf->roughness = params.roughness;
+			sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type);
+		}
 	}
 };
 
-ClosureParam *closure_bssrdf_burley_params()
+ClosureParam *closure_bssrdf_params()
 {
 	static ClosureParam params[] = {
-		CLOSURE_FLOAT3_PARAM(BurleyBSSRDFClosure, params.N),
-		CLOSURE_FLOAT3_PARAM(BurleyBSSRDFClosure, radius),
-		CLOSURE_FLOAT_PARAM(BurleyBSSRDFClosure, params.texture_blur),
-		CLOSURE_FLOAT3_PARAM(BurleyBSSRDFClosure, albedo),
-		CLOSURE_STRING_KEYPARAM(BurleyBSSRDFClosure, label, "label"),
-		CLOSURE_FINISH_PARAM(BurleyBSSRDFClosure)
+		CLOSURE_STRING_PARAM(CBSSRDFClosure, method),
+		CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.N),
+		CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.radius),
+		CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.albedo),
+		CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.texture_blur, "texture_blur"),
+		CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.sharpness, "sharpness"),
+		CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.roughness, "roughness"),
+		CLOSURE_STRING_KEYPARAM(CBSSRDFClosure, label, "label"),
+		CLOSURE_FINISH_PARAM(CBSSRDFClosure)
 	};
 	return params;
 }
 
-CCLOSURE_PREPARE(closure_bssrdf_burley_prepare, BurleyBSSRDFClosure)
+CCLOSURE_PREPARE(closure_bssrdf_prepare, CBSSRDFClosure)
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index 94de782dca0..ee16ddaf0fd 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -33,33 +33,35 @@
 #include <OSL/genclosure.h>
 #include <OSL/oslclosure.h>
 
-#include "osl_closures.h"
-#include "osl_shader.h"
-
-#include "util_debug.h"
-#include "util_math.h"
-#include "util_param.h"
-
-#include "kernel_types.h"
-#include "kernel_compat_cpu.h"
-#include "kernel_globals.h"
-#include "kernel_montecarlo.h"
-#include "kernel_random.h"
-
-#include "closure/alloc.h"
-#include "closure/bsdf_util.h"
-#include "closure/bsdf_ashikhmin_velvet.h"
-#include "closure/bsdf_diffuse.h"
-#include "closure/bsdf_microfacet.h"
-#include "closure/bsdf_microfacet_multi.h"
-#include "closure/bsdf_oren_nayar.h"
-#include "closure/bsdf_reflection.h"
-#include "closure/bsdf_refraction.h"
-#include "closure/bsdf_transparent.h"
-#include "closure/bsdf_ashikhmin_shirley.h"
-#include "closure/bsdf_toon.h"
-#include "closure/bsdf_hair.h"
-#include "closure/volume.h"
+#include "kernel/osl/osl_closures.h"
+#include "kernel/osl/osl_shader.h"
+
+#include "util/util_math.h"
+#include "util/util_param.h"
+
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_random.h"
+
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_util.h"
+#include "kernel/closure/bsdf_ashikhmin_velvet.h"
+#include "kernel/closure/bsdf_diffuse.h"
+#include "kernel/closure/bsdf_microfacet.h"
+#include "kernel/closure/bsdf_microfacet_multi.h"
+#include "kernel/closure/bsdf_oren_nayar.h"
+#include "kernel/closure/bsdf_reflection.h"
+#include "kernel/closure/bsdf_refraction.h"
+#include "kernel/closure/bsdf_transparent.h"
+#include "kernel/closure/bsdf_ashikhmin_shirley.h"
+#include "kernel/closure/bsdf_toon.h"
+#include "kernel/closure/bsdf_hair.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bsdf_principled_sheen.h"
+#include "kernel/closure/volume.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -89,9 +91,6 @@ BSDF_CLOSURE_CLASS_BEGIN(Refraction, refraction, MicrofacetBsdf, LABEL_SINGULAR)
 	CLOSURE_FLOAT_PARAM(RefractionClosure, params.ior),
 BSDF_CLOSURE_CLASS_END(Refraction, refraction)
 
-BSDF_CLOSURE_CLASS_BEGIN(Transparent, transparent, ShaderClosure, LABEL_SINGULAR)
-BSDF_CLOSURE_CLASS_END(Transparent, transparent)
-
 BSDF_CLOSURE_CLASS_BEGIN(AshikhminVelvet, ashikhmin_velvet, VelvetBsdf, LABEL_DIFFUSE)
 	CLOSURE_FLOAT3_PARAM(AshikhminVelvetClosure, params.N),
 	CLOSURE_FLOAT_PARAM(AshikhminVelvetClosure, params.sigma),
@@ -153,7 +152,7 @@ BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refra
 BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction)
 
 BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY)
-	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, unused),
+	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.N),
 	CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness1),
 	CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness2),
 	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T),
@@ -161,19 +160,75 @@ BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY
 BSDF_CLOSURE_CLASS_END(HairReflection, hair_reflection)
 
 BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, HairBsdf, LABEL_GLOSSY)
-	CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, unused),
+	CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, params.N),
 	CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness1),
 	CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness2),
 	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T),
 	CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.offset),
 BSDF_CLOSURE_CLASS_END(HairTransmission, hair_transmission)
 
-VOLUME_CLOSURE_CLASS_BEGIN(VolumeHenyeyGreenstein, henyey_greenstein, HenyeyGreensteinVolume, LABEL_VOLUME_SCATTER)
-	CLOSURE_FLOAT_PARAM(VolumeHenyeyGreensteinClosure, params.g),
-VOLUME_CLOSURE_CLASS_END(VolumeHenyeyGreenstein, henyey_greenstein)
+BSDF_CLOSURE_CLASS_BEGIN(PrincipledDiffuse, principled_diffuse, PrincipledDiffuseBsdf, LABEL_DIFFUSE)
+	CLOSURE_FLOAT3_PARAM(PrincipledDiffuseClosure, params.N),
+	CLOSURE_FLOAT_PARAM(PrincipledDiffuseClosure, params.roughness),
+BSDF_CLOSURE_CLASS_END(PrincipledDiffuse, principled_diffuse)
+
+BSDF_CLOSURE_CLASS_BEGIN(PrincipledSheen, principled_sheen, PrincipledSheenBsdf, LABEL_DIFFUSE)
+	CLOSURE_FLOAT3_PARAM(PrincipledSheenClosure, params.N),
+BSDF_CLOSURE_CLASS_END(PrincipledSheen, principled_sheen)
+
+/* Disney principled clearcoat: OSL closure that fills a MicrofacetBsdf (plus MicrofacetExtra) and sets it up with bsdf_microfacet_ggx_clearcoat_setup(). */
+class PrincipledClearcoatClosure : public CBSDFClosure {
+public:
+	MicrofacetBsdf params;  /* passed to bsdf_alloc_osl(); params.N is filled from the closure arguments */
+	float clearcoat, clearcoat_roughness;  /* closure arguments, see closure_bsdf_principled_clearcoat_params() */
+
+	MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)  /* returns NULL if either allocation fails; path_flag is not used here */
+	{
+		MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params);
+		if(!bsdf) {
+			return NULL;
+		}
+
+		MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+		if(!extra) {
+			return NULL;
+		}
+
+		bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+		bsdf->extra = extra;
+		bsdf->ior = 1.5f;  /* hard-coded, not a closure argument */
+		bsdf->alpha_x = clearcoat_roughness;  /* same value for both alphas */
+		bsdf->alpha_y = clearcoat_roughness;
+		bsdf->extra->color = make_float3(0.0f, 0.0f, 0.0f);
+		bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f);  /* presumably normal-incidence reflectance for ior 1.5: ((1.5-1)/(1.5+1))^2 = 0.04 */
+		bsdf->extra->clearcoat = clearcoat;
+		return bsdf;
+	}
+
+	void setup(ShaderData *sd, int path_flag, float3 weight)  /* allocates the closure and ORs the setup flags into sd->flag */
+	{
+		MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+		if(!bsdf) {
+			return;  /* allocation failed: sd->flag is left unchanged */
+		}
+
+		sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd);
+	}
+};
+
+ClosureParam *closure_bsdf_principled_clearcoat_params()  /* parameter table handed to register_closure() for "principled_clearcoat" */
+{
+	static ClosureParam params[] = {  /* static, so the returned pointer stays valid after the call */
+		CLOSURE_FLOAT3_PARAM(PrincipledClearcoatClosure, params.N),  /* entries map closure arguments onto class members, in this order */
+		CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat),
+		CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat_roughness),
+		CLOSURE_STRING_KEYPARAM(PrincipledClearcoatClosure, label, "label"),  /* keyed by the name "label" */
+		CLOSURE_FINISH_PARAM(PrincipledClearcoatClosure)  /* terminates the table */
+	};
+	return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_principled_clearcoat_prepare, PrincipledClearcoatClosure)  /* defines the prepare function registered alongside the table */
 
-VOLUME_CLOSURE_CLASS_BEGIN(VolumeAbsorption, absorption, ShaderClosure, LABEL_SINGULAR)
-VOLUME_CLOSURE_CLASS_END(VolumeAbsorption, absorption)
 
 /* Registration */
 
@@ -182,7 +237,11 @@ static void register_closure(OSL::ShadingSystem *ss, const char *name, int id, O
 	/* optimization: it's possible to not use a prepare function at all and
 	 * only initialize the actual class when accessing the closure component
 	 * data, but then we need to map the id to the class somehow */
+#if OSL_LIBRARY_VERSION_CODE >= 10900
+	ss->register_closure(name, id, params, prepare, NULL);
+#else
 	ss->register_closure(name, id, params, prepare, NULL, 16);
+#endif
 }
 
 void OSLShader::register_closures(OSLShadingSystem *ss_)
@@ -201,7 +260,7 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
 	register_closure(ss, "refraction", id++,
 		bsdf_refraction_params(), bsdf_refraction_prepare);
 	register_closure(ss, "transparent", id++,
-		bsdf_transparent_params(), bsdf_transparent_prepare);
+		closure_bsdf_transparent_params(), closure_bsdf_transparent_prepare);
 	register_closure(ss, "microfacet_ggx", id++,
 		bsdf_microfacet_ggx_params(), bsdf_microfacet_ggx_prepare);
 	register_closure(ss, "microfacet_ggx_aniso", id++,
@@ -214,6 +273,16 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
 		closure_bsdf_microfacet_multi_ggx_glass_params(), closure_bsdf_microfacet_multi_ggx_glass_prepare);
 	register_closure(ss, "microfacet_multi_ggx_aniso", id++,
 		closure_bsdf_microfacet_multi_ggx_aniso_params(), closure_bsdf_microfacet_multi_ggx_aniso_prepare);
+	register_closure(ss, "microfacet_ggx_fresnel", id++,
+		closure_bsdf_microfacet_ggx_fresnel_params(), closure_bsdf_microfacet_ggx_fresnel_prepare);
+	register_closure(ss, "microfacet_ggx_aniso_fresnel", id++,
+		closure_bsdf_microfacet_ggx_aniso_fresnel_params(), closure_bsdf_microfacet_ggx_aniso_fresnel_prepare);
+	register_closure(ss, "microfacet_multi_ggx_fresnel", id++,
+		closure_bsdf_microfacet_multi_ggx_fresnel_params(), closure_bsdf_microfacet_multi_ggx_fresnel_prepare);
+	register_closure(ss, "microfacet_multi_ggx_glass_fresnel", id++,
+		closure_bsdf_microfacet_multi_ggx_glass_fresnel_params(), closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare);
+	register_closure(ss, "microfacet_multi_ggx_aniso_fresnel", id++,
+		closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params(), closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare);
 	register_closure(ss, "microfacet_beckmann", id++,
 		bsdf_microfacet_beckmann_params(), bsdf_microfacet_beckmann_prepare);
 	register_closure(ss, "microfacet_beckmann_aniso", id++,
@@ -228,6 +297,12 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
 		bsdf_diffuse_toon_params(), bsdf_diffuse_toon_prepare);
 	register_closure(ss, "glossy_toon", id++,
 		bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare);
+	register_closure(ss, "principled_diffuse", id++,
+		bsdf_principled_diffuse_params(), bsdf_principled_diffuse_prepare);
+	register_closure(ss, "principled_sheen", id++,
+		bsdf_principled_sheen_params(), bsdf_principled_sheen_prepare);
+	register_closure(ss, "principled_clearcoat", id++,
+		closure_bsdf_principled_clearcoat_params(), closure_bsdf_principled_clearcoat_prepare);
 
 	register_closure(ss, "emission", id++,
 		closure_emission_params(), closure_emission_prepare);
@@ -241,12 +316,8 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
 		closure_bsdf_diffuse_ramp_params(), closure_bsdf_diffuse_ramp_prepare);
 	register_closure(ss, "phong_ramp", id++,
 		closure_bsdf_phong_ramp_params(), closure_bsdf_phong_ramp_prepare);
-	register_closure(ss, "bssrdf_cubic", id++,
-		closure_bssrdf_cubic_params(), closure_bssrdf_cubic_prepare);
-	register_closure(ss, "bssrdf_gaussian", id++,
-		closure_bssrdf_gaussian_params(), closure_bssrdf_gaussian_prepare);
-	register_closure(ss, "bssrdf_burley", id++,
-		closure_bssrdf_burley_params(), closure_bssrdf_burley_prepare);
+	register_closure(ss, "bssrdf", id++,
+		closure_bssrdf_params(), closure_bssrdf_prepare);
 
 	register_closure(ss, "hair_reflection", id++,
 		bsdf_hair_reflection_params(), bsdf_hair_reflection_prepare);
@@ -254,9 +325,9 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
 		bsdf_hair_transmission_params(), bsdf_hair_transmission_prepare);
 
 	register_closure(ss, "henyey_greenstein", id++,
-		volume_henyey_greenstein_params(), volume_henyey_greenstein_prepare);
+		closure_henyey_greenstein_params(), closure_henyey_greenstein_prepare);
 	register_closure(ss, "absorption", id++,
-		volume_absorption_params(), volume_absorption_prepare);
+		closure_absorption_params(), closure_absorption_prepare);
 }
 
 /* BSDF Closure */
@@ -277,6 +348,103 @@ bool CBSDFClosure::skip(const ShaderData *sd, int path_flag, int scattering)
 	return false;
 }
 
+
+/* GGX closures with Fresnel */
+
+class MicrofacetFresnelClosure : public CBSDFClosure {
+public:
+	MicrofacetBsdf params;
+	float3 color;
+	float3 cspec0;
+
+	MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)
+	{
+		/* Labelled statically as glossy reflection. The note this was copied
+		 * from (MultiGGX Glass may also transmit) says the label is only
+		 * used for caustic flags. */
+		if(skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
+			return NULL;
+		}
+
+		MicrofacetBsdf *closure = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params);
+		if(closure == NULL) {
+			return NULL;
+		}
+
+		MicrofacetExtra *extra_data = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+		if(extra_data == NULL) {
+			return NULL;
+		}
+
+		extra_data->color = color;
+		extra_data->cspec0 = cspec0;
+		extra_data->clearcoat = 0.0f;
+		closure->extra = extra_data;
+		return closure;
+	}
+};
+
+class MicrofacetGGXFresnelClosure : public MicrofacetFresnelClosure {
+public:
+	/* Isotropic variant: no tangent, and alpha_y mirrors alpha_x. */
+	void setup(ShaderData *sd, int path_flag, float3 weight)
+	{
+		MicrofacetBsdf *closure = alloc(sd, path_flag, weight);
+		if(closure == NULL) {
+			return;
+		}
+		closure->alpha_y = closure->alpha_x;
+		closure->T = make_float3(0.0f, 0.0f, 0.0f);
+		sd->flag |= bsdf_microfacet_ggx_fresnel_setup(closure, sd);
+	}
+};
+
+ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params() /* parameter table for MicrofacetGGXFresnelClosure */
+{
+	static ClosureParam params[] = { /* static storage: the returned pointer outlives the call */
+		CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.N),
+		CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_x),
+		CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.ior),
+		CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, color),
+		CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, cspec0),
+		CLOSURE_STRING_KEYPARAM(MicrofacetGGXFresnelClosure, label, "label"), /* keyed by the string "label" */
+		CLOSURE_FINISH_PARAM(MicrofacetGGXFresnelClosure) /* closing entry of the table */
+	};
+	return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_fresnel_prepare, MicrofacetGGXFresnelClosure); /* defines the prepare callback declared in osl_closures.h */
+
+class MicrofacetGGXAnisoFresnelClosure : public MicrofacetFresnelClosure {
+public:
+	void setup(ShaderData *sd, int path_flag, float3 weight)
+	{
+		/* T, alpha_x and alpha_y are used exactly as alloc() delivers them. */
+		MicrofacetBsdf *closure = alloc(sd, path_flag, weight);
+		if(closure == NULL) {
+			return;
+		}
+		sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(closure, sd);
+	}
+};
+
+ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params() /* parameter table for MicrofacetGGXAnisoFresnelClosure */
+{
+	static ClosureParam params[] = { /* offsets and size are taken from the class that CCLOSURE_PREPARE instantiates below */
+		CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoFresnelClosure, params.N),
+		CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoFresnelClosure, params.T),
+		CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoFresnelClosure, params.alpha_x),
+		CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoFresnelClosure, params.alpha_y),
+		CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoFresnelClosure, params.ior),
+		CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoFresnelClosure, color),
+		CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoFresnelClosure, cspec0),
+		CLOSURE_STRING_KEYPARAM(MicrofacetGGXAnisoFresnelClosure, label, "label"), /* keyed by the string "label" */
+		CLOSURE_FINISH_PARAM(MicrofacetGGXAnisoFresnelClosure) /* closing entry of the table */
+	};
+	return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_aniso_fresnel_prepare, MicrofacetGGXAnisoFresnelClosure);
+
+
 /* Multiscattering GGX closures */
 
 class MicrofacetMultiClosure : public CBSDFClosure {
@@ -286,20 +454,28 @@ public:
 
 	MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)
 	{
-		/* Technically, the MultiGGX Glass closure may also transmit. However,
+		/* Technically, the MultiGGX closure may also transmit. However,
 		 * since this is set statically and only used for caustic flags, this
 		 * is probably as good as it gets. */
-	    if(!skip(sd, path_flag, LABEL_GLOSSY|LABEL_REFLECT)) {
-			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params);
-			MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
-			if(bsdf && extra) {
-				bsdf->extra = extra;
-				bsdf->extra->color = color;
-				return bsdf;
-			}
+	    if(skip(sd, path_flag, LABEL_GLOSSY|LABEL_REFLECT)) {
+			return NULL;
+		}
+
+		MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params);
+		if(!bsdf) {
+			return NULL;
+		}
+
+		MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+		if(!extra) {
+			return NULL;
 		}
 
-		return NULL;
+		bsdf->extra = extra;
+		bsdf->extra->color = color;
+		bsdf->extra->cspec0 = make_float3(0.0f, 0.0f, 0.0f);
+		bsdf->extra->clearcoat = 0.0f;
+		return bsdf;
 	}
 };
 
@@ -308,7 +484,14 @@ public:
 	void setup(ShaderData *sd, int path_flag, float3 weight)
 	{
 		MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
-		sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_setup(bsdf) : 0;
+		if(!bsdf) {
+			return;
+		}
+
+		bsdf->ior = 0.0f;
+		bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+		bsdf->alpha_y = bsdf->alpha_x;
+		sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf);
 	}
 };
 
@@ -330,7 +513,12 @@ public:
 	void setup(ShaderData *sd, int path_flag, float3 weight)
 	{
 		MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
-		sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_aniso_setup(bsdf) : 0;
+		if(!bsdf) {
+			return;
+		}
+
+		bsdf->ior = 0.0f;
+		sd->flag |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf);
 	}
 };
 
@@ -356,7 +544,13 @@ public:
 	void setup(ShaderData *sd, int path_flag, float3 weight)
 	{
 		MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
-		sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_glass_setup(bsdf) : 0;
+		if(!bsdf) {
+			return;
+		}
+
+		bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+		bsdf->alpha_y = bsdf->alpha_x;
+		sd->flag |= bsdf_microfacet_multi_ggx_glass_setup(bsdf);
 	}
 };
 
@@ -374,5 +568,208 @@ ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params()
 }
 CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_prepare, MicrofacetMultiGGXGlassClosure);
 
+
+/* Multiscattering GGX closures with Fresnel */
+
+class MicrofacetMultiFresnelClosure : public CBSDFClosure {
+public:
+	MicrofacetBsdf params;
+	float3 color;
+	float3 cspec0;
+
+	MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)
+	{
+		/* Labelled as glossy reflection even though a MultiGGX closure may
+		 * also transmit: the label is static and only feeds the caustic
+		 * flags, so this approximation is accepted. */
+		if(skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
+			return NULL;
+		}
+
+		MicrofacetBsdf *closure = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params);
+		if(closure == NULL) {
+			return NULL;
+		}
+
+		MicrofacetExtra *extra_data = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+		if(extra_data == NULL) {
+			return NULL;
+		}
+
+		extra_data->color = color;
+		extra_data->cspec0 = cspec0;
+		extra_data->clearcoat = 0.0f;
+		closure->extra = extra_data;
+		return closure;
+	}
+};
+
+class MicrofacetMultiGGXFresnelClosure : public MicrofacetMultiFresnelClosure {
+public:
+	/* Isotropic variant: no tangent, and alpha_y mirrors alpha_x. */
+	void setup(ShaderData *sd, int path_flag, float3 weight)
+	{
+		MicrofacetBsdf *closure = alloc(sd, path_flag, weight);
+		if(closure == NULL) {
+			return;
+		}
+		closure->alpha_y = closure->alpha_x;
+		closure->T = make_float3(0.0f, 0.0f, 0.0f);
+		sd->flag |= bsdf_microfacet_multi_ggx_fresnel_setup(closure, sd);
+	}
+};
+
+ClosureParam *closure_bsdf_microfacet_multi_ggx_fresnel_params() /* parameter table for MicrofacetMultiGGXFresnelClosure */
+{
+	static ClosureParam params[] = { /* static storage: the returned pointer outlives the call */
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N),
+		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x),
+		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior),
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color),
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0),
+		CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"), /* keyed by the string "label" */
+		CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure) /* closing entry of the table */
+	};
+	return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_fresnel_prepare, MicrofacetMultiGGXFresnelClosure); /* defines the prepare callback declared in osl_closures.h */
+
+class MicrofacetMultiGGXAnisoFresnelClosure : public MicrofacetMultiFresnelClosure {
+public:
+	void setup(ShaderData *sd, int path_flag, float3 weight)
+	{
+		/* T, alpha_x and alpha_y are used exactly as alloc() delivers them. */
+		MicrofacetBsdf *closure = alloc(sd, path_flag, weight);
+		if(closure == NULL) {
+			return;
+		}
+		sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(closure, sd);
+	}
+};
+
+ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params() /* parameter table for MicrofacetMultiGGXAnisoFresnelClosure */
+{
+	static ClosureParam params[] = { /* offsets and size are taken from the class that CCLOSURE_PREPARE instantiates below */
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXAnisoFresnelClosure, params.N),
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXAnisoFresnelClosure, params.T),
+		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXAnisoFresnelClosure, params.alpha_x),
+		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXAnisoFresnelClosure, params.alpha_y),
+		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXAnisoFresnelClosure, params.ior),
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXAnisoFresnelClosure, color),
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXAnisoFresnelClosure, cspec0),
+		CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXAnisoFresnelClosure, label, "label"), /* keyed by the string "label" */
+		CLOSURE_FINISH_PARAM(MicrofacetMultiGGXAnisoFresnelClosure) /* closing entry of the table */
+	};
+	return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare, MicrofacetMultiGGXAnisoFresnelClosure);
+
+class MicrofacetMultiGGXGlassFresnelClosure : public MicrofacetMultiFresnelClosure {
+public:
+	MicrofacetMultiGGXGlassFresnelClosure() : MicrofacetMultiFresnelClosure() {}
+
+	void setup(ShaderData *sd, int path_flag, float3 weight)
+	{
+		MicrofacetBsdf *closure = alloc(sd, path_flag, weight);
+		if(closure == NULL) {
+			return;
+		}
+		/* Single roughness value and no tangent for the glass variant. */
+		closure->alpha_y = closure->alpha_x;
+		closure->T = make_float3(0.0f, 0.0f, 0.0f);
+		sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(closure, sd);
+	}
+};
+
+ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params() /* parameter table for MicrofacetMultiGGXGlassFresnelClosure */
+{
+	static ClosureParam params[] = { /* offsets and size are taken from the class that CCLOSURE_PREPARE instantiates below */
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXGlassFresnelClosure, params.N),
+		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXGlassFresnelClosure, params.alpha_x),
+		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXGlassFresnelClosure, params.ior),
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXGlassFresnelClosure, color),
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXGlassFresnelClosure, cspec0),
+		CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXGlassFresnelClosure, label, "label"), /* keyed by the string "label" */
+		CLOSURE_FINISH_PARAM(MicrofacetMultiGGXGlassFresnelClosure) /* closing entry of the table */
+	};
+	return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare, MicrofacetMultiGGXGlassFresnelClosure);
+
+/* Transparent */
+
+class TransparentClosure : public CBSDFClosure {
+public:
+	ShaderClosure params; /* not read by setup() */
+	float3 unused; /* not referenced anywhere in this class */
+
+	void setup(ShaderData *sd, int path_flag, float3 weight)
+	{
+		bsdf_transparent_setup(sd, weight, path_flag); /* the only call made here; note the (weight, path_flag) argument order */
+	}
+};
+
+ClosureParam *closure_bsdf_transparent_params() /* parameter table for TransparentClosure */
+{
+	static ClosureParam params[] = { /* static storage: the returned pointer outlives the call */
+		CLOSURE_STRING_KEYPARAM(TransparentClosure, label, "label"), /* keyed by the string "label" */
+		CLOSURE_FINISH_PARAM(TransparentClosure) /* closing entry of the table */
+	};
+	return params;
+}
+
+CCLOSURE_PREPARE(closure_bsdf_transparent_prepare, TransparentClosure) /* defines closure_bsdf_transparent_prepare() */
+
+/* Volume */
+
+class VolumeAbsorptionClosure : public CBSDFClosure {
+public:
+	void setup(ShaderData *sd, int path_flag, float3 weight) /* path_flag is not used in the body */
+	{
+		volume_extinction_setup(sd, weight); /* the only call made here */
+	}
+};
+
+ClosureParam *closure_absorption_params() /* parameter table for VolumeAbsorptionClosure */
+{
+	static ClosureParam params[] = { /* static storage: the returned pointer outlives the call */
+		CLOSURE_STRING_KEYPARAM(VolumeAbsorptionClosure, label, "label"), /* keyed by the string "label" */
+		CLOSURE_FINISH_PARAM(VolumeAbsorptionClosure) /* closing entry of the table */
+	};
+	return params;
+}
+
+CCLOSURE_PREPARE(closure_absorption_prepare, VolumeAbsorptionClosure) /* defines closure_absorption_prepare() */
+
+class VolumeHenyeyGreensteinClosure : public CBSDFClosure {
+public:
+	HenyeyGreensteinVolume params;
+
+	void setup(ShaderData *sd, int path_flag, float3 weight)
+	{
+		/* Extinction is set up before, and regardless of, the allocation. */
+		volume_extinction_setup(sd, weight);
+		HenyeyGreensteinVolume *phase = (HenyeyGreensteinVolume*)bsdf_alloc_osl(sd, sizeof(HenyeyGreensteinVolume), weight, &params);
+		if(phase == NULL) {
+			return;
+		}
+
+		sd->flag |= volume_henyey_greenstein_setup(phase);
+	}
+};
+
+ClosureParam *closure_henyey_greenstein_params()
+{
+	static ClosureParam params[] = {
+		CLOSURE_FLOAT_PARAM(VolumeHenyeyGreensteinClosure, params.g),
+		CLOSURE_STRING_KEYPARAM(VolumeHenyeyGreensteinClosure, label, "label"),
+		CLOSURE_FINISH_PARAM(VolumeHenyeyGreensteinClosure)
+	};
+	return params;
+}
+
+CCLOSURE_PREPARE(closure_henyey_greenstein_prepare, VolumeHenyeyGreensteinClosure)
+
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h
index cd7b33703ff..dca7e74f154 100644
--- a/intern/cycles/kernel/osl/osl_closures.h
+++ b/intern/cycles/kernel/osl/osl_closures.h
@@ -33,8 +33,8 @@
 #ifndef __OSL_CLOSURES_H__
 #define __OSL_CLOSURES_H__
 
-#include "util_types.h"
-#include "kernel_types.h"
+#include "util/util_types.h"
+#include "kernel/kernel_types.h"
 
 #include <OSL/oslclosure.h>
 #include <OSL/oslexec.h>
@@ -48,13 +48,19 @@ OSL::ClosureParam *closure_holdout_params();
 OSL::ClosureParam *closure_ambient_occlusion_params();
 OSL::ClosureParam *closure_bsdf_diffuse_ramp_params();
 OSL::ClosureParam *closure_bsdf_phong_ramp_params();
-OSL::ClosureParam *closure_bssrdf_cubic_params();
-OSL::ClosureParam *closure_bssrdf_gaussian_params();
-OSL::ClosureParam *closure_bssrdf_burley_params();
-OSL::ClosureParam *closure_henyey_greenstein_volume_params();
+OSL::ClosureParam *closure_bsdf_transparent_params();
+OSL::ClosureParam *closure_bssrdf_params();
+OSL::ClosureParam *closure_absorption_params();
+OSL::ClosureParam *closure_henyey_greenstein_params();
 OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_params();
 OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params();
 OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_params();
+OSL::ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params();
+OSL::ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params();
+OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_fresnel_params();
+OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params();
+OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params();
+OSL::ClosureParam *closure_bsdf_principled_clearcoat_params();
 
 void closure_emission_prepare(OSL::RendererServices *, int id, void *data);
 void closure_background_prepare(OSL::RendererServices *, int id, void *data);
@@ -62,13 +68,19 @@ void closure_holdout_prepare(OSL::RendererServices *, int id, void *data);
 void closure_ambient_occlusion_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bsdf_diffuse_ramp_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bsdf_phong_ramp_prepare(OSL::RendererServices *, int id, void *data);
-void closure_bssrdf_cubic_prepare(OSL::RendererServices *, int id, void *data);
-void closure_bssrdf_gaussian_prepare(OSL::RendererServices *, int id, void *data);
-void closure_bssrdf_burley_prepare(OSL::RendererServices *, int id, void *data);
-void closure_henyey_greenstein_volume_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_transparent_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bssrdf_prepare(OSL::RendererServices *, int id, void *data);
+void closure_absorption_prepare(OSL::RendererServices *, int id, void *data);
+void closure_henyey_greenstein_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bsdf_microfacet_multi_ggx_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bsdf_microfacet_multi_ggx_glass_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bsdf_microfacet_multi_ggx_aniso_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_ggx_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_ggx_aniso_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_multi_ggx_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_principled_clearcoat_prepare(OSL::RendererServices *, int id, void *data);
 
 #define CCLOSURE_PREPARE(name, classname)          \
 void name(RendererServices *, int id, void *data) \
@@ -133,36 +145,6 @@ static ClosureParam *bsdf_##lower##_params() \
 \
 CCLOSURE_PREPARE_STATIC(bsdf_##lower##_prepare, Upper##Closure)
 
-/* Volume */
-
-#define VOLUME_CLOSURE_CLASS_BEGIN(Upper, lower, structname, TYPE) \
-\
-class Upper##Closure : public CBSDFClosure { \
-public: \
-	structname params; \
-\
-	void setup(ShaderData *sd, int path_flag, float3 weight) \
-	{ \
-	    structname *volume = (structname*)bsdf_alloc_osl(sd, sizeof(structname), weight, &params); \
-		sd->flag |= (volume) ? volume_##lower##_setup(volume) : 0; \
-	} \
-}; \
-\
-static ClosureParam *volume_##lower##_params() \
-{ \
-	static ClosureParam params[] = {
-
-/* parameters */
-
-#define VOLUME_CLOSURE_CLASS_END(Upper, lower) \
-		CLOSURE_STRING_KEYPARAM(Upper##Closure, label, "label"), \
-		CLOSURE_FINISH_PARAM(Upper##Closure) \
-	}; \
-	return params; \
-} \
-\
-CCLOSURE_PREPARE_STATIC(volume_##lower##_prepare, Upper##Closure)
-
 CCL_NAMESPACE_END
 
 #endif /* __OSL_CLOSURES_H__ */
diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h
index 65cb7ecc6b4..9585d9f4825 100644
--- a/intern/cycles/kernel/osl/osl_globals.h
+++ b/intern/cycles/kernel/osl/osl_globals.h
@@ -21,10 +21,10 @@
 
 #include <OSL/oslexec.h>
 
-#include "util_map.h"
-#include "util_param.h"
-#include "util_thread.h"
-#include "util_vector.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_thread.h"
+#include "util/util_vector.h"
 
 #ifndef WIN32
 using std::isfinite;
@@ -86,7 +86,7 @@ struct OSLThreadData {
 	OSL::ShaderGlobals globals;
 	OSL::PerThreadInfo *osl_thread_info;
 	OSLTraceData tracedata;
-	OSL::ShadingContext *context[SHADER_CONTEXT_NUM];
+	OSL::ShadingContext *context;
 	OIIO::TextureSystem::Perthread *oiio_thread_info;
 };
 
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 58bbdc33920..0c5e5e30e47 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -25,33 +25,34 @@
 
 #include <string.h>
 
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-
-#include "osl_closures.h"
-#include "osl_globals.h"
-#include "osl_services.h"
-#include "osl_shader.h"
-
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_string.h"
-
-#include "kernel_compat_cpu.h"
-#include "kernel_globals.h"
-#include "kernel_random.h"
-#include "kernel_projection.h"
-#include "kernel_differential.h"
-#include "kernel_montecarlo.h"
-#include "kernel_camera.h"
-#include "kernels/cpu/kernel_cpu_image.h"
-#include "geom/geom.h"
-#include "bvh/bvh.h"
-
-#include "kernel_projection.h"
-#include "kernel_accumulate.h"
-#include "kernel_shader.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+
+#include "kernel/osl/osl_closures.h"
+#include "kernel/osl/osl_globals.h"
+#include "kernel/osl/osl_services.h"
+#include "kernel/osl/osl_shader.h"
+
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_string.h"
+
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernel_random.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_differential.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernels/cpu/kernel_cpu_image.h"
+#include "kernel/geom/geom.h"
+#include "kernel/bvh/bvh.h"
+
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_shader.h"
 
 #ifdef WITH_PTEX
 #  include <Ptexture.h>
@@ -61,11 +62,17 @@ CCL_NAMESPACE_BEGIN
 
 /* RenderServices implementation */
 
-#define COPY_MATRIX44(m1, m2)  { \
-	CHECK_TYPE(m1, OSL::Matrix44*); \
-	CHECK_TYPE(m2, Transform*); \
-	memcpy(m1, m2, sizeof(*m2)); \
-} (void)0
+static void copy_matrix(OSL::Matrix44& m, const ProjectionTransform& tfm)
+{
+	/* Transpose first, then copy the raw bytes into the OSL matrix. */
+	const ProjectionTransform transposed = projection_transpose(tfm);
+	memcpy(&m, &transposed, sizeof(m));
+}
+
+static void copy_matrix(OSL::Matrix44& m, const Transform& tfm)
+{
+	copy_matrix(m, ProjectionTransform(tfm));
+}
 
 /* static ustrings */
 ustring OSLRenderServices::u_distance("distance");
@@ -82,6 +89,7 @@ ustring OSLRenderServices::u_geom_dupli_uv("geom:dupli_uv");
 ustring OSLRenderServices::u_material_index("material:index");
 ustring OSLRenderServices::u_object_random("object:random");
 ustring OSLRenderServices::u_particle_index("particle:index");
+ustring OSLRenderServices::u_particle_random("particle:random");
 ustring OSLRenderServices::u_particle_age("particle:age");
 ustring OSLRenderServices::u_particle_lifetime("particle:lifetime");
 ustring OSLRenderServices::u_particle_location("particle:location");
@@ -95,11 +103,10 @@ ustring OSLRenderServices::u_geom_polyvertices("geom:polyvertices");
 ustring OSLRenderServices::u_geom_name("geom:name");
 ustring OSLRenderServices::u_geom_undisplaced("geom:undisplaced");
 ustring OSLRenderServices::u_is_smooth("geom:is_smooth");
-#ifdef __HAIR__
 ustring OSLRenderServices::u_is_curve("geom:is_curve");
 ustring OSLRenderServices::u_curve_thickness("geom:curve_thickness");
 ustring OSLRenderServices::u_curve_tangent_normal("geom:curve_tangent_normal");
-#endif
+ustring OSLRenderServices::u_curve_random("geom:curve_random");
 ustring OSLRenderServices::u_path_ray_length("path:ray_length");
 ustring OSLRenderServices::u_path_ray_depth("path:ray_depth");
 ustring OSLRenderServices::u_path_diffuse_depth("path:diffuse_depth");
@@ -116,6 +123,7 @@ ustring OSLRenderServices::u_I("I");
 ustring OSLRenderServices::u_u("u");
 ustring OSLRenderServices::u_v("v");
 ustring OSLRenderServices::u_empty;
+ustring OSLRenderServices::u_at_bevel("@bevel");
 
 OSLRenderServices::OSLRenderServices()
 {
@@ -165,14 +173,12 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 #else
 			Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
 #endif
-			tfm = transform_transpose(tfm);
-			COPY_MATRIX44(&result, &tfm);
+			copy_matrix(result, tfm);
 
 			return true;
 		}
 		else if(sd->type == PRIMITIVE_LAMP) {
-			Transform tfm = transform_transpose(sd->ob_tfm);
-			COPY_MATRIX44(&result, &tfm);
+			copy_matrix(result, sd->ob_tfm);
 
 			return true;
 		}
@@ -201,14 +207,12 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 #else
 			Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
 #endif
-			itfm = transform_transpose(itfm);
-			COPY_MATRIX44(&result, &itfm);
+			copy_matrix(result, itfm);
 
 			return true;
 		}
 		else if(sd->type == PRIMITIVE_LAMP) {
-			Transform tfm = transform_transpose(sd->ob_itfm);
-			COPY_MATRIX44(&result, &tfm);
+			copy_matrix(result, sd->ob_itfm);
 
 			return true;
 		}
@@ -222,23 +226,19 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 	KernelGlobals *kg = kernel_globals;
 
 	if(from == u_ndc) {
-		Transform tfm = transform_transpose(transform_quick_inverse(kernel_data.cam.worldtondc));
-		COPY_MATRIX44(&result, &tfm);
+		copy_matrix(result, kernel_data.cam.ndctoworld);
 		return true;
 	}
 	else if(from == u_raster) {
-		Transform tfm = transform_transpose(kernel_data.cam.rastertoworld);
-		COPY_MATRIX44(&result, &tfm);
+		copy_matrix(result, kernel_data.cam.rastertoworld);
 		return true;
 	}
 	else if(from == u_screen) {
-		Transform tfm = transform_transpose(kernel_data.cam.screentoworld);
-		COPY_MATRIX44(&result, &tfm);
+		copy_matrix(result, kernel_data.cam.screentoworld);
 		return true;
 	}
 	else if(from == u_camera) {
-		Transform tfm = transform_transpose(kernel_data.cam.cameratoworld);
-		COPY_MATRIX44(&result, &tfm);
+		copy_matrix(result, kernel_data.cam.cameratoworld);
 		return true;
 	}
 	else if(from == u_world) {
@@ -254,23 +254,19 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 	KernelGlobals *kg = kernel_globals;
 
 	if(to == u_ndc) {
-		Transform tfm = transform_transpose(kernel_data.cam.worldtondc);
-		COPY_MATRIX44(&result, &tfm);
+		copy_matrix(result, kernel_data.cam.worldtondc);
 		return true;
 	}
 	else if(to == u_raster) {
-		Transform tfm = transform_transpose(kernel_data.cam.worldtoraster);
-		COPY_MATRIX44(&result, &tfm);
+		copy_matrix(result, kernel_data.cam.worldtoraster);
 		return true;
 	}
 	else if(to == u_screen) {
-		Transform tfm = transform_transpose(kernel_data.cam.worldtoscreen);
-		COPY_MATRIX44(&result, &tfm);
+		copy_matrix(result, kernel_data.cam.worldtoscreen);
 		return true;
 	}
 	else if(to == u_camera) {
-		Transform tfm = transform_transpose(kernel_data.cam.worldtocamera);
-		COPY_MATRIX44(&result, &tfm);
+		copy_matrix(result, kernel_data.cam.worldtocamera);
 		return true;
 	}
 	else if(to == u_world) {
@@ -296,14 +292,12 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 			KernelGlobals *kg = sd->osl_globals;
 			Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
 #endif
-			tfm = transform_transpose(tfm);
-			COPY_MATRIX44(&result, &tfm);
+			copy_matrix(result, tfm);
 
 			return true;
 		}
 		else if(sd->type == PRIMITIVE_LAMP) {
-			Transform tfm = transform_transpose(sd->ob_tfm);
-			COPY_MATRIX44(&result, &tfm);
+			copy_matrix(result, sd->ob_tfm);
 
 			return true;
 		}
@@ -327,14 +321,12 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 			KernelGlobals *kg = sd->osl_globals;
 			Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
 #endif
-			tfm = transform_transpose(tfm);
-			COPY_MATRIX44(&result, &tfm);
+			copy_matrix(result, tfm);
 
 			return true;
 		}
 		else if(sd->type == PRIMITIVE_LAMP) {
-			Transform tfm = transform_transpose(sd->ob_itfm);
-			COPY_MATRIX44(&result, &tfm);
+			copy_matrix(result, sd->ob_itfm);
 
 			return true;
 		}
@@ -348,23 +340,19 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 	KernelGlobals *kg = kernel_globals;
 
 	if(from == u_ndc) {
-		Transform tfm = transform_transpose(transform_quick_inverse(kernel_data.cam.worldtondc));
-		COPY_MATRIX44(&result, &tfm);
+		copy_matrix(result, kernel_data.cam.ndctoworld);
 		return true;
 	}
 	else if(from == u_raster) {
-		Transform tfm = transform_transpose(kernel_data.cam.rastertoworld);
-		COPY_MATRIX44(&result, &tfm);
+		copy_matrix(result, kernel_data.cam.rastertoworld);
 		return true;
 	}
 	else if(from == u_screen) {
-		Transform tfm = transform_transpose(kernel_data.cam.screentoworld);
-		COPY_MATRIX44(&result, &tfm);
+		copy_matrix(result, kernel_data.cam.screentoworld);
 		return true;
 	}
 	else if(from == u_camera) {
-		Transform tfm = transform_transpose(kernel_data.cam.cameratoworld);
-		COPY_MATRIX44(&result, &tfm);
+		copy_matrix(result, kernel_data.cam.cameratoworld);
 		return true;
 	}
 
@@ -376,23 +364,19 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 	KernelGlobals *kg = kernel_globals;
 	
 	if(to == u_ndc) {
-		Transform tfm = transform_transpose(kernel_data.cam.worldtondc);
-		COPY_MATRIX44(&result, &tfm);
+		copy_matrix(result, kernel_data.cam.worldtondc);
 		return true;
 	}
 	else if(to == u_raster) {
-		Transform tfm = transform_transpose(kernel_data.cam.worldtoraster);
-		COPY_MATRIX44(&result, &tfm);
+		copy_matrix(result, kernel_data.cam.worldtoraster);
 		return true;
 	}
 	else if(to == u_screen) {
-		Transform tfm = transform_transpose(kernel_data.cam.worldtoscreen);
-		COPY_MATRIX44(&result, &tfm);
+		copy_matrix(result, kernel_data.cam.worldtoscreen);
 		return true;
 	}
 	else if(to == u_camera) {
-		Transform tfm = transform_transpose(kernel_data.cam.worldtocamera);
-		COPY_MATRIX44(&result, &tfm);
+		copy_matrix(result, kernel_data.cam.worldtocamera);
 		return true;
 	}
 	
@@ -568,8 +552,7 @@ static bool set_attribute_float3_3(float3 P[3], TypeDesc type, bool derivatives,
 static bool set_attribute_matrix(const Transform& tfm, TypeDesc type, void *val)
 {
 	if(type == TypeDesc::TypeMatrix) {
-		Transform transpose = transform_transpose(tfm);
-		memcpy(val, &transpose, sizeof(Transform));
+		copy_matrix(*(OSL::Matrix44*)val, tfm);
 		return true;
 	}
 
@@ -656,6 +639,12 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD
 		float f = particle_index(kg, particle_id);
 		return set_attribute_float(f, type, derivatives, val);
 	}
+	else if(name == u_particle_random) {
+		int particle_id = object_particle_id(kg, sd->object);
+		float f = hash_int_01(particle_index(kg, particle_id));
+		return set_attribute_float(f, type, derivatives, val);
+	}
+
 	else if(name == u_particle_age) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float f = particle_age(kg, particle_id);
@@ -699,11 +688,7 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD
 		return set_attribute_int(3, type, derivatives, val);
 	}
 	else if((name == u_geom_trianglevertices || name == u_geom_polyvertices)
-#ifdef __HAIR__
 		     && sd->type & PRIMITIVE_ALL_TRIANGLE)
-#else
-		)
-#endif
 	{
 		float3 P[3];
 
@@ -728,7 +713,6 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD
 		float f = ((sd->shader & SHADER_SMOOTH_NORMAL) != 0);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-#ifdef __HAIR__
 	/* Hair Attributes */
 	else if(name == u_is_curve) {
 		float f = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
@@ -742,7 +726,6 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD
 		float3 f = curve_tangent_normal(kg, sd);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
-#endif
 	else
 		return false;
 }
@@ -823,7 +806,7 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData *
 bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object_name,
                                       TypeDesc type, ustring name, void *val)
 {
-	if(sg->renderstate == NULL)
+	if(sg == NULL || sg->renderstate == NULL)
 		return false;
 
 	ShaderData *sd = (ShaderData *)(sg->renderstate);
@@ -957,20 +940,36 @@ bool OSLRenderServices::texture(ustring filename,
 		return true;
 	}
 #endif
-	bool status;
+	bool status = false;
 
 	if(filename.length() && filename[0] == '@') {
-		int slot = atoi(filename.c_str() + 1);
-		float4 rgba = kernel_tex_image_interp(slot, s, 1.0f - t);
-
-		result[0] = rgba[0];
-		if(nchannels > 1)
-			result[1] = rgba[1];
-		if(nchannels > 2)
-			result[2] = rgba[2];
-		if(nchannels > 3)
-			result[3] = rgba[3];
-		status = true;
+		if(filename == u_at_bevel) {
+			/* Bevel shader hack. */
+			if(nchannels >= 3) {
+				PathState *state = sd->osl_path_state;
+				int num_samples = (int)s;
+				float radius = t;
+				float3 N = svm_bevel(kg, sd, state, radius, num_samples);
+				result[0] = N.x;
+				result[1] = N.y;
+				result[2] = N.z;
+				status = true;
+			}
+		}
+		else {
+			/* Packed texture. */
+			int slot = atoi(filename.c_str() + 1);
+			float4 rgba = kernel_tex_image_interp(kg, slot, s, 1.0f - t);
+
+			result[0] = rgba[0];
+			if(nchannels > 1)
+				result[1] = rgba[1];
+			if(nchannels > 2)
+				result[2] = rgba[2];
+			if(nchannels > 3)
+				result[3] = rgba[3];
+			status = true;
+		}
 	}
 	else {
 		if(texture_handle != NULL) {
@@ -1042,7 +1041,7 @@ bool OSLRenderServices::texture3d(ustring filename,
 	bool status;
 	if(filename.length() && filename[0] == '@') {
 		int slot = atoi(filename.c_str() + 1);
-		float4 rgba = kernel_tex_image_interp_3d(slot, P.x, P.y, P.z);
+		float4 rgba = kernel_tex_image_interp_3d(kg, slot, P.x, P.y, P.z, INTERPOLATION_NONE);
 
 		result[0] = rgba[0];
 		if(nchannels > 1)
@@ -1196,8 +1195,9 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg,
 	tracedata->init = true;
 	tracedata->sd.osl_globals = sd->osl_globals;
 
-	/* raytrace */
-	return scene_intersect(sd->osl_globals, ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f);
+	/* Raytrace, leaving out shadow opaque to avoid early exit. */
+	uint visibility = PATH_RAY_ALL_VISIBILITY - PATH_RAY_SHADOW_OPAQUE;
+	return scene_intersect(sd->osl_globals, ray, visibility, &tracedata->isect, NULL, 0.0f, 0.0f);
 }
 
 
diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h
index ec34ca77115..d96048e26f2 100644
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -147,6 +147,7 @@ public:
 	static ustring u_material_index;
 	static ustring u_object_random;
 	static ustring u_particle_index;
+	static ustring u_particle_random;
 	static ustring u_particle_age;
 	static ustring u_particle_lifetime;
 	static ustring u_particle_location;
@@ -163,6 +164,7 @@ public:
 	static ustring u_is_curve;
 	static ustring u_curve_thickness;
 	static ustring u_curve_tangent_normal;
+	static ustring u_curve_random;
 	static ustring u_path_ray_length;
 	static ustring u_path_ray_depth;
 	static ustring u_path_diffuse_depth;
@@ -179,6 +181,7 @@ public:
 	static ustring u_u;
 	static ustring u_v;
 	static ustring u_empty;
+	static ustring u_at_bevel;
 
 private:
 	KernelGlobals *kernel_globals;
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 0d762bbdb38..6b3a996ca12 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -16,21 +16,22 @@
 
 #include <OSL/oslexec.h>
 
-#include "kernel_compat_cpu.h"
-#include "kernel_montecarlo.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_types.h"
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
 
-#include "geom/geom_object.h"
+#include "kernel/geom/geom_object.h"
 
-#include "osl_closures.h"
-#include "osl_globals.h"
-#include "osl_services.h"
-#include "osl_shader.h"
+#include "kernel/osl/osl_closures.h"
+#include "kernel/osl/osl_globals.h"
+#include "kernel/osl/osl_services.h"
+#include "kernel/osl/osl_shader.h"
 
-#include "util_foreach.h"
+#include "util/util_foreach.h"
 
-#include "attribute.h"
+#include "render/attribute.h"
 
 
 CCL_NAMESPACE_BEGIN
@@ -56,9 +57,7 @@ void OSLShader::thread_init(KernelGlobals *kg, KernelGlobals *kernel_globals, OS
 	tdata->globals.tracedata = &tdata->tracedata;
 	tdata->globals.flipHandedness = false;
 	tdata->osl_thread_info = ss->create_thread_info();
-
-	for(int i = 0; i < SHADER_CONTEXT_NUM; i++)
-		tdata->context[i] = ss->get_context(tdata->osl_thread_info);
+	tdata->context = ss->get_context(tdata->osl_thread_info);
 
 	tdata->oiio_thread_info = osl_globals->ts->get_perthread_info();
 
@@ -73,9 +72,7 @@ void OSLShader::thread_free(KernelGlobals *kg)
 
 	OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
 	OSLThreadData *tdata = kg->osl_tdata;
-
-	for(int i = 0; i < SHADER_CONTEXT_NUM; i++)
-		ss->release_context(tdata->context[i]);
+	ss->release_context(tdata->context);
 
 	ss->destroy_thread_info(tdata->osl_thread_info);
 
@@ -172,7 +169,7 @@ static void flatten_surface_closure_tree(ShaderData *sd,
 	}
 }
 
-void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx)
+void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
 {
 	/* setup shader globals from shader data */
 	OSLThreadData *tdata = kg->osl_tdata;
@@ -181,7 +178,7 @@ void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state
 	/* execute shader for this point */
 	OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
 	OSL::ShaderGlobals *globals = &tdata->globals;
-	OSL::ShadingContext *octx = tdata->context[(int)ctx];
+	OSL::ShadingContext *octx = tdata->context;
 	int shader = sd->shader & SHADER_MASK;
 
 	/* automatic bump shader */
@@ -273,7 +270,7 @@ static void flatten_background_closure_tree(ShaderData *sd,
 	}
 }
 
-void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx)
+void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
 {
 	/* setup shader globals from shader data */
 	OSLThreadData *tdata = kg->osl_tdata;
@@ -282,7 +279,7 @@ void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *st
 	/* execute shader for this point */
 	OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
 	OSL::ShaderGlobals *globals = &tdata->globals;
-	OSL::ShadingContext *octx = tdata->context[(int)ctx];
+	OSL::ShadingContext *octx = tdata->context;
 
 	if(kg->osl->background_state) {
 		ss->execute(octx, *(kg->osl->background_state), *globals);
@@ -328,7 +325,7 @@ static void flatten_volume_closure_tree(ShaderData *sd,
 	}
 }
 
-void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx)
+void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
 {
 	/* setup shader globals from shader data */
 	OSLThreadData *tdata = kg->osl_tdata;
@@ -337,7 +334,7 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state,
 	/* execute shader */
 	OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
 	OSL::ShaderGlobals *globals = &tdata->globals;
-	OSL::ShadingContext *octx = tdata->context[(int)ctx];
+	OSL::ShadingContext *octx = tdata->context;
 	int shader = sd->shader & SHADER_MASK;
 
 	if(kg->osl->volume_state[shader]) {
@@ -351,19 +348,17 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state,
 
 /* Displacement */
 
-void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx)
+void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state)
 {
 	/* setup shader globals from shader data */
 	OSLThreadData *tdata = kg->osl_tdata;
 
-	PathState state = {0};
-
-	shaderdata_to_shaderglobals(kg, sd, &state, 0, tdata);
+	shaderdata_to_shaderglobals(kg, sd, state, 0, tdata);
 
 	/* execute shader */
 	OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
 	OSL::ShaderGlobals *globals = &tdata->globals;
-	OSL::ShadingContext *octx = tdata->context[(int)ctx];
+	OSL::ShadingContext *octx = tdata->context;
 	int shader = sd->shader & SHADER_MASK;
 
 	if(kg->osl->displacement_state[shader]) {
diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h
index ad06dd6929d..6b392b25cf7 100644
--- a/intern/cycles/kernel/osl/osl_shader.h
+++ b/intern/cycles/kernel/osl/osl_shader.h
@@ -29,7 +29,7 @@
  * This means no thread state must be passed along in the kernel itself.
  */
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -53,10 +53,10 @@ public:
 	static void thread_free(KernelGlobals *kg);
 
 	/* eval */
-	static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx);
-	static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx);
-	static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx);
-	static void eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx);
+	static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
+	static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
+	static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
+	static void eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state);
 
 	/* attributes */
 	static int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeDescriptor *desc);
diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt
index b43f8402d42..6ec651a96d8 100644
--- a/intern/cycles/kernel/shaders/CMakeLists.txt
+++ b/intern/cycles/kernel/shaders/CMakeLists.txt
@@ -7,6 +7,7 @@ set(SRC_OSL
 	node_anisotropic_bsdf.osl
 	node_attribute.osl
 	node_background.osl
+	node_bevel.osl
 	node_brick_texture.osl
 	node_brightness.osl
 	node_bump.osl
@@ -22,6 +23,8 @@ set(SRC_OSL
 	node_convert_from_point.osl
 	node_convert_from_vector.osl
 	node_diffuse_bsdf.osl
+	node_displacement.osl
+	node_vector_displacement.osl
 	node_emission.osl
 	node_environment_texture.osl
 	node_fresnel.osl
@@ -33,6 +36,7 @@ set(SRC_OSL
 	node_hair_info.osl
 	node_scatter_volume.osl
 	node_absorption_volume.osl
+	node_principled_volume.osl
 	node_holdout.osl
 	node_hsv.osl
 	node_image_texture.osl
@@ -81,13 +85,15 @@ set(SRC_OSL
 	node_wireframe.osl
 	node_hair_bsdf.osl
 	node_uv_map.osl
+	node_principled_bsdf.osl
 	node_rgb_to_bw.osl
 )
 
 set(SRC_OSL_HEADERS
-	node_texture.h
 	node_color.h
 	node_fresnel.h
+	node_ramp_util.h
+	node_texture.h
 	stdosl.h
 	oslutil.h
 )
@@ -99,6 +105,7 @@ set(SRC_OSO
 # TODO, add a module to compile OSL
 foreach(_file ${SRC_OSL})
 	set(_OSL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${_file})
+	set_source_files_properties(${_file} PROPERTIES HEADER_FILE_ONLY TRUE)
 	string(REPLACE ".osl" ".oso" _OSO_FILE ${_OSL_FILE})
 	string(REPLACE ${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR} _OSO_FILE ${_OSO_FILE})
 	add_custom_command(
@@ -113,7 +120,8 @@ foreach(_file ${SRC_OSL})
 	unset(_OSO_FILE)
 endforeach()
 
-add_custom_target(cycles_osl_shaders ALL DEPENDS ${SRC_OSO} ${SRC_OSL_HEADERS} ${OSL_COMPILER})
+add_custom_target(cycles_osl_shaders ALL DEPENDS ${SRC_OSO} ${SRC_OSL_HEADERS} ${OSL_COMPILER} SOURCES ${SRC_OSL})
+cycles_set_solution_folder(cycles_osl_shaders)
 
 # CMAKE_CURRENT_SOURCE_DIR is already included in OSO paths
 delayed_install("" "${SRC_OSO}" ${CYCLES_INSTALL_PATH}/shader)
diff --git a/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl
index bef6d7e8809..21e28ece65d 100644
--- a/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl
@@ -33,27 +33,28 @@ shader node_anisotropic_bsdf(
 		T = rotate(T, Rotation * M_2PI, point(0.0, 0.0, 0.0), Normal);
 
 	/* compute roughness */
-	float RoughnessU, RoughnessV;
+	float roughness = Roughness * Roughness;
+	float roughness_u, roughness_v;
 	float aniso = clamp(Anisotropy, -0.99, 0.99);
 
 	if (aniso < 0.0) {
-		RoughnessU = Roughness / (1.0 + aniso);
-		RoughnessV = Roughness * (1.0 + aniso);
+		roughness_u = roughness / (1.0 + aniso);
+		roughness_v = roughness * (1.0 + aniso);
 	}
 	else {
-		RoughnessU = Roughness * (1.0 - aniso);
-		RoughnessV = Roughness / (1.0 - aniso);
+		roughness_u = roughness * (1.0 - aniso);
+		roughness_v = roughness / (1.0 - aniso);
 	}
 
 	if (distribution == "sharp")
 		BSDF = Color * reflection(Normal);
 	else if (distribution == "beckmann")
-		BSDF = Color * microfacet_beckmann_aniso(Normal, T, RoughnessU, RoughnessV);
+		BSDF = Color * microfacet_beckmann_aniso(Normal, T, roughness_u, roughness_v);
 	else if (distribution == "GGX")
-		BSDF = Color * microfacet_ggx_aniso(Normal, T, RoughnessU, RoughnessV);
+		BSDF = Color * microfacet_ggx_aniso(Normal, T, roughness_u, roughness_v);
 	else if (distribution == "Multiscatter GGX")
-		BSDF = Color * microfacet_multi_ggx_aniso(Normal, T, RoughnessU, RoughnessV, Color);
+		BSDF = Color * microfacet_multi_ggx_aniso(Normal, T, roughness_u, roughness_v, Color);
 	else
-		BSDF = Color * ashikhmin_shirley(Normal, T, RoughnessU, RoughnessV);
+		BSDF = Color * ashikhmin_shirley(Normal, T, roughness_u, roughness_v);
 }
 
diff --git a/intern/cycles/kernel/shaders/node_bevel.osl b/intern/cycles/kernel/shaders/node_bevel.osl
new file mode 100644
index 00000000000..9c4ca15be17
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_bevel.osl
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdosl.h"
+/* Bevel node: fetches a beveled normal through the texture call and applies its offset to NormalIn. */
+shader node_bevel(
+	int samples = 4,
+	float Radius = 0.05,
+	normal NormalIn = N,
+	output normal NormalOut = N)
+{
+	/* Abuse texture call with special @bevel token: samples and Radius are passed in place of the s and t texture coordinates. */
+	vector bevel_N = (normal)(color)texture("@bevel", samples, Radius);
+
+	/* Preserve input normal: offset NormalIn by how far the bevel result deviates from the global N, then renormalize. */
+	NormalOut = normalize(NormalIn + (bevel_N - N));
+}
+
diff --git a/intern/cycles/kernel/shaders/node_displacement.osl b/intern/cycles/kernel/shaders/node_displacement.osl
new file mode 100644
index 00000000000..89f35841527
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_displacement.osl
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdosl.h"
+/* Displacement node: outputs an offset along Normal of signed length (Height - Midlevel) * Scale, measured in the selected space. */
+shader node_displacement(
+	string space = "object",
+	float Height = 0.0,
+	float Midlevel = 0.5,
+	float Scale = 1.0,
+	normal Normal = N,
+	output vector Displacement = vector(0.0, 0.0, 0.0))
+{
+	Displacement = Normal;
+	if(space == "object") {
+		Displacement = transform("object", Displacement);
+	}
+	/* Normalize the direction in the chosen space before scaling, so for "object" the length applies in object space. */
+	Displacement = normalize(Displacement) * (Height - Midlevel) * Scale;
+	/* An object space result is transformed back to world space for output. */
+	if(space == "object") {
+		Displacement = transform("object", "world", Displacement);
+	}
+}
+
diff --git a/intern/cycles/kernel/shaders/node_glass_bsdf.osl b/intern/cycles/kernel/shaders/node_glass_bsdf.osl
index a9723a8300a..2e713861c58 100644
--- a/intern/cycles/kernel/shaders/node_glass_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_glass_bsdf.osl
@@ -29,16 +29,17 @@ shader node_glass_bsdf(
 	float eta = backfacing() ? 1.0 / f : f;
 	float cosi = dot(I, Normal);
 	float Fr = fresnel_dielectric_cos(cosi, eta);
+	float roughness = Roughness * Roughness;
 
 	if (distribution == "sharp")
 		BSDF = Color * (Fr * reflection(Normal) + (1.0 - Fr) * refraction(Normal, eta));
 	else if (distribution == "beckmann")
-		BSDF = Color * (Fr * microfacet_beckmann(Normal, Roughness) +
-		                (1.0 - Fr) * microfacet_beckmann_refraction(Normal, Roughness, eta));
+		BSDF = Color * (Fr * microfacet_beckmann(Normal, roughness) +
+		                (1.0 - Fr) * microfacet_beckmann_refraction(Normal, roughness, eta));
 	else if (distribution == "Multiscatter GGX")
-		BSDF = Color * microfacet_multi_ggx_glass(Normal, Roughness, eta, Color);
+		BSDF = Color * microfacet_multi_ggx_glass(Normal, roughness, eta, Color);
 	else if (distribution == "GGX")
-		BSDF = Color * (Fr * microfacet_ggx(Normal, Roughness) +
-		                (1.0 - Fr) * microfacet_ggx_refraction(Normal, Roughness, eta));
+		BSDF = Color * (Fr * microfacet_ggx(Normal, roughness) +
+		                (1.0 - Fr) * microfacet_ggx_refraction(Normal, roughness, eta));
 }
 
diff --git a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
index f4ea7e7dc6a..7415211b56d 100644
--- a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
@@ -24,16 +24,18 @@ shader node_glossy_bsdf(
 	normal Normal = N,
 	output closure color BSDF = 0)
 {
+	float roughness = Roughness * Roughness;
+
 	if (distribution == "sharp")
 		BSDF = Color * reflection(Normal);
 	else if (distribution == "beckmann")
-		BSDF = Color * microfacet_beckmann(Normal, Roughness);
+		BSDF = Color * microfacet_beckmann(Normal, roughness);
 	else if (distribution == "GGX")
-		BSDF = Color * microfacet_ggx(Normal, Roughness);
+		BSDF = Color * microfacet_ggx(Normal, roughness);
 	else if (distribution == "Multiscatter GGX")
-		BSDF = Color * microfacet_multi_ggx(Normal, Roughness, Color);
+		BSDF = Color * microfacet_multi_ggx(Normal, roughness, Color);
 	else
-		BSDF = Color * ashikhmin_shirley(Normal, vector(0, 0, 0), Roughness, Roughness);
+		BSDF = Color * ashikhmin_shirley(Normal, vector(0, 0, 0), roughness, roughness);
 
 }
 
diff --git a/intern/cycles/kernel/shaders/node_hair_info.osl b/intern/cycles/kernel/shaders/node_hair_info.osl
index 965d2a3c7f7..19216f67579 100644
--- a/intern/cycles/kernel/shaders/node_hair_info.osl
+++ b/intern/cycles/kernel/shaders/node_hair_info.osl
@@ -20,11 +20,13 @@ shader node_hair_info(
 	output float IsStrand = 0.0,
 	output float Intercept = 0.0,
 	output float Thickness = 0.0,
-	output normal TangentNormal = N)
+	output normal TangentNormal = N,
+	output float Random = 0)
 {
 	getattribute("geom:is_curve", IsStrand);
 	getattribute("geom:curve_intercept", Intercept);
 	getattribute("geom:curve_thickness", Thickness);
 	getattribute("geom:curve_tangent_normal", TangentNormal);
+	getattribute("geom:curve_random", Random);
 }
 
diff --git a/intern/cycles/kernel/shaders/node_output_displacement.osl b/intern/cycles/kernel/shaders/node_output_displacement.osl
index 294b8dd6bf2..5dbef0244fe 100644
--- a/intern/cycles/kernel/shaders/node_output_displacement.osl
+++ b/intern/cycles/kernel/shaders/node_output_displacement.osl
@@ -16,10 +16,8 @@
 
 #include "stdosl.h"
 
-displacement node_output_displacement(float Displacement = 0.0)
+displacement node_output_displacement(vector Displacement = 0.0)
 {
-	vector dP = normalize(transform("object", N));
-	dP *= Displacement * 0.1; /* todo: get rid of this factor */
-	P += transform("object", "world", dP);
+	P += Displacement;
 }
 
diff --git a/intern/cycles/kernel/shaders/node_particle_info.osl b/intern/cycles/kernel/shaders/node_particle_info.osl
index 768b7753d02..2a0252d5e45 100644
--- a/intern/cycles/kernel/shaders/node_particle_info.osl
+++ b/intern/cycles/kernel/shaders/node_particle_info.osl
@@ -18,6 +18,7 @@
 
 shader node_particle_info(
     output float Index = 0.0,
+    output float Random = 0.0,
     output float Age = 0.0,
     output float Lifetime = 0.0,
     output point Location = point(0.0, 0.0, 0.0),
@@ -26,6 +27,7 @@ shader node_particle_info(
     output vector AngularVelocity = point(0.0, 0.0, 0.0))
 {
 	getattribute("particle:index", Index);
+	getattribute("particle:random", Random);
 	getattribute("particle:age", Age);
 	getattribute("particle:lifetime", Lifetime);
 	getattribute("particle:location", Location);
diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
new file mode 100644
index 00000000000..6f54ba3a462
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdosl.h"
+#include "node_fresnel.h"
+/* Principled BSDF: sums diffuse or subsurface, sheen, specular, transmission and clearcoat closures into BSDF. */
+shader node_principled_bsdf(
+	string distribution = "Multiscatter GGX",
+	string subsurface_method = "burley",
+	color BaseColor = color(0.8, 0.8, 0.8),
+	float Subsurface = 0.0,
+	vector SubsurfaceRadius = vector(1.0, 1.0, 1.0),
+	color SubsurfaceColor = color(0.7, 0.1, 0.1),
+	float Metallic = 0.0,
+	float Specular = 0.5,
+	float SpecularTint = 0.0,
+	float Roughness = 0.5,
+	float Anisotropic = 0.0,
+	float AnisotropicRotation = 0.0,
+	float Sheen = 0.0,
+	float SheenTint = 0.5,
+	float Clearcoat = 0.0,
+	float ClearcoatRoughness = 0.03,
+	float IOR = 1.45,
+	float Transmission = 0.0,
+	float TransmissionRoughness = 0.0,
+	normal Normal = N,
+	normal ClearcoatNormal = N,
+	normal Tangent = normalize(dPdu),
+	output closure color BSDF = 0)
+{
+	float f = max(IOR, 1e-5);
+	float diffuse_weight = (1.0 - clamp(Metallic, 0.0, 1.0)) * (1.0 - clamp(Transmission, 0.0, 1.0));
+	float final_transmission = clamp(Transmission, 0.0, 1.0) * (1.0 - clamp(Metallic, 0.0, 1.0));
+	float specular_weight = (1.0 - final_transmission);
+
+	vector T = Tangent;
+
+	float m_cdlum = luminance(BaseColor);
+	color m_ctint = m_cdlum > 0.0 ? BaseColor / m_cdlum : color(0.0, 0.0, 0.0); // normalize lum. to isolate hue+sat
+
+	/* rotate tangent */
+	if (AnisotropicRotation != 0.0)
+		T = rotate(T, AnisotropicRotation * M_2PI, point(0.0, 0.0, 0.0), Normal);
+	/* diffuse or subsurface base plus optional sheen, scaled by diffuse_weight */
+	if (diffuse_weight > 1e-5) {
+		if (Subsurface > 1e-5) {
+			color mixed_ss_base_color = SubsurfaceColor * Subsurface + BaseColor * (1.0 - Subsurface);
+			if (subsurface_method == "burley") {
+				BSDF = mixed_ss_base_color * bssrdf("principled", Normal, Subsurface * SubsurfaceRadius, SubsurfaceColor, "roughness", Roughness);
+			}
+			else {
+				BSDF = mixed_ss_base_color * bssrdf("principled_random_walk", Normal, Subsurface * SubsurfaceRadius, mixed_ss_base_color, "roughness", Roughness);
+			}
+		}
+		else {
+			BSDF = BaseColor * principled_diffuse(Normal, Roughness);
+		}
+
+		if (Sheen > 1e-5) {
+			color sheen_color = color(1.0, 1.0, 1.0) * (1.0 - SheenTint) + m_ctint * SheenTint;
+
+			BSDF = BSDF + sheen_color * Sheen * principled_sheen(Normal);
+		}
+
+		BSDF = BSDF * diffuse_weight;
+	}
+	/* anisotropic specular reflection, scaled by specular_weight */
+	if (specular_weight > 1e-5) {
+		float aspect = sqrt(1.0 - Anisotropic * 0.9);
+		float r2 = Roughness * Roughness;
+
+		float alpha_x = r2 / aspect;
+		float alpha_y = r2 * aspect;
+
+		color tmp_col = color(1.0, 1.0, 1.0) * (1.0 - SpecularTint) + m_ctint * SpecularTint;
+
+		color Cspec0 = (Specular * 0.08 * tmp_col) * (1.0 - Metallic) + BaseColor * Metallic;
+		/* eta argument is the IOR whose normal-incidence reflectance is 0.08 * Specular; roughness at or below 0.075 always takes the plain GGX closure */
+		if (distribution == "GGX" || Roughness <= 0.075) {
+			BSDF = BSDF  + specular_weight * microfacet_ggx_aniso_fresnel(Normal, T, alpha_x, alpha_y, (2.0 / (1.0 - sqrt(0.08 * Specular))) - 1.0, BaseColor, Cspec0);
+		} else {
+			BSDF = BSDF + specular_weight * microfacet_multi_ggx_aniso_fresnel(Normal, T, alpha_x, alpha_y, (2.0 / (1.0 - sqrt(0.08 * Specular))) - 1.0, BaseColor, Cspec0);
+		}
+	}
+	/* transmission, scaled by final_transmission; roughness at or below 5e-2 always takes the plain GGX reflection + refraction path */
+	if (final_transmission > 1e-5) {
+		color Cspec0 = BaseColor * SpecularTint + color(1.0, 1.0, 1.0) * (1.0 - SpecularTint);
+		float eta = backfacing() ? 1.0 / f : f;
+
+		if (distribution == "GGX" || Roughness <= 5e-2) {
+			float cosNO = dot(Normal, I);
+			float Fr = fresnel_dielectric_cos(cosNO, eta);
+
+			float refl_roughness = Roughness;
+			if (Roughness <= 1e-2)
+				refl_roughness = 0.0;
+
+			float transmission_roughness = refl_roughness;
+			if (distribution == "GGX")
+				transmission_roughness = 1.0 - (1.0 - refl_roughness) * (1.0 - TransmissionRoughness);
+
+			BSDF = BSDF + final_transmission * (Fr * microfacet_ggx_fresnel(Normal, refl_roughness * refl_roughness, eta, BaseColor, Cspec0) +
+			       (1.0 - Fr) * BaseColor * microfacet_ggx_refraction(Normal, transmission_roughness * transmission_roughness, eta));
+		} else {
+			BSDF = BSDF + final_transmission * microfacet_multi_ggx_glass_fresnel(Normal, Roughness * Roughness, eta, BaseColor, Cspec0);
+		}
+	}
+	/* clearcoat, added without any of the weights above, using its own normal and squared roughness */
+	if (Clearcoat > 1e-5) {
+		BSDF = BSDF + principled_clearcoat(ClearcoatNormal, Clearcoat, ClearcoatRoughness * ClearcoatRoughness);
+	}
+}
+
diff --git a/intern/cycles/kernel/shaders/node_principled_volume.osl b/intern/cycles/kernel/shaders/node_principled_volume.osl
new file mode 100644
index 00000000000..ea8d6ab12c5
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_principled_volume.osl
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdosl.h"
+/* Principled volume: combines scattering, absorption, emission and blackbody emission closures into Volume. */
+shader node_principled_volume(
+	color Color = color(0.5, 0.5, 0.5),
+	float Density = 1.0,
+	float Anisotropy = 0.0,
+	color AbsorptionColor = color(0.0, 0.0, 0.0),
+	float EmissionStrength = 0.0,
+	color EmissionColor = color(1.0, 1.0, 1.0),
+	float BlackbodyIntensity = 0.0,
+	color BlackbodyTint = color(1.0, 1.0, 1.0),
+	float Temperature = 1500.0,
+	string DensityAttribute = "geom:density",
+	string ColorAttribute = "geom:color",
+	string TemperatureAttribute = "geom:temperature",
+	output closure color Volume = 0)
+{
+	/* Compute density: the Density input times the density attribute, when that attribute is available. */
+	float primitive_density = 1.0;
+	float density = max(Density, 0.0);
+
+	if(density > 1e-5) {
+		if(getattribute(DensityAttribute, primitive_density)) {
+			density = max(density * primitive_density, 0.0);
+		}
+	}
+
+	if(density > 1e-5) {
+		/* Compute scattering color: Color times the color attribute, when that attribute is available. */
+		color scatter_color = Color;
+		color primitive_color;
+		if(getattribute(ColorAttribute, primitive_color)) {
+			scatter_color *= primitive_color;
+		}
+
+		/* Add scattering and absorption closures; absorption is weighted by (1 - scatter_color) * (1 - sqrt(AbsorptionColor)). */
+		color scatter_coeff = scatter_color;
+		color absorption_color = sqrt(max(AbsorptionColor, 0.0));
+		color absorption_coeff = max(1.0 - scatter_color, 0.0) * max(1.0 - absorption_color, 0.0);
+		Volume = scatter_coeff * density * henyey_greenstein(Anisotropy) +
+		         absorption_coeff * density * absorption();
+	}
+
+	/* Compute emission; this part is not gated by the density test above. */
+	float emission_strength = max(EmissionStrength, 0.0);
+	float blackbody_intensity = BlackbodyIntensity;
+
+	if(emission_strength > 1e-5) {
+		Volume += emission_strength * EmissionColor * emission();
+	}
+
+	if(blackbody_intensity > 1e-3) {
+		float T = Temperature;
+
+		/* Scale temperature by the attribute if available. */
+		float temperature;
+		if(getattribute(TemperatureAttribute, temperature)) {
+			T *= max(temperature, 0.0);
+		}
+
+		T = max(T, 0.0);
+
+		/* Stefan-Boltzmann law. */
+		float T4 = (T * T) * (T * T);
+		float sigma = 5.670373e-8 * 1e-6 / M_PI;
+		float intensity = sigma * mix(1.0, T4, blackbody_intensity);
+		/* Divide the blackbody color by its luminance so brightness comes only from intensity and BlackbodyTint. */
+		if(intensity > 1e-5) {
+			color bb = blackbody(T);
+			float l = luminance(bb);
+
+			if(l != 0.0) {
+				bb *= BlackbodyTint * intensity / l;
+				Volume += bb * emission();
+			}
+		}
+	}
+}
+
diff --git a/intern/cycles/kernel/shaders/node_refraction_bsdf.osl b/intern/cycles/kernel/shaders/node_refraction_bsdf.osl
index 828becf1818..eaab7282243 100644
--- a/intern/cycles/kernel/shaders/node_refraction_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_refraction_bsdf.osl
@@ -26,12 +26,13 @@ shader node_refraction_bsdf(
 {
 	float f = max(IOR, 1e-5);
 	float eta = backfacing() ? 1.0 / f : f;
+	float roughness = Roughness * Roughness;
 
 	if (distribution == "sharp")
 		BSDF = Color * refraction(Normal, eta);
 	else if (distribution == "beckmann")
-		BSDF = Color * microfacet_beckmann_refraction(Normal, Roughness, eta);
+		BSDF = Color * microfacet_beckmann_refraction(Normal, roughness, eta);
 	else if (distribution == "GGX")
-		BSDF = Color * microfacet_ggx_refraction(Normal, Roughness, eta);
+		BSDF = Color * microfacet_ggx_refraction(Normal, roughness, eta);
 }
 
diff --git a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
index 5ba8f34021d..0df3256e1fd 100644
--- a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
+++ b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
@@ -27,10 +27,12 @@ shader node_subsurface_scattering(
 	output closure color BSSRDF = 0)
 {
 	if (falloff == "gaussian")
-		BSSRDF = Color * bssrdf_gaussian(Normal, Scale * Radius, TextureBlur);
+		BSSRDF = Color * bssrdf("gaussian", Normal, Scale * Radius, Color, "texture_blur", TextureBlur);
 	else if (falloff == "cubic")
-		BSSRDF = Color * bssrdf_cubic(Normal, Scale * Radius, TextureBlur, Sharpness);
+		BSSRDF = Color * bssrdf("cubic", Normal, Scale * Radius, Color, "texture_blur", TextureBlur, "sharpness", Sharpness);
+	else if (falloff == "burley")
+		BSSRDF = Color * bssrdf("burley", Normal, Scale * Radius, Color, "texture_blur", TextureBlur);
 	else
-		BSSRDF = Color * bssrdf_burley(Normal, Scale * Radius, TextureBlur, Color);
+		BSSRDF = Color * bssrdf("random_walk", Normal, Scale * Radius, Color, "texture_blur", TextureBlur);
 }
 
diff --git a/intern/cycles/kernel/shaders/node_vector_displacement.osl b/intern/cycles/kernel/shaders/node_vector_displacement.osl
new file mode 100644
index 00000000000..b19bc228e37
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_vector_displacement.osl
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdosl.h"
+/* Vector displacement node: turns a color-encoded offset given in tangent, object or world space into a displacement vector. */
+shader node_vector_displacement(
+	color Vector = color(0.0, 0.0, 0.0),
+	float Midlevel = 0.0,
+	float Scale = 1.0,
+	string space = "tangent",
+	string attr_name = "geom:tangent",
+	string attr_sign_name = "geom:tangent_sign",
+	output vector Displacement = vector(0.0, 0.0, 0.0))
+{
+	vector offset = (Vector - vector(Midlevel)) * Scale;
+
+	if(space == "tangent") {
+		/* Tangent space: build a tangent, normal, bitangent basis. */
+		vector N_object = normalize(transform("world", "object", N));
+		/* Tangent comes from the named attribute, falling back to dPdu when the lookup fails. */
+		vector T_object;
+		if(getattribute(attr_name, T_object)) {
+			T_object = normalize(T_object);
+		}
+		else {
+			T_object = normalize(dPdu);
+		}
+		/* Bitangent is the normalized cross product, multiplied by the sign attribute when present. */
+		vector B_object = normalize(cross(N_object, T_object));
+		float tangent_sign;
+		if(getattribute(attr_sign_name, tangent_sign)) {
+			B_object *= tangent_sign;
+		}
+		/* Offset components map to tangent, normal and bitangent, in that order. */
+		Displacement = T_object*offset[0] + N_object*offset[1] + B_object*offset[2];
+	}
+	else {
+		/* Object or world space. */
+		Displacement = offset;
+	}
+
+	if(space != "world") {
+		/* Tangent or object space: transform the result from object to world space. */
+		Displacement = transform("object", "world", Displacement);
+	}
+}
+
diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h
index a8dda8a12c9..091ade4a60d 100644
--- a/intern/cycles/kernel/shaders/stdosl.h
+++ b/intern/cycles/kernel/shaders/stdosl.h
@@ -530,6 +530,11 @@ closure color microfacet_ggx_refraction(normal N, float ag, float eta) BUILTIN;
 closure color microfacet_multi_ggx(normal N, float ag, color C) BUILTIN;
 closure color microfacet_multi_ggx_aniso(normal N, vector T, float ax, float ay, color C) BUILTIN;
 closure color microfacet_multi_ggx_glass(normal N, float ag, float eta, color C) BUILTIN;
+closure color microfacet_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_ggx_aniso_fresnel(normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_multi_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_multi_ggx_aniso_fresnel(normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_multi_ggx_glass_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
 closure color microfacet_beckmann(normal N, float ab) BUILTIN;
 closure color microfacet_beckmann_aniso(normal N, vector T, float ax, float ay) BUILTIN;
 closure color microfacet_beckmann_refraction(normal N, float ab, float eta) BUILTIN;
@@ -539,11 +544,12 @@ closure color emission() BUILTIN;
 closure color background() BUILTIN;
 closure color holdout() BUILTIN;
 closure color ambient_occlusion() BUILTIN;
+closure color principled_diffuse(normal N, float roughness) BUILTIN;
+closure color principled_sheen(normal N) BUILTIN;
+closure color principled_clearcoat(normal N, float clearcoat, float clearcoat_roughness) BUILTIN;
 
 // BSSRDF
-closure color bssrdf_cubic(normal N, vector radius, float texture_blur, float sharpness) BUILTIN;
-closure color bssrdf_gaussian(normal N, vector radius, float texture_blur) BUILTIN;
-closure color bssrdf_burley(normal N, vector radius, float texture_blur, color albedo) BUILTIN;
+closure color bssrdf(string method, normal N, vector radius, color albedo) BUILTIN;
 
 // Hair
 closure color hair_reflection(normal N, float roughnessu, float roughnessv, vector T, float offset) BUILTIN;
diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h
deleted file mode 100644
index 9bfa71c75ef..00000000000
--- a/intern/cycles/kernel/split/kernel_background_buffer_update.h
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_split_common.h"
-
-/* Note on kernel_background_buffer_update kernel.
- * This is the fourth kernel in the ray tracing logic, and the third
- * of the path iteration kernels. This kernel takes care of rays that hit
- * the background (sceneintersect kernel), and for the rays of
- * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in
- * the output buffer. This kernel also takes care of rays that have been determined
- * to-be-regenerated.
- *
- * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel
- *
- * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
- * will be eventually set to RAY_TO_REGENERATE state in this kernel. Finally all rays of ray_state
- * RAY_TO_REGENERATE will be regenerated and put in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * The input and output are as follows,
- *
- * rng_coop ---------------------------------------------|--- kernel_background_buffer_update --|--- PathRadiance_coop
- * throughput_coop --------------------------------------|                                      |--- L_transparent_coop
- * per_sample_output_buffers ----------------------------|                                      |--- per_sample_output_buffers
- * Ray_coop ---------------------------------------------|                                      |--- ray_state
- * PathState_coop ---------------------------------------|                                      |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * L_transparent_coop -----------------------------------|                                      |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * ray_state --------------------------------------------|                                      |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----|                                      |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                                      |--- work_array
- * parallel_samples -------------------------------------|                                      |--- PathState_coop
- * end_sample -------------------------------------------|                                      |--- throughput_coop
- * kg (globals) -----------------------------------------|                                      |--- rng_coop
- * rng_state --------------------------------------------|                                      |--- Ray
- * PathRadiance_coop ------------------------------------|                                      |
- * sw ---------------------------------------------------|                                      |
- * sh ---------------------------------------------------|                                      |
- * sx ---------------------------------------------------|                                      |
- * sy ---------------------------------------------------|                                      |
- * stride -----------------------------------------------|                                      |
- * work_array -------------------------------------------|                                      |--- work_array
- * queuesize --------------------------------------------|                                      |
- * start_sample -----------------------------------------|                                      |--- work_pool_wgs
- * work_pool_wgs ----------------------------------------|                                      |
- * num_samples ------------------------------------------|                                      |
- *
- * note on sd : sd argument is neither an input nor an output for this kernel. It is just filled and consumed here itself.
- * Note on Queues :
- * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
- *
- * State of queues when this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty
- */
-ccl_device char kernel_background_buffer_update(
-        KernelGlobals *kg,
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_state,
-        ccl_global uint *rng_coop,             /* Required for buffer Update */
-        ccl_global float3 *throughput_coop,    /* Required for background hit processing */
-        PathRadiance *PathRadiance_coop,       /* Required for background hit processing and buffer Update */
-        ccl_global Ray *Ray_coop,              /* Required for background hit processing */
-        ccl_global PathState *PathState_coop,  /* Required for background hit processing */
-        ccl_global float *L_transparent_coop,  /* Required for background hit processing and buffer Update */
-        ccl_global char *ray_state,            /* Stores information on the current state of a ray */
-        int sw, int sh, int sx, int sy, int stride,
-        int rng_state_offset_x,
-        int rng_state_offset_y,
-        int rng_state_stride,
-        ccl_global unsigned int *work_array,   /* Denotes work of each ray */
-        int end_sample,
-        int start_sample,
-#ifdef __WORK_STEALING__
-        ccl_global unsigned int *work_pool_wgs,
-        unsigned int num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples,                  /* Number of samples to be processed in parallel */
-        int ray_index)
-{
-	char enqueue_flag = 0;
-#ifdef __KERNEL_DEBUG__
-	DebugData *debug_data = &debugdata_coop[ray_index];
-#endif
-	ccl_global PathState *state = &PathState_coop[ray_index];
-	PathRadiance *L = L = &PathRadiance_coop[ray_index];
-	ccl_global Ray *ray = &Ray_coop[ray_index];
-	ccl_global float3 *throughput = &throughput_coop[ray_index];
-	ccl_global float *L_transparent = &L_transparent_coop[ray_index];
-	ccl_global uint *rng = &rng_coop[ray_index];
-
-#ifdef __WORK_STEALING__
-	unsigned int my_work;
-	ccl_global float *initial_per_sample_output_buffers;
-	ccl_global uint *initial_rng;
-#endif
-	unsigned int sample;
-	unsigned int tile_x;
-	unsigned int tile_y;
-	unsigned int pixel_x;
-	unsigned int pixel_y;
-	unsigned int my_sample_tile;
-
-#ifdef __WORK_STEALING__
-	my_work = work_array[ray_index];
-	sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-	get_pixel_tile_position(&pixel_x, &pixel_y,
-	                        &tile_x, &tile_y,
-	                        my_work,
-	                        sw, sh, sx, sy,
-	                        parallel_samples,
-	                        ray_index);
-	my_sample_tile = 0;
-	initial_per_sample_output_buffers = per_sample_output_buffers;
-	initial_rng = rng_state;
-#else  /* __WORK_STEALING__ */
-	sample = work_array[ray_index];
-	int tile_index = ray_index / parallel_samples;
-	/* buffer and rng_state's stride is "stride". Find x and y using ray_index */
-	tile_x = tile_index % sw;
-	tile_y = tile_index / sw;
-	my_sample_tile = ray_index - (tile_index * parallel_samples);
-#endif  /* __WORK_STEALING__ */
-
-	rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
-	per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
-
-	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-		/* eval background shader if nothing hit */
-		if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
-			*L_transparent = (*L_transparent) + average((*throughput));
-#ifdef __PASSES__
-			if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-		}
-
-		if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-#ifdef __BACKGROUND__
-			/* sample background shader */
-			float3 L_background = indirect_background(kg, kg->sd_input, state, ray);
-			path_radiance_accum_background(L, (*throughput), L_background, state->bounce);
-#endif
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-		}
-	}
-
-	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
-		float3 L_sum = path_radiance_clamp_and_sum(kg, L);
-		kernel_write_light_passes(kg, per_sample_output_buffers, L, sample);
-#ifdef __KERNEL_DEBUG__
-		kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample);
-#endif
-		float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
-
-		/* accumulate result in output buffer */
-		kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
-		path_rng_end(kg, rng_state, *rng);
-
-		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
-	}
-
-	if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
-#ifdef __WORK_STEALING__
-		/* We have completed current work; So get next work */
-		int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
-		if(!valid_work) {
-			/* If work is invalid, this means no more work is available and the thread may exit */
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
-		}
-#else  /* __WORK_STEALING__ */
-		if((sample + parallel_samples) >= end_sample) {
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
-		}
-#endif  /* __WORK_STEALING__ */
-
-		if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
-#ifdef __WORK_STEALING__
-			work_array[ray_index] = my_work;
-			/* Get the sample associated with the current work */
-			sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-			/* Get pixel and tile position associated with current work */
-			get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
-			my_sample_tile = 0;
-
-			/* Remap rng_state according to the current work */
-			rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride);
-			/* Remap per_sample_output_buffers according to the current work */
-			per_sample_output_buffers = initial_per_sample_output_buffers
-				+ (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
-#else  /* __WORK_STEALING__ */
-			work_array[ray_index] = sample + parallel_samples;
-			sample = work_array[ray_index];
-
-			/* Get ray position from ray index */
-			pixel_x = sx + ((ray_index / parallel_samples) % sw);
-			pixel_y = sy + ((ray_index / parallel_samples) / sw);
-#endif  /* __WORK_STEALING__ */
-
-			/* Initialize random numbers and ray. */
-			kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray);
-
-			if(ray->t != 0.0f) {
-				/* Initialize throughput, L_transparent, Ray, PathState;
-				 * These rays proceed with path-iteration.
-				 */
-				*throughput = make_float3(1.0f, 1.0f, 1.0f);
-				*L_transparent = 0.0f;
-				path_radiance_init(L, kernel_data.film.use_light_pass);
-				path_state_init(kg, kg->sd_input, state, rng, sample, ray);
-#ifdef __KERNEL_DEBUG__
-				debug_data_init(debug_data);
-#endif
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
-				enqueue_flag = 1;
-			}
-			else {
-				/* These rays do not participate in path-iteration. */
-				float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-				/* Accumulate result in output buffer. */
-				kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
-				path_rng_end(kg, rng_state, *rng);
-
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
-			}
-		}
-	}
-	return enqueue_flag;
-}
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
new file mode 100644
index 00000000000..368a4395760
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_branched.h
@@ -0,0 +1,234 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __BRANCHED_PATH__
+
+/* Sets up the state needed to do an indirect loop: snapshots the per-ray split state and rewinds the loop counters. */
+ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg, int ray_index)
+{
+	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+	/* save a copy of the state to restore later (restored by kernel_split_branched_path_indirect_loop_end) */
+#define BRANCHED_STORE(name) \
+		branched_state->name = kernel_split_state.name[ray_index];
+
+	BRANCHED_STORE(path_state);
+	BRANCHED_STORE(throughput);
+	BRANCHED_STORE(ray);
+	BRANCHED_STORE(isect);
+	BRANCHED_STORE(ray_state);
+	/* shader data is saved too: struct copy first, then each of its num_closure closures */
+	*kernel_split_sd(branched_state_sd, ray_index) = *kernel_split_sd(sd, ray_index);
+	for(int i = 0; i < kernel_split_sd(branched_state_sd, ray_index)->num_closure; i++) {
+		kernel_split_sd(branched_state_sd, ray_index)->closure[i] = kernel_split_sd(sd, ray_index)->closure[i];
+	}
+
+#undef BRANCHED_STORE
+
+	/* set loop counters to initial position */
+	branched_state->next_closure = 0;
+	branched_state->next_sample = 0;
+}
+
+/* Ends an indirect loop: restores the state saved by kernel_split_branched_path_indirect_loop_init and clears RAY_BRANCHED_INDIRECT. */
+ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg, int ray_index)
+{
+	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+	/* restore state */
+#define BRANCHED_RESTORE(name) \
+		kernel_split_state.name[ray_index] = branched_state->name;
+
+	BRANCHED_RESTORE(path_state);
+	BRANCHED_RESTORE(throughput);
+	BRANCHED_RESTORE(ray);
+	BRANCHED_RESTORE(isect);
+	BRANCHED_RESTORE(ray_state);
+	/* shader data is restored the same way: struct copy first, then each saved closure */
+	*kernel_split_sd(sd, ray_index) = *kernel_split_sd(branched_state_sd, ray_index);
+	for(int i = 0; i < kernel_split_sd(branched_state_sd, ray_index)->num_closure; i++) {
+		kernel_split_sd(sd, ray_index)->closure[i] = kernel_split_sd(branched_state_sd, ray_index)->closure[i];
+	}
+
+#undef BRANCHED_RESTORE
+
+	/* leave indirect loop (ray_state was restored above, so the flag is cleared on the restored value) */
+	REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT);
+}
+
+ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals *kg, int ray_index)
+{
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+	/* Try to hand the state prepared on ray_index to a ray taken from QUEUE_INACTIVE_RAYS; returns false if none is usable. */
+	int inactive_ray = dequeue_ray_index(QUEUE_INACTIVE_RAYS,
+		kernel_split_state.queue_data, kernel_split_params.queue_size, kernel_split_params.queue_index);
+
+	if(!IS_STATE(ray_state, inactive_ray, RAY_INACTIVE)) {
+		return false;  /* caller keeps the sample on ray_index */
+	}
+	/* copy every split-state entry listed in SPLIT_DATA_ENTRIES_BRANCHED_SHARED whose count is non-zero */
+#define SPLIT_DATA_ENTRY(type, name, num) \
+		if(num) { \
+			kernel_split_state.name[inactive_ray] = kernel_split_state.name[ray_index]; \
+		}
+	SPLIT_DATA_ENTRIES_BRANCHED_SHARED
+#undef SPLIT_DATA_ENTRY
+
+	*kernel_split_sd(sd, inactive_ray) = *kernel_split_sd(sd, ray_index);  /* shader data: struct copy, then closures */
+	for(int i = 0; i < kernel_split_sd(sd, ray_index)->num_closure; i++) {
+		kernel_split_sd(sd, inactive_ray)->closure[i] = kernel_split_sd(sd, ray_index)->closure[i];
+	}
+
+	kernel_split_state.branched_state[inactive_ray].shared_sample_count = 0;  /* the new ray has handed out no samples itself */
+	kernel_split_state.branched_state[inactive_ray].original_ray = ray_index;  /* remember which ray spawned it */
+	kernel_split_state.branched_state[inactive_ray].waiting_on_shared_samples = false;
+
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray];
+
+	path_radiance_init(inactive_L, kernel_data.film.use_light_pass);  /* start from a freshly initialised PathRadiance */
+	path_radiance_copy_indirect(inactive_L, L);  /* then take over what path_radiance_copy_indirect() transfers from the source ray */
+
+	ray_state[inactive_ray] = RAY_REGENERATED;
+	ADD_RAY_FLAG(ray_state, inactive_ray, RAY_BRANCHED_INDIRECT_SHARED);
+	ADD_RAY_FLAG(ray_state, inactive_ray, IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT));  /* propagate the source ray's RAY_BRANCHED_INDIRECT flag, if set */
+
+	atomic_fetch_and_inc_uint32((ccl_global uint*)&kernel_split_state.branched_state[ray_index].shared_sample_count);  /* source ray has one more sample running on another ray */
+
+	return true;
+}
+
+/* bounce off surface and integrate indirect light; resumes from branched_state->next_closure/next_sample */
+ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(KernelGlobals *kg,
+                                                                                int ray_index,
+                                                                                float num_samples_adjust,  /* scales the per-closure sample count and the per-sample weight */
+                                                                                ShaderData *saved_sd,  /* shader data whose closures are sampled */
+                                                                                bool reset_path_state,  /* restore the saved path state before every sample */
+                                                                                bool wait_for_shared)  /* return true while shared samples are outstanding */
+{
+	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+	/* Returns true when a bounce was set up on ray_index itself or shared samples are pending, false once every closure is done. */
+	ShaderData *sd = saved_sd;
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	float3 throughput = branched_state->throughput;  /* throughput saved in the branched state, reused for every sample */
+	ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
+
+	float sum_sample_weight = 0.0f;  /* stays 0 when __DENOISING_FEATURES__ is not defined */
+#ifdef __DENOISING_FEATURES__
+	if(ps->denoising_feature_weight > 0.0f) {
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
+
+			/* transparency is not handled here, but in outer loop */
+			if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+				continue;
+			}
+
+			sum_sample_weight += sc->sample_weight;
+		}
+	}
+	else {
+		sum_sample_weight = 1.0f;
+	}
+#endif  /* __DENOISING_FEATURES__ */
+
+	for(int i = branched_state->next_closure; i < sd->num_closure; i++) {
+		const ShaderClosure *sc = &sd->closure[i];
+
+		if(!CLOSURE_IS_BSDF(sc->type))
+			continue;
+		/* transparency is not handled here, but in outer loop */
+		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
+			continue;
+
+		int num_samples;  /* base sample count, picked by closure category */
+
+		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
+			num_samples = kernel_data.integrator.diffuse_samples;
+		else if(CLOSURE_IS_BSDF_BSSRDF(sc->type))
+			num_samples = 1;
+		else if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
+			num_samples = kernel_data.integrator.glossy_samples;
+		else
+			num_samples = kernel_data.integrator.transmission_samples;
+
+		num_samples = ceil_to_int(num_samples_adjust*num_samples);
+
+		float num_samples_inv = num_samples_adjust/num_samples;  /* weight applied to each sample's throughput */
+
+		for(int j = branched_state->next_sample; j < num_samples; j++) {
+			if(reset_path_state) {
+				*ps = branched_state->path_state;
+			}
+
+			ps->rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);  /* hash derived from the saved one and the closure index */
+
+			ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
+			*tp = throughput;
+
+			ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index];
+
+			if(!kernel_branched_path_surface_bounce(kg,
+			                                        sd,
+			                                        sc,
+			                                        j,
+			                                        num_samples,
+			                                        tp,
+			                                        ps,
+			                                        &L->state,
+			                                        bsdf_ray,
+			                                        sum_sample_weight))
+			{
+				continue;  /* no bounce for this sample; try the next one */
+			}
+
+			ps->rng_hash = branched_state->path_state.rng_hash;  /* back to the saved hash */
+
+			/* update state for next iteration */
+			branched_state->next_closure = i;
+			branched_state->next_sample = j+1;
+
+			/* start the indirect path */
+			*tp *= num_samples_inv;
+
+			if(kernel_split_branched_indirect_start_shared(kg, ray_index)) {
+				continue;  /* an inactive ray took over this sample; keep iterating on this ray */
+			}
+
+			return true;  /* this ray carries the sample itself */
+		}
+
+		branched_state->next_sample = 0;  /* the next closure starts from its first sample */
+	}
+
+	branched_state->next_closure = sd->num_closure;  /* later calls skip the closure loop entirely */
+
+	if(wait_for_shared) {
+		branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+		if(branched_state->waiting_on_shared_samples) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+#endif  /* __BRANCHED_PATH__ */
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
new file mode 100644
index 00000000000..180c0b57077
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* For rays of state RAY_UPDATE_BUFFER this kernel writes the ray's
+ * accumulated radiance to the output buffer. This kernel also takes care of
+ * rays that have been determined to-be-regenerated. Rays in state
+ * RAY_HIT_BACKGROUND are not evaluated here.
+ *
+ * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel.
+ *
+ * All rays that are in state RAY_UPDATE_BUFFER will be set to
+ * RAY_TO_REGENERATE state in this kernel. Rays of ray_state
+ * RAY_TO_REGENERATE for which new work is available will be regenerated and put
+ * in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS; without work they become RAY_INACTIVE.
+ *
+ * State of queues when this kernel is called:
+ * At entry,
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays.
+ * At exit,
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
+ *     RAY_REGENERATED rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ */
+ccl_device void kernel_buffer_update(KernelGlobals *kg,
+                                     ccl_local_param unsigned int *local_queue_atomics)  /* counter handed to enqueue_ray_index_local() below */
+{
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;  /* only the thread with local id (0, 0) resets the counter */
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);  /* flattened 2D global id */
+	if(ray_index == 0) {
+		/* We will empty this queue in this kernel. */
+		kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+	}
+	char enqueue_flag = 0;  /* set to 1 only when this thread regenerates a ray */
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          1);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {  /* non-GPU: skip the work below but still reach the enqueue call */
+#endif
+
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+
+	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+		uint sample = state->sample;
+		uint buffer_offset = kernel_split_state.buffer_offset[ray_index];  /* offset saved in the split state for this ray */
+		ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+
+		/* accumulate result in output buffer */
+		kernel_write_result(kg, buffer, sample, L);
+
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);  /* falls through to the regeneration step below */
+	}
+
+	if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+		/* We have completed current work; So get next work */
+		ccl_global uint *work_pools = kernel_split_params.work_pools;
+		uint total_work_size = kernel_split_params.total_work_size;
+		uint work_index;  /* written by get_next_work() when it succeeds */
+
+		if(!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
+			/* If work is invalid, this means no more work is available and the thread may exit */
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+		}
+
+		if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+			ccl_global WorkTile *tile = &kernel_split_params.tile;
+			uint x, y, sample;
+			get_work_pixel(tile, work_index, &x, &y, &sample);  /* map the work index to a pixel and sample */
+
+			/* Store buffer offset for writing to passes. */
+			uint buffer_offset = (tile->offset + x + y*tile->stride) * kernel_data.film.pass_stride;
+			kernel_split_state.buffer_offset[ray_index] = buffer_offset;
+
+			/* Initialize random numbers and ray. */
+			uint rng_hash;
+			kernel_path_trace_setup(kg, sample, x, y, &rng_hash, ray);
+
+			if(ray->t != 0.0f) {
+				/* Initialize throughput, path radiance, Ray, PathState;
+				 * These rays proceed with path-iteration.
+				 */
+				*throughput = make_float3(1.0f, 1.0f, 1.0f);
+				path_radiance_init(L, kernel_data.film.use_light_pass);
+				path_state_init(kg,
+				                AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
+				                state,
+				                rng_hash,
+				                sample,
+				                ray);
+#ifdef __SUBSURFACE__
+				kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
+#endif
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+				enqueue_flag = 1;
+			}
+			else {
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);  /* ray->t == 0: state is unchanged and nothing is enqueued */
+			}
+		}
+	}
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	}  /* if(ray_index != QUEUE_EMPTY_SLOT) */
+#endif
+
+	/* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	 * These rays will be made active during the next scene intersect kernel.
+	 */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index 6e158d53d23..77fb61b80a8 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -14,221 +14,96 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_data_initialization kernel
- * This kernel Initializes structures needed in path-iteration kernels.
- * This is the first kernel in ray-tracing logic.
+/* This kernel initializes structures needed in path-iteration kernels.
  *
- * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
- *
- * Its input and output are as follows,
- *
- * Un-initialized rng---------------|--- kernel_data_initialization ---|--- Initialized rng
- * Un-initialized throughput -------|                                  |--- Initialized throughput
- * Un-initialized L_transparent ----|                                  |--- Initialized L_transparent
- * Un-initialized PathRadiance -----|                                  |--- Initialized PathRadiance
- * Un-initialized Ray --------------|                                  |--- Initialized Ray
- * Un-initialized PathState --------|                                  |--- Initialized PathState
- * Un-initialized QueueData --------|                                  |--- Initialized QueueData (to QUEUE_EMPTY_SLOT)
- * Un-initialized QueueIndex -------|                                  |--- Initialized QueueIndex (to 0)
- * Un-initialized use_queues_flag---|                                  |--- Initialized use_queues_flag (to false)
- * Un-initialized ray_state --------|                                  |--- Initialized ray_state
- * parallel_samples --------------- |                                  |--- Initialized per_sample_output_buffers
- * rng_state -----------------------|                                  |--- Initialized work_array
- * data ----------------------------|                                  |--- Initialized work_pool_wgs
- * start_sample --------------------|                                  |
- * sx ------------------------------|                                  |
- * sy ------------------------------|                                  |
- * sw ------------------------------|                                  |
- * sh ------------------------------|                                  |
- * stride --------------------------|                                  |
- * queuesize -----------------------|                                  |
- * num_samples ---------------------|                                  |
- *
- * Note on Queues :
+ * Note on Queues:
  * All slots in queues are initialized to queue empty slot;
  * The number of elements in the queues is initialized to 0;
  */
+
+#ifndef __KERNEL_CPU__
 ccl_device void kernel_data_init(
+#else
+void KERNEL_FUNCTION_FULL_NAME(data_init)(
+#endif
         KernelGlobals *kg,
-        ShaderData *sd_DL_shadow,
         ccl_constant KernelData *data,
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_state,
-        ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
-        ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
-        ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
-        PathRadiance *PathRadiance_coop,             /* PathRadiance array to store PathRadiance values for all rays */
-        ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
-        ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
-        Intersection *Intersection_coop_shadow,
-        ccl_global char *ray_state,                  /* Stores information on current state of a ray */
-
-#define KERNEL_TEX(type, ttype, name)                                   \
-        ccl_global type *name,
-#include "../kernel_textures.h"
-
-        int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
-        int rng_state_offset_x,
-        int rng_state_offset_y,
-        int rng_state_stride,
-        ccl_global int *Queue_data,                  /* Memory for queues */
+        ccl_global void *split_data_buffer,
+        int num_elements,
+        ccl_global char *ray_state,
+
+#ifdef __KERNEL_OPENCL__
+		KERNEL_BUFFER_PARAMS,
+#endif
+
+        int start_sample,
+        int end_sample,
+        int sx, int sy, int sw, int sh, int offset, int stride,
         ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
         int queuesize,                               /* size (capacity) of the queue */
         ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
-        ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
-#ifdef __WORK_STEALING__
-        ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
-        unsigned int num_samples,                    /* Total number of samples per pixel */
-#endif
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples)                        /* Number of samples to be processed in parallel */
+        ccl_global unsigned int *work_pools,      /* Work pool for each work group */
+        unsigned int num_samples,
+        ccl_global float *buffer)
 {
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, data_init);
+#else
+
+#ifdef __KERNEL_OPENCL__
 	kg->data = data;
-	kg->sd_input = sd_DL_shadow;
-	kg->isect_shadow = Intersection_coop_shadow;
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "../kernel_textures.h"
-
-	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-
-#ifdef __WORK_STEALING__
-	int lid = get_local_id(1) * get_local_size(0) + get_local_id(0);
-	/* Initialize work_pool_wgs */
-	if(lid == 0) {
-		int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0);
-		work_pool_wgs[group_index] = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-#endif  /* __WORK_STEALING__ */
+#endif
 
-	/* Initialize queue data and queue index. */
-	if(thread_index < queuesize) {
-		/* Initialize active ray queue. */
-		Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
-		/* Initialize background and buffer update queue. */
-		Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
-		/* Initialize shadow ray cast of AO queue. */
-		Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
-		/* Initialize shadow ray cast of direct lighting queue. */
-		Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
-	}
+	kernel_split_params.tile.x = sx;
+	kernel_split_params.tile.y = sy;
+	kernel_split_params.tile.w = sw;
+	kernel_split_params.tile.h = sh;
 
-	if(thread_index == 0) {
-		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
-		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
-		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
-		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
-		/* The scene-intersect kernel should not use the queues very first time.
-		 * since the queue would be empty.
-		 */
-		use_queues_flag[0] = 0;
-	}
+	kernel_split_params.tile.start_sample = start_sample;
+	kernel_split_params.tile.num_samples = num_samples;
 
-	int x = get_global_id(0);
-	int y = get_global_id(1);
+	kernel_split_params.tile.offset = offset;
+	kernel_split_params.tile.stride = stride;
 
-	if(x < (sw * parallel_samples) && y < sh) {
-		int ray_index = x + y * (sw * parallel_samples);
+	kernel_split_params.tile.buffer = buffer;
 
-		/* This is the first assignment to ray_state;
-		 * So we dont use ASSIGN_RAY_STATE macro.
-		 */
-		ray_state[ray_index] = RAY_ACTIVE;
-
-		unsigned int my_sample;
-		unsigned int pixel_x;
-		unsigned int pixel_y;
-		unsigned int tile_x;
-		unsigned int tile_y;
-		unsigned int my_sample_tile;
-
-#ifdef __WORK_STEALING__
-		unsigned int my_work = 0;
-		/* Get work. */
-		get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
-		/* Get the sample associated with the work. */
-		my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-
-		my_sample_tile = 0;
-
-		/* Get pixel and tile position associated with the work. */
-		get_pixel_tile_position(&pixel_x, &pixel_y,
-		                        &tile_x, &tile_y,
-		                        my_work,
-		                        sw, sh, sx, sy,
-		                        parallel_samples,
-		                        ray_index);
-		work_array[ray_index] = my_work;
-#else  /* __WORK_STEALING__ */
-		unsigned int tile_index = ray_index / parallel_samples;
-		tile_x = tile_index % sw;
-		tile_y = tile_index / sw;
-		my_sample_tile = ray_index - (tile_index * parallel_samples);
-		my_sample = my_sample_tile + start_sample;
-
-		/* Initialize work array. */
-		work_array[ray_index] = my_sample ;
-
-		/* Calculate pixel position of this ray. */
-		pixel_x = sx + tile_x;
-		pixel_y = sy + tile_y;
-#endif  /* __WORK_STEALING__ */
-
-		rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
-
-		/* Initialise per_sample_output_buffers to all zeros. */
-		per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride;
-		int per_sample_output_buffers_iterator = 0;
-		for(per_sample_output_buffers_iterator = 0;
-		    per_sample_output_buffers_iterator < kernel_data.film.pass_stride;
-		    per_sample_output_buffers_iterator++)
-		{
-			per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f;
-		}
+	kernel_split_params.total_work_size = sw * sh * num_samples;
+
+	kernel_split_params.work_pools = work_pools;
 
-		/* Initialize random numbers and ray. */
-		kernel_path_trace_setup(kg,
-		                        rng_state,
-		                        my_sample,
-		                        pixel_x, pixel_y,
-		                        &rng_coop[ray_index],
-		                        &Ray_coop[ray_index]);
-
-		if(Ray_coop[ray_index].t != 0.0f) {
-			/* Initialize throughput, L_transparent, Ray, PathState;
-			 * These rays proceed with path-iteration.
-			 */
-			throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
-			L_transparent_coop[ray_index] = 0.0f;
-			path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass);
-			path_state_init(kg,
-			                kg->sd_input,
-			                &PathState_coop[ray_index],
-			                &rng_coop[ray_index],
-			                my_sample,
-			                &Ray_coop[ray_index]);
-#ifdef __KERNEL_DEBUG__
-			debug_data_init(&debugdata_coop[ray_index]);
+	kernel_split_params.queue_index = Queue_index;
+	kernel_split_params.queue_size = queuesize;
+	kernel_split_params.use_queues_flag = use_queues_flag;
+
+	split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state);
+
+#ifdef __KERNEL_OPENCL__
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 #endif
-		}
-		else {
-			/* These rays do not participate in path-iteration. */
-			float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			/* Accumulate result in output buffer. */
-			kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad);
-			path_rng_end(kg, rng_state, rng_coop[ray_index]);
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+
+	/* Initialize queue data and queue index. */
+	if(thread_index < queuesize) {
+		for(int i = 0; i < NUM_QUEUES; i++) {
+			kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
 		}
 	}
 
-	/* Mark rest of the ray-state indices as RAY_INACTIVE. */
-	if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) {
-		/* First assignment, hence we dont use ASSIGN_RAY_STATE macro */
-		ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE;
+	if(thread_index == 0) {
+		for(int i = 0; i < NUM_QUEUES; i++) {
+			Queue_index[i] = 0;
+		}
+
+		/* The scene-intersect kernel should not use the queues the very first
+		 * time, since the queues would be empty.
+		 */
+		*use_queues_flag = 0;
 	}
+#endif  /* KERNEL_STUB */
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
index 82ca18829d3..ca79602c565 100644
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -14,95 +14,145 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_direct_lighting kernel.
- * This is the eighth kernel in the ray tracing logic. This is the seventh
- * of the path iteration kernels. This kernel takes care of direct lighting
- * logic. However, the "shadow ray cast" part of direct lighting is handled
+/* This kernel takes care of direct lighting logic.
+ * However, the "shadow ray cast" part of direct lighting is handled
  * in the next kernel.
  *
- * This kernels determines the rays for which a shadow_blocked() function associated with direct lighting should be executed.
- * Those rays for which a shadow_blocked() function for direct-lighting must be executed, are marked with flag RAY_SHADOW_RAY_CAST_DL and
- * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS
+ * This kernel determines the rays for which a shadow_blocked() function
+ * associated with direct lighting should be executed. Those rays for which
+ * a shadow_blocked() function for direct-lighting must be executed, are
+ * marked with flag RAY_SHADOW_RAY_CAST_DL and enqueued into the queue
+ * QUEUE_SHADOW_RAY_CAST_DL_RAYS
  *
- * The input and output are as follows,
+ * Note on Queues:
+ * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue
+ * and processes only the rays of state RAY_ACTIVE; If a ray needs to execute
+ * the corresponding shadow_blocked part, after direct lighting, the ray is
+ * marked with RAY_SHADOW_RAY_CAST_DL flag.
  *
- * rng_coop -----------------------------------------|--- kernel_direct_lighting --|--- BSDFEval_coop
- * PathState_coop -----------------------------------|                             |--- ISLamp_coop
- * sd -----------------------------------------------|                             |--- LightRay_coop
- * ray_state ----------------------------------------|                             |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                             |
- * kg (globals) -------------------------------------|                             |
- * queuesize ----------------------------------------|                             |
- *
- * Note on Queues :
- * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
- * only the rays of state RAY_ACTIVE; If a ray needs to execute the corresponding shadow_blocked
- * part, after direct lighting, the ray is marked with RAY_SHADOW_RAY_CAST_DL flag.
- *
- * State of queues when this kernel is called :
- * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
- * before and after this kernel call.
- * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this
- * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
+ * State of queues when this kernel is called:
+ * - State of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and
+ *   QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same before and after this
+ *   kernel call.
+ * - QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a
+ *   shadow_blocked function must be executed, after this kernel call.
+ *   Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
  */
-ccl_device char kernel_direct_lighting(
-        KernelGlobals *kg,
-        ShaderData *sd,                         /* Required for direct lighting */
-        ccl_global uint *rng_coop,              /* Required for direct lighting */
-        ccl_global PathState *PathState_coop,   /* Required for direct lighting */
-        ccl_global int *ISLamp_coop,            /* Required for direct lighting */
-        ccl_global Ray *LightRay_coop,          /* Required for direct lighting */
-        ccl_global BsdfEval *BSDFEval_coop,     /* Required for direct lighting */
-        ccl_global char *ray_state,             /* Denotes the state of each ray */
-        int ray_index)
+ccl_device void kernel_direct_lighting(KernelGlobals *kg,
+                                       ccl_local_param unsigned int *local_queue_atomics)
 {
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
 	char enqueue_flag = 0;
-	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		ccl_global PathState *state = &PathState_coop[ray_index];
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+		ShaderData *sd = kernel_split_sd(sd, ray_index);
 
 		/* direct lighting */
 #ifdef __EMISSION__
-		if((kernel_data.integrator.use_direct_light &&
-		    (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)))
-		{
+		bool flag = (kernel_data.integrator.use_direct_light &&
+		             (sd->flag & SD_BSDF_HAS_EVAL));
+
+#  ifdef __BRANCHED_PATH__
+		if(flag && kernel_data.integrator.branched) {
+			flag = false;
+			enqueue_flag = 1;
+		}
+#  endif  /* __BRANCHED_PATH__ */
+
+#  ifdef __SHADOW_TRICKS__
+		if(flag && state->flag & PATH_RAY_SHADOW_CATCHER) {
+			flag = false;
+			enqueue_flag = 1;
+		}
+#  endif  /* __SHADOW_TRICKS__ */
+
+		if(flag) {
 			/* Sample illumination from lights to find path contribution. */
-			ccl_global RNG* rng = &rng_coop[ray_index];
-			float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 			float light_u, light_v;
-			path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-			float terminate = path_state_rng_light_termination(kg, rng, state);
+			path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
+			float terminate = path_state_rng_light_termination(kg, state);
 
 			LightSample ls;
 			if(light_sample(kg,
-			                light_t, light_u, light_v,
-			                ccl_fetch(sd, time),
-			                ccl_fetch(sd, P),
+			                light_u, light_v,
+			                sd->time,
+			                sd->P,
 			                state->bounce,
 			                &ls)) {
 
 				Ray light_ray;
-#ifdef __OBJECT_MOTION__
-				light_ray.time = ccl_fetch(sd, time);
-#endif
+				light_ray.time = sd->time;
 
 				BsdfEval L_light;
 				bool is_lamp;
-				if(direct_emission(kg, sd, kg->sd_input, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
+				if(direct_emission(kg,
+				                   sd,
+				                   AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
+				                   &ls,
+				                   state,
+				                   &light_ray,
+				                   &L_light,
+				                   &is_lamp,
+				                   terminate))
+				{
 					/* Write intermediate data to global memory to access from
 					 * the next kernel.
 					 */
-					LightRay_coop[ray_index] = light_ray;
-					BSDFEval_coop[ray_index] = L_light;
-					ISLamp_coop[ray_index] = is_lamp;
+					kernel_split_state.light_ray[ray_index] = light_ray;
+					kernel_split_state.bsdf_eval[ray_index] = L_light;
+					kernel_split_state.is_lamp[ray_index] = is_lamp;
 					/* Mark ray state for next shadow kernel. */
-					ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
 					enqueue_flag = 1;
 				}
 			}
 		}
 #endif  /* __EMISSION__ */
 	}
-	return enqueue_flag;
+
+#ifdef __EMISSION__
+	/* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+#endif
+
+#ifdef __BRANCHED_PATH__
+	/* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays
+	 * this is the last kernel before next_iteration_setup that uses local atomics so we do this here
+	 */
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_LIGHT_INDIRECT_ITER,
+	                        IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER),
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+
+#endif  /* __BRANCHED_PATH__ */
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h
new file mode 100644
index 00000000000..fb5bd3d48dd
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_do_volume.h
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#if defined(__BRANCHED_PATH__) && defined(__VOLUME__)
+
+ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg, int ray_index)
+{
+	kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+	ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT);
+}
+
+ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg, int ray_index)
+{
+	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+	ShaderData *sd = kernel_split_sd(sd, ray_index);
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
+
+	/* GPU: no decoupled ray marching, scatter probabilistically */
+	int num_samples = kernel_data.integrator.volume_samples;
+	float num_samples_inv = 1.0f/num_samples;
+
+	Ray volume_ray = branched_state->ray;
+	volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ? branched_state->isect.t : FLT_MAX;
+
+	bool heterogeneous = volume_stack_is_heterogeneous(kg, branched_state->path_state.volume_stack);
+
+	for(int j = branched_state->next_sample; j < num_samples; j++) {
+		ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
+		*ps = branched_state->path_state;
+
+		ccl_global Ray *pray = &kernel_split_state.ray[ray_index];
+		*pray = branched_state->ray;
+
+		ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
+		*tp = branched_state->throughput * num_samples_inv;
+
+		/* branch RNG state */
+		path_state_branch(ps, j, num_samples);
+
+		/* integrate along volume segment with distance sampling */
+		VolumeIntegrateResult result = kernel_volume_integrate(
+			kg, ps, sd, &volume_ray, L, tp, heterogeneous);
+
+#  ifdef __VOLUME_SCATTER__
+		if(result == VOLUME_PATH_SCATTERED) {
+			/* direct lighting */
+			kernel_path_volume_connect_light(kg, sd, emission_sd, *tp, &branched_state->path_state, L);
+
+			/* indirect light bounce */
+			if(!kernel_path_volume_bounce(kg, sd, tp, ps, &L->state, pray)) {
+				continue;
+			}
+
+			/* start the indirect path */
+			branched_state->next_closure = 0;
+			branched_state->next_sample = j+1;
+
+			/* Attempting to share too many samples is slow for volumes as it causes us to
+			 * loop here more and have many calls to kernel_volume_integrate which evaluates
+			 * shaders. The many expensive shader evaluations cause the work load to become
+			 * unbalanced and many threads to become idle in this kernel. Limiting the
+			 * number of shared samples here helps quite a lot.
+			 */
+			if(branched_state->shared_sample_count < 2) {
+				if(kernel_split_branched_indirect_start_shared(kg, ray_index)) {
+					continue;
+				}
+			}
+
+			return true;
+		}
+#  endif
+	}
+
+	branched_state->next_sample = num_samples;
+
+	branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+	if(branched_state->waiting_on_shared_samples) {
+		return true;
+	}
+
+	kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+	/* todo: avoid this calculation using decoupled ray marching */
+	float3 throughput = kernel_split_state.throughput[ray_index];
+	kernel_volume_shadow(kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput);
+	kernel_split_state.throughput[ray_index] = throughput;
+
+	return false;
+}
+
+#endif  /* __BRANCHED_PATH__ && __VOLUME__ */
+
+ccl_device void kernel_do_volume(KernelGlobals *kg)
+{
+#ifdef __VOLUME__
+	/* We will empty this queue in this kernel. */
+	if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+		kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+#  ifdef __BRANCHED_PATH__
+		kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0;
+#  endif  /* __BRANCHED_PATH__ */
+	}
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+
+	if(*kernel_split_params.use_queues_flag) {
+		ray_index = get_ray_index(kg, ray_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          kernel_split_state.queue_data,
+		                          kernel_split_params.queue_size,
+		                          1);
+	}
+
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
+	   IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+		ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+		ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
+		ShaderData *sd = kernel_split_sd(sd, ray_index);
+		ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
+
+		bool hit = ! IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
+
+		/* Sanitize volume stack. */
+		if(!hit) {
+			kernel_volume_clean_stack(kg, state->volume_stack);
+		}
+		/* volume attenuation, emission, scatter */
+		if(state->volume_stack[0].shader != SHADER_NONE) {
+			Ray volume_ray = *ray;
+			volume_ray.t = (hit)? isect->t: FLT_MAX;
+
+#  ifdef __BRANCHED_PATH__
+			if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#  endif  /* __BRANCHED_PATH__ */
+				bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+
+				{
+					/* integrate along volume segment with distance sampling */
+					VolumeIntegrateResult result = kernel_volume_integrate(
+						kg, state, sd, &volume_ray, L, throughput, heterogeneous);
+
+#  ifdef __VOLUME_SCATTER__
+					if(result == VOLUME_PATH_SCATTERED) {
+						/* direct lighting */
+						kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
+
+						/* indirect light bounce */
+						if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) {
+							ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+						}
+						else {
+							kernel_split_path_end(kg, ray_index);
+						}
+					}
+#  endif  /* __VOLUME_SCATTER__ */
+				}
+
+#  ifdef __BRANCHED_PATH__
+			}
+			else {
+				kernel_split_branched_path_volume_indirect_light_init(kg, ray_index);
+
+				if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
+					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+				}
+			}
+#  endif  /* __BRANCHED_PATH__ */
+		}
+	}
+
+#  ifdef __BRANCHED_PATH__
+	/* iter loop */
+	ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+	                          QUEUE_VOLUME_INDIRECT_ITER,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          1);
+
+	if(IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) {
+		/* for render passes, sum and reset indirect light pass variables
+		 * for the next samples */
+		path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
+		path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
+
+		if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+		}
+	}
+#  endif  /* __BRANCHED_PATH__ */
+
+#endif  /* __VOLUME__ */
+}
+
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_enqueue_inactive.h b/intern/cycles/kernel/split/kernel_enqueue_inactive.h
new file mode 100644
index 00000000000..496355bbc3a
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_enqueue_inactive.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_enqueue_inactive(KernelGlobals *kg,
+                                        ccl_local_param unsigned int *local_queue_atomics)
+{
+#ifdef __BRANCHED_PATH__
+	/* Enqueue RAY_INACTIVE rays into QUEUE_INACTIVE_RAYS queue. */
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+
+	char enqueue_flag = 0;
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_INACTIVE)) {
+		enqueue_flag = 1;
+	}
+
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_INACTIVE_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+#endif  /* __BRANCHED_PATH__ */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 5d951b972ed..88919f47c7a 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -14,247 +14,161 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel.
- * This is the sixth kernel in the ray tracing logic. This is the fifth
- * of the path iteration kernels. This kernel takes care of the logic to process
- * "material of type holdout", indirect primitive emission, bsdf blurring,
- * probabilistic path termination and AO.
+/* This kernel takes care of the logic to process "material of type holdout",
+ * indirect primitive emission, bsdf blurring, probabilistic path termination
+ * and AO.
  *
- * This kernels determines the rays for which a shadow_blocked() function associated with AO should be executed.
- * Those rays for which a shadow_blocked() function for AO must be executed are marked with flag RAY_SHADOW_RAY_CAST_ao and
- * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS
+ * This kernel determines the rays for which a shadow_blocked() function
+ * associated with AO should be executed. Those rays for which a
+ * shadow_blocked() function for AO must be executed are marked with flag
+ * RAY_SHADOW_RAY_CAST_AO and enqueued into the queue
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS.
  *
  * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
  *
- * The input and output are as follows,
+ * Note on Queues:
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
+ * and processes only the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and
+ * reach RAY_UPDATE_BUFFER state. These rays are enqueued into
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present
+ * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has
+ * been changed to RAY_UPDATE_BUFFER, there is no problem.
  *
- * rng_coop ---------------------------------------------|--- kernel_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * throughput_coop --------------------------------------|                                                           |--- PathState_coop
- * PathRadiance_coop ------------------------------------|                                                           |--- throughput_coop
- * Intersection_coop ------------------------------------|                                                           |--- L_transparent_coop
- * PathState_coop ---------------------------------------|                                                           |--- per_sample_output_buffers
- * L_transparent_coop -----------------------------------|                                                           |--- PathRadiance_coop
- * sd ---------------------------------------------------|                                                           |--- ShaderData
- * ray_state --------------------------------------------|                                                           |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------|                                                           |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                                           |--- AOAlpha_coop
- * kg (globals) -----------------------------------------|                                                           |--- AOBSDF_coop
- * parallel_samples -------------------------------------|                                                           |--- AOLightRay_coop
- * per_sample_output_buffers ----------------------------|                                                           |
- * sw ---------------------------------------------------|                                                           |
- * sh ---------------------------------------------------|                                                           |
- * sx ---------------------------------------------------|                                                           |
- * sy ---------------------------------------------------|                                                           |
- * stride -----------------------------------------------|                                                           |
- * work_array -------------------------------------------|                                                           |
- * queuesize --------------------------------------------|                                                           |
- * start_sample -----------------------------------------|                                                           |
- *
- * Note on Queues :
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
- * the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER
- * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
- * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
- * changed to RAY_UPDATE_BUFFER, there is no problem.
- *
- * State of queues when this kernel is called :
+ * State of queues when this kernel is called:
  * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
+ *     RAY_REGENERATED rays
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE rays.
+ *   - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
  * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
+ *     RAY_REGENERATED and RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with
+ *     flag RAY_SHADOW_RAY_CAST_AO
  */
+
 ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
         KernelGlobals *kg,
-        ShaderData *sd,                        /* Required throughout the kernel except probabilistic path termination and AO */
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_coop,             /* Required for "kernel_write_data_passes" and AO */
-        ccl_global float3 *throughput_coop,    /* Required for handling holdout material and AO */
-        ccl_global float *L_transparent_coop,  /* Required for handling holdout material */
-        PathRadiance *PathRadiance_coop,       /* Required for "kernel_write_data_passes" and indirect primitive emission */
-        ccl_global PathState *PathState_coop,  /* Required throughout the kernel and AO */
-        Intersection *Intersection_coop,       /* Required for indirect primitive emission */
-        ccl_global float3 *AOAlpha_coop,       /* Required for AO */
-        ccl_global float3 *AOBSDF_coop,        /* Required for AO */
-        ccl_global Ray *AOLightRay_coop,       /* Required for AO */
-        int sw, int sh, int sx, int sy, int stride,
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        ccl_global unsigned int *work_array,   /* Denotes the work that each ray belongs to */
-#ifdef __WORK_STEALING__
-        unsigned int start_sample,
-#endif
-        int parallel_samples,                  /* Number of samples to be processed in parallel */
-        int ray_index,
-        char *enqueue_flag,
-        char *enqueue_flag_AO_SHADOW_RAY_CAST)
+        ccl_local_param BackgroundAOLocals *locals)
 {
-#ifdef __WORK_STEALING__
-	unsigned int my_work;
-	unsigned int pixel_x;
-	unsigned int pixel_y;
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {  /* Only local work-item (0, 0) resets the counters. */
+		locals->queue_atomics_bg = 0;  /* Reset here, but not referenced again in this function. */
+		locals->queue_atomics_ao = 0;  /* Passed to enqueue_ray_index_local() at the end. */
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+#ifdef __AO__
+	char enqueue_flag = 0;  /* Set to 1 when this ray must go to the AO shadow-ray queue. */
+#endif
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);  /* Row-major 2D -> 1D id. */
+	ray_index = get_ray_index(kg, ray_index,  /* The flattened id is a queue slot, replaced by its ray index. */
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif  /* __COMPUTE_DEVICE_GPU__ */
+
+#ifndef __COMPUTE_DEVICE_GPU__  /* Non-GPU: skip the work instead of returning early. */
+	if(ray_index != QUEUE_EMPTY_SLOT) {  /* Closed by the matching #ifndef block before the enqueue. */
 #endif
-	unsigned int tile_x;
-	unsigned int tile_y;
-	int my_sample_tile;
-	unsigned int sample;
 
-	ccl_global RNG *rng = 0x0;
 	ccl_global PathState *state = 0x0;
 	float3 throughput;
 
-	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-
-		throughput = throughput_coop[ray_index];
-		state = &PathState_coop[ray_index];
-		rng = &rng_coop[ray_index];
-#ifdef __WORK_STEALING__
-		my_work = work_array[ray_index];
-		sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-		get_pixel_tile_position(&pixel_x, &pixel_y,
-		                        &tile_x, &tile_y,
-		                        my_work,
-		                        sw, sh, sx, sy,
-		                        parallel_samples,
-		                        ray_index);
-		my_sample_tile = 0;
-#else  /* __WORK_STEALING__ */
-		sample = work_array[ray_index];
-		/* Buffer's stride is "stride"; Find x and y using ray_index. */
-		int tile_index = ray_index / parallel_samples;
-		tile_x = tile_index % sw;
-		tile_y = tile_index / sw;
-		my_sample_tile = ray_index - (tile_index * parallel_samples);
-#endif  /* __WORK_STEALING__ */
-		per_sample_output_buffers +=
-		    (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) *
-		    kernel_data.film.pass_stride;
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+	ShaderData *sd = kernel_split_sd(sd, ray_index);
 
-		/* holdout */
-#ifdef __HOLDOUT__
-		if(((ccl_fetch(sd, flag) & SD_HOLDOUT) ||
-		    (ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK)) &&
-		   (state->flag & PATH_RAY_CAMERA))
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+		ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;  /* Per-ray offset into the tile buffer. */
+
+		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+		ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+
+		throughput = kernel_split_state.throughput[ray_index];  /* Local copy, reused in the block below. */
+		state = &kernel_split_state.path_state[ray_index];
+
+		if(!kernel_path_shader_apply(kg,  /* A false return ends the path for this ray. */
+		                             sd,
+		                             state,
+		                             ray,
+		                             throughput,
+		                             emission_sd,
+		                             L,
+		                             buffer))
 		{
-			if(kernel_data.background.transparent) {
-				float3 holdout_weight;
-				if(ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK) {
-					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
-				}
-				else {
-					holdout_weight = shader_holdout_eval(kg, sd);
-				}
-				/* any throughput is ok, should all be identical here */
-				L_transparent_coop[ray_index] += average(holdout_weight*throughput);
-			}
-			if(ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK) {
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-				*enqueue_flag = 1;
-			}
+			kernel_split_path_end(kg, ray_index);
 		}
-#endif  /* __HOLDOUT__ */
 	}
 
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		PathRadiance *L = &PathRadiance_coop[ray_index];
-		/* Holdout mask objects do not write data passes. */
-		kernel_write_data_passes(kg,
-		                         per_sample_output_buffers,
-		                         L,
-		                         sd,
-		                         sample,
-		                         state,
-		                         throughput);
-		/* Blurring of bsdf after bounces, for rays that have a small likelihood
-		 * of following this particular path (diffuse, rough glossy.
-		 */
-		if(kernel_data.integrator.filter_glossy != FLT_MAX) {
-			float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
-			if(blur_pdf < 1.0f) {
-				float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
-				shader_bsdf_blur(kg, sd, blur_roughness);
-			}
-		}
-
-#ifdef __EMISSION__
-		/* emission */
-		if(ccl_fetch(sd, flag) & SD_EMISSION) {
-			/* TODO(sergey): is isect.t wrong here for transparent surfaces? */
-			float3 emission = indirect_primitive_emission(
-			        kg,
-			        sd,
-			        Intersection_coop[ray_index].t,
-			        state->flag,
-			        state->ray_pdf);
-			path_radiance_accum_emission(L, throughput, emission, state->bounce);
-		}
-#endif  /* __EMISSION__ */
-
 		/* Path termination. this is a strange place to put the termination, it's
 		 * mainly due to the mixed in MIS that we use. gives too many unneeded
 		 * shader evaluations, only need emission if we are going to terminate.
 		 */
-		float probability = path_state_terminate_probability(kg, state, throughput);
+		float probability = path_state_continuation_probability(kg, state, throughput);
 
 		if(probability == 0.0f) {
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-			*enqueue_flag = 1;
+			kernel_split_path_end(kg, ray_index);  /* Zero continuation probability: always end. */
+		}
+		else if(probability < 1.0f) {  /* Random termination test; skipped when probability >= 1. */
+			float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
+			if(terminate >= probability) {
+				kernel_split_path_end(kg, ray_index);
+			}
+			else {
+				kernel_split_state.throughput[ray_index] = throughput/probability;  /* Only the stored value is rescaled, not the local copy. */
+			}
 		}
 
 		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-			if(probability != 1.0f) {
-				float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
-				if(terminate >= probability) {
-					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-					*enqueue_flag = 1;
-				}
-				else {
-					throughput_coop[ray_index] = throughput/probability;
-				}
-			}
+			PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+			kernel_update_denoising_features(kg, sd, state, L);  /* Runs only for rays still RAY_ACTIVE here. */
 		}
 	}
 
 #ifdef __AO__
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
 		/* ambient occlusion */
-		if(kernel_data.integrator.use_ambient_occlusion ||
-		   (ccl_fetch(sd, flag) & SD_AO))
-		{
-			/* todo: solve correlation */
-			float bsdf_u, bsdf_v;
-			path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
-			float ao_factor = kernel_data.background.ao_factor;
-			float3 ao_N;
-			AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
-			AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd);
-
-			float3 ao_D;
-			float ao_pdf;
-			sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
-			if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
-				Ray _ray;
-				_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
-				_ray.D = ao_D;
-				_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
-				_ray.time = ccl_fetch(sd, time);
-#endif
-				_ray.dP = ccl_fetch(sd, dP);
-				_ray.dD = differential3_zero();
-				AOLightRay_coop[ray_index] = _ray;
-
-				ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
-				*enqueue_flag_AO_SHADOW_RAY_CAST = 1;
-			}
+		if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
+			enqueue_flag = 1;  /* Only flags the ray; no AO sampling happens in this function. */
 		}
 	}
 #endif  /* __AO__ */
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	}  /* Closes the non-GPU "ray_index != QUEUE_EMPTY_SLOT" block. */
+#endif
+
+#ifdef __AO__
+	/* Enqueue to-shadow-ray-cast rays. */
+	enqueue_ray_index_local(ray_index,  /* Called by every work-item that reaches this point. */
+	                        QUEUE_SHADOW_RAY_CAST_AO_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        &locals->queue_atomics_ao,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+#endif
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h
new file mode 100644
index 00000000000..4cf88a02590
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_indirect_background.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_indirect_background(KernelGlobals *kg)
+{
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);  /* Row-major 2D -> 1D id. */
+	int ray_index;
+
+	if(kernel_data.integrator.ao_bounces != INT_MAX) {  /* Pass 1; skipped when ao_bounces == INT_MAX. */
+		ray_index = get_ray_index(kg, thread_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          kernel_split_state.queue_data,
+		                          kernel_split_params.queue_size,
+		                          0);
+
+		if(ray_index != QUEUE_EMPTY_SLOT) {  /* No early return here: pass 2 below must still run. */
+			if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+				ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+				if(path_state_ao_bounce(kg, state)) {
+					kernel_split_path_end(kg, ray_index);
+				}
+			}
+		}
+	}
+
+	ray_index = get_ray_index(kg, thread_index,  /* Pass 2: same thread slot, different queue. */
+	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+
+	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {  /* Rays in any other state are left untouched. */
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+		float3 throughput = kernel_split_state.throughput[ray_index];  /* Handed over by value below. */
+		ShaderData *sd = kernel_split_sd(sd, ray_index);
+
+		kernel_path_background(kg, state, ray, throughput, sd, L);
+		kernel_split_path_end(kg, ray_index);  /* Always called once the background has been evaluated. */
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
new file mode 100644
index 00000000000..236c94e983c
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_indirect_subsurface(KernelGlobals *kg)
+{
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);  /* Row-major 2D -> 1D id. */
+	if(thread_index == 0) {  /* A single work-item resets both queue counters. */
+		/* We will empty both queues in this kernel. */
+		kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+		kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+	}
+
+	int ray_index;
+	get_ray_index(kg, thread_index,  /* Return value deliberately unused for this queue. */
+	              QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	              kernel_split_state.queue_data,
+	              kernel_split_params.queue_size,
+	              1);  /* Presumably "clear the slot after reading" - TODO confirm in get_ray_index(). */
+	ray_index = get_ray_index(kg, thread_index,
+	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          1);
+
+#ifdef __SUBSURFACE__  /* The queue handling above runs regardless of this define. */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];  /* Pointer: the callee may update it in place. */
+
+	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {  /* Only rays in state RAY_UPDATE_BUFFER are considered. */
+		ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+
+		/* Trace indirect subsurface rays by restarting the loop. this uses less
+		 * stack memory than invoking kernel_path_indirect.
+		 */
+		if(ss_indirect->num_rays) {  /* Non-zero count: indirect rays are still pending. */
+			kernel_path_subsurface_setup_indirect(kg,
+			                                      ss_indirect,
+			                                      state,
+			                                      ray,
+			                                      L,
+			                                      throughput);
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);  /* Replaces RAY_UPDATE_BUFFER for this ray. */
+		}
+	}
+#endif  /* __SUBSURFACE__ */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h
index 3bd0e361078..c14f66f664f 100644
--- a/intern/cycles/kernel/split/kernel_lamp_emission.h
+++ b/intern/cycles/kernel/split/kernel_lamp_emission.h
@@ -14,70 +14,55 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_lamp_emission
- * This is the 3rd kernel in the ray-tracing logic. This is the second of the
- * path-iteration kernels. This kernel takes care of the indirect lamp emission logic.
- * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE
- * and RAY_HIT_BACKGROUND.
+/* This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ * It processes rays of state RAY_ACTIVE and RAY_HIT_BACKGROUND.
  * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel.
- * The input/output of the kernel is as follows,
- * Throughput_coop ------------------------------------|--- kernel_lamp_emission --|--- PathRadiance_coop
- * Ray_coop -------------------------------------------|                           |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * PathState_coop -------------------------------------|                           |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * kg (globals) ---------------------------------------|                           |
- * Intersection_coop ----------------------------------|                           |
- * ray_state ------------------------------------------|                           |
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----|                           |
- * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----|                           |
- * queuesize ------------------------------------------|                           |
- * use_queues_flag ------------------------------------|                           |
- * sw -------------------------------------------------|                           |
- * sh -------------------------------------------------|                           |
  */
-ccl_device void kernel_lamp_emission(
-        KernelGlobals *kg,
-        ccl_global float3 *throughput_coop,    /* Required for lamp emission */
-        PathRadiance *PathRadiance_coop,       /* Required for lamp emission */
-        ccl_global Ray *Ray_coop,              /* Required for lamp emission */
-        ccl_global PathState *PathState_coop,  /* Required for lamp emission */
-        Intersection *Intersection_coop,       /* Required for lamp emission */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int sw, int sh,
-        ccl_global char *use_queues_flag,      /* Used to decide if this kernel should use
-                                                * queues to fetch ray index
-                                                */
-        int ray_index)
+ccl_device void kernel_lamp_emission(KernelGlobals *kg)
 {
-	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
-	   IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND))
-	{
-		PathRadiance *L = &PathRadiance_coop[ray_index];
-		ccl_global PathState *state = &PathState_coop[ray_index];
+#ifndef __VOLUME__  /* The counter reset below is compiled only when __VOLUME__ is undefined. */
+	/* We will empty this queue in this kernel. */
+	if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+		kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+	}
+#endif
+	/* Fetch use_queues_flag. */
+	char local_use_queues_flag = *kernel_split_params.use_queues_flag;
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);  /* Executed by every work-item, before any early return. */
 
-		float3 throughput = throughput_coop[ray_index];
-		Ray ray = Ray_coop[ray_index];
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);  /* Used as-is when queues are off. */
+	if(local_use_queues_flag) {
+		ray_index = get_ray_index(kg, ray_index,  /* The flattened id is a queue slot in this mode. */
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          kernel_split_state.queue_data,
+		                          kernel_split_params.queue_size,
+#ifndef __VOLUME__  /* Last argument: 1 without __VOLUME__, 0 with it. */
+		                          1
+#else
+		                          0
+#endif
+		                          );
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	}
 
-#ifdef __LAMP_MIS__
-		if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
-			/* ray starting from previous non-transparent bounce */
-			Ray light_ray;
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
+	   IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND))
+	{
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 
-			light_ray.P = ray.P - state->ray_t*ray.D;
-			state->ray_t += Intersection_coop[ray_index].t;
-			light_ray.D = ray.D;
-			light_ray.t = state->ray_t;
-			light_ray.time = ray.time;
-			light_ray.dD = ray.dD;
-			light_ray.dP = ray.dP;
-			/* intersect with lamp */
-			float3 emission;
+		float3 throughput = kernel_split_state.throughput[ray_index];
+		Ray ray = kernel_split_state.ray[ray_index];  /* Local copy; the callee receives &ray, not the stored ray. */
+		ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
+		ShaderData *sd = kernel_split_sd(sd, ray_index);
 
-			if(indirect_lamp_emission(kg, kg->sd_input, state, &light_ray, &emission)) {
-				path_radiance_accum_emission(L, throughput, emission, state->bounce);
-			}
-		}
-#endif  /* __LAMP_MIS__ */
+		kernel_path_lamp_emission(kg, state, &ray, throughput, isect, sd, L);
 	}
 }
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
index 816f3a6fbff..e388955f1af 100644
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -14,128 +14,251 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_setup_next_iteration kernel.
- * This is the tenth kernel in the ray tracing logic. This is the ninth
- * of the path iteration kernels. This kernel takes care of setting up
- * Ray for the next iteration of path-iteration and accumulating radiance
- * corresponding to AO and direct-lighting
+/* This kernel takes care of setting up the ray for the next iteration of
+ * path-iteration and accumulating radiance corresponding to AO and
+ * direct-lighting
  *
- * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
+ * Ray state of rays that are terminated in this kernel are changed
+ * to RAY_UPDATE_BUFFER.
  *
- * The input and output are as follows,
+ * Note on queues:
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
+ * and processes only the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and
+ * reach RAY_UPDATE_BUFFER state. These rays are enqueued into
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present
+ * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has
+ * been changed to RAY_UPDATE_BUFFER, there is no problem.
  *
- * rng_coop ---------------------------------------------|--- kernel_next_iteration_setup -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * throughput_coop --------------------------------------|                                 |--- Queue_data (QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
- * PathRadiance_coop ------------------------------------|                                 |--- throughput_coop
- * PathState_coop ---------------------------------------|                                 |--- PathRadiance_coop
- * sd ---------------------------------------------------|                                 |--- PathState_coop
- * ray_state --------------------------------------------|                                 |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS) --------|                                 |--- Ray_coop
- * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                 |--- use_queues_flag
- * Ray_coop ---------------------------------------------|                                 |
- * kg (globals) -----------------------------------------|                                 |
- * LightRay_dl_coop -------------------------------------|
- * ISLamp_coop ------------------------------------------|
- * BSDFEval_coop ----------------------------------------|
- * LightRay_ao_coop -------------------------------------|
- * AOBSDF_coop ------------------------------------------|
- * AOAlpha_coop -----------------------------------------|
- *
- * Note on queues,
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
- * the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFF
- * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
- * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
- * changed to RAY_UPDATE_BUFF, there is no problem.
- *
- * State of queues when this kernel is called :
+ * State of queues when this kernel is called:
  * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
+ *     RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
  * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
+ *     RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays.
  */
-ccl_device char kernel_next_iteration_setup(
-        KernelGlobals *kg,
-        ShaderData *sd,                       /* Required for setting up ray for next iteration */
-        ccl_global uint *rng_coop,            /* Required for setting up ray for next iteration */
-        ccl_global float3 *throughput_coop,   /* Required for setting up ray for next iteration */
-        PathRadiance *PathRadiance_coop,      /* Required for setting up ray for next iteration */
-        ccl_global Ray *Ray_coop,             /* Required for setting up ray for next iteration */
-        ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
-        ccl_global Ray *LightRay_dl_coop,     /* Required for radiance update - direct lighting */
-        ccl_global int *ISLamp_coop,          /* Required for radiance update - direct lighting */
-        ccl_global BsdfEval *BSDFEval_coop,   /* Required for radiance update - direct lighting */
-        ccl_global Ray *LightRay_ao_coop,     /* Required for radiance update - AO */
-        ccl_global float3 *AOBSDF_coop,       /* Required for radiance update - AO */
-        ccl_global float3 *AOAlpha_coop,      /* Required for radiance update - AO */
-        ccl_global char *ray_state,           /* Denotes the state of each ray */
-        ccl_global char *use_queues_flag,     /* flag to decide if scene_intersect kernel should
-                                               * use queues to fetch ray index */
-        int ray_index)
+
+#ifdef __BRANCHED_PATH__
+ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index)
+{
+	kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+	/* After the loop init call, flag the ray with RAY_BRANCHED_LIGHT_INDIRECT. */
+	ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT);
+}
+
+ccl_device void kernel_split_branched_transparent_bounce(KernelGlobals *kg, int ray_index)
 {
-	char enqueue_flag = 0;
-
-	/* Load ShaderData structure. */
-	PathRadiance *L = NULL;
-	ccl_global PathState *state = NULL;
-
-	/* Path radiance update for AO/Direct_lighting's shadow blocked. */
-	if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
-	   IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
-	{
-		state = &PathState_coop[ray_index];
-		L = &PathRadiance_coop[ray_index];
-		float3 _throughput = throughput_coop[ray_index];
-
-		if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
-			float3 shadow = LightRay_ao_coop[ray_index].P;
-			char update_path_radiance = LightRay_ao_coop[ray_index].t;
-			if(update_path_radiance) {
-				path_radiance_accum_ao(L,
-				                       _throughput,
-				                       AOAlpha_coop[ray_index],
-				                       AOBSDF_coop[ray_index],
-				                       shadow,
-				                       state->bounce);
+	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+	ShaderData *sd = kernel_split_sd(sd, ray_index);
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+
+#  ifdef __VOLUME__
+	if(!(sd->flag & SD_HAS_ONLY_VOLUME)) {
+#  endif
+		/* continue in case of transparency */
+		*throughput *= shader_bsdf_transparency(kg, sd);
+
+		if(is_zero(*throughput)) {
+			kernel_split_path_end(kg, ray_index);
+			return;
+		}
+
+		/* Update Path State */
+		path_state_next(kg, state, LABEL_TRANSPARENT);
+#  ifdef __VOLUME__
+	}
+	else {
+		if(!path_state_volume_next(kg, state)) {
+			kernel_split_path_end(kg, ray_index);
+			return;
+		}
+	}
+#  endif
+
+	ray->P = ray_offset(sd->P, -sd->Ng);
+	ray->t -= sd->ray_length; /* clipping works through transparent */
+
+#  ifdef __RAY_DIFFERENTIALS__
+	ray->dP = sd->dP;
+	ray->dD.dx = -sd->dI.dx;
+	ray->dD.dy = -sd->dI.dy;
+#  endif  /* __RAY_DIFFERENTIALS__ */
+
+#  ifdef __VOLUME__
+	/* enter/exit volume */
+	kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
+#  endif  /* __VOLUME__ */
+}
+#endif  /* __BRANCHED_PATH__ */
+
+ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
+                                            ccl_local_param unsigned int *local_queue_atomics)
+{
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+		/* If we are here, then it means that scene-intersect kernel
+		 * has already been executed at least once. From the next time,
+		 * scene-intersect kernel may operate on queues to fetch ray index.
+		 */
+		*kernel_split_params.use_queues_flag = 1;
+
+		/* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and
+		 * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the
+		 * previous kernel.
+		 */
+		kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+	}
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+
+#  ifdef __VOLUME__
+	/* Reactivate only volume rays here, most surface work was skipped. */
+	if(IS_STATE(ray_state, ray_index, RAY_HAS_ONLY_VOLUME)) {
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE);
+	}
+#  endif
+
+	bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE);
+	if(active) {
+		ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+		ShaderData *sd = kernel_split_sd(sd, ray_index);
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+
+#ifdef __BRANCHED_PATH__
+		if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#endif
+			/* Compute direct lighting and next bounce. */
+			if(!kernel_path_surface_bounce(kg, sd, throughput, state, &L->state, ray)) {
+				kernel_split_path_end(kg, ray_index);
 			}
-			REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+#ifdef __BRANCHED_PATH__
+		}
+		else if(sd->flag & SD_HAS_ONLY_VOLUME) {
+			kernel_split_branched_transparent_bounce(kg, ray_index);
 		}
+		else {
+			kernel_split_branched_indirect_light_init(kg, ray_index);
 
-		if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
-			float3 shadow = LightRay_dl_coop[ray_index].P;
-			char update_path_radiance = LightRay_dl_coop[ray_index].t;
-			if(update_path_radiance) {
-				BsdfEval L_light = BSDFEval_coop[ray_index];
-				path_radiance_accum_light(L,
-				                          _throughput,
-				                          &L_light,
-				                          shadow,
-				                          1.0f,
-				                          state->bounce,
-				                          ISLamp_coop[ray_index]);
+			if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+			                                                          ray_index,
+			                                                          1.0f,
+			                                                          kernel_split_sd(branched_state_sd, ray_index),
+			                                                          true,
+			                                                          true))
+			{
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+			}
+			else {
+				kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+				kernel_split_branched_transparent_bounce(kg, ray_index);
 			}
-			REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
 		}
+#endif  /* __BRANCHED_PATH__ */
+	}
+
+	/* Enqueue RAY_UPDATE_BUFFER rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+
+#ifdef __BRANCHED_PATH__
+	/* iter loop */
+	if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+		kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0;
 	}
 
-	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		ccl_global float3 *throughput = &throughput_coop[ray_index];
-		ccl_global Ray *ray = &Ray_coop[ray_index];
-		ccl_global RNG *rng = &rng_coop[ray_index];
-		state = &PathState_coop[ray_index];
-		L = &PathRadiance_coop[ray_index];
-
-		/* Compute direct lighting and next bounce. */
-		if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) {
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-			enqueue_flag = 1;
+	ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+	                          QUEUE_LIGHT_INDIRECT_ITER,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          1);
+
+	if(IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) {
+		/* for render passes, sum and reset indirect light pass variables
+		 * for the next samples */
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+
+		path_radiance_sum_indirect(L);
+		path_radiance_reset_indirect(L);
+
+		if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+		                                                          ray_index,
+		                                                          1.0f,
+		                                                          kernel_split_sd(branched_state_sd, ray_index),
+		                                                          true,
+		                                                          true))
+		{
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
 		}
+		else {
+			kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+			kernel_split_branched_transparent_bounce(kg, ray_index);
+		}
+	}
+
+#  ifdef __VOLUME__
+	/* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
 	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
-	return enqueue_flag;
+	ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_VOLUME_INDIRECT_ITER,
+	                        IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER),
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+
+#  endif  /* __VOLUME__ */
+
+#  ifdef __SUBSURFACE__
+	/* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SUBSURFACE_INDIRECT_ITER,
+	                        IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER),
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+#  endif  /* __SUBSURFACE__ */
+#endif  /* __BRANCHED_PATH__ */
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h
new file mode 100644
index 00000000000..fdd54225b07
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_path_init.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel initializes structures needed in path-iteration kernels.
+ * This is the first kernel in ray-tracing logic.
+ *
+ * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
+ */
+ccl_device void kernel_path_init(KernelGlobals *kg) {
+	int ray_index = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
+
+	/* This is the first assignment to ray_state,
+	 * so we don't use the ASSIGN_RAY_STATE macro.
+	 */
+	kernel_split_state.ray_state[ray_index] = RAY_ACTIVE;
+
+	/* Get work; the ray is marked RAY_INACTIVE below when none is left. */
+	ccl_global uint *work_pools = kernel_split_params.work_pools;
+	uint total_work_size = kernel_split_params.total_work_size;
+	uint work_index;
+
+	if(!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
+		/* No more work, mark ray as inactive. */
+		kernel_split_state.ray_state[ray_index] = RAY_INACTIVE;
+
+		return;
+	}
+
+	ccl_global WorkTile *tile = &kernel_split_params.tile;
+	uint x, y, sample;
+	get_work_pixel(tile, work_index, &x, &y, &sample);
+
+	/* Store buffer offset for writing to passes. */
+	uint buffer_offset = (tile->offset + x + y*tile->stride) * kernel_data.film.pass_stride;
+	kernel_split_state.buffer_offset[ray_index] = buffer_offset;
+
+	/* Initialize random numbers and ray. */
+	uint rng_hash;
+	kernel_path_trace_setup(kg,
+	                        sample,
+	                        x, y,
+	                        &rng_hash,
+	                        &kernel_split_state.ray[ray_index]);
+
+	if(kernel_split_state.ray[ray_index].t != 0.0f) {
+		/* Ray has a non-zero t: initialize throughput, path radiance and
+		 * path state; these rays proceed with path-iteration.
+		 */
+		kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
+		path_radiance_init(&kernel_split_state.path_radiance[ray_index], kernel_data.film.use_light_pass);
+		path_state_init(kg,
+		                AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
+		                &kernel_split_state.path_state[ray_index],
+		                rng_hash,
+		                sample,
+		                &kernel_split_state.ray[ray_index]);
+#ifdef __SUBSURFACE__
+		kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
+#endif
+	}
+	else {  /* ray t == 0.0f: mark the ray as RAY_TO_REGENERATE instead */
+		ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h
new file mode 100644
index 00000000000..df67fabab19
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel enqueues rays of different ray state into their
+ * appropriate queues:
+ *
+ * 1. Rays that have been determined to hit the background from the
+ *    "kernel_scene_intersect" kernel are enqueued in
+ *    QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+ * 2. Rays that have been determined to be actively participating in
+ *    path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * State of queues when this kernel is called:
+ * At entry,
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE
+ *     and RAY_UPDATE_BUFFER rays.
+ * At exit,
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
+ */
+ccl_device void kernel_queue_enqueue(KernelGlobals *kg,
+                                     ccl_local_param QueueEnqueueLocals *locals)
+{
+	/* We have only 2 cases (Hit/Not-Hit), hence two local queue counters. */
+	int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	/* NOTE(review): slots [0]/[1] presumably match the two queue enums indexed below — confirm. */
+	if(lidx == 0) {
+		locals->queue_atomics[0] = 0;
+		locals->queue_atomics[1] = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int queue_number = -1;  /* -1: ray is written to neither queue */
+
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) ||
+	   IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER) ||
+	   IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) {
+		queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+	}
+	else if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
+	        IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HAS_ONLY_VOLUME) ||
+	        IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
+		queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	}
+
+	unsigned int my_lqidx;  /* only assigned (and only read) when queue_number != -1 */
+	if(queue_number != -1) {
+		my_lqidx = get_local_queue_index(queue_number, locals->queue_atomics);
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	if(lidx == 0) {  /* replace both local counters with get_global_per_queue_offset() results */
+		locals->queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] =
+		        get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                                    locals->queue_atomics,
+		                                    kernel_split_params.queue_index);
+		locals->queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] =
+		        get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+		                                    locals->queue_atomics,
+		                                    kernel_split_params.queue_index);
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	unsigned int my_gqidx;  /* index into queue_data where this ray_index is stored */
+	if(queue_number != -1) {
+		my_gqidx = get_global_queue_index(queue_number,
+		                                  kernel_split_params.queue_size,
+		                                  my_lqidx,
+		                                  locals->queue_atomics);
+		kernel_split_state.queue_data[my_gqidx] = ray_index;
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
index 2388580051f..f5378bc172b 100644
--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -14,120 +14,66 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_scene_intersect kernel.
- * This is the second kernel in the ray tracing logic. This is the first
- * of the path iteration kernels. This kernel takes care of scene_intersect function.
+/* This kernel takes care of scene_intersect function.
  *
  * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE.
  * This kernel processes rays of ray state RAY_ACTIVE
- * This kernel determines the rays that have hit the background and changes their ray state to RAY_HIT_BACKGROUND.
- *
- * The input and output are as follows,
- *
- * Ray_coop ---------------------------------------|--------- kernel_scene_intersect----------|--- PathState
- * PathState_coop ---------------------------------|                                          |--- Intersection
- * ray_state --------------------------------------|                                          |--- ray_state
- * use_queues_flag --------------------------------|                                          |
- * QueueData(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                                          |
- * kg (globals) -----------------------------------|                                          |
- * rng_coop ---------------------------------------|                                          |
- * sw ---------------------------------------------|                                          |
- * sh ---------------------------------------------|                                          |
- * queuesize --------------------------------------|                                          |
- *
- * Note on Queues :
- * Ideally we would want kernel_scene_intersect to work on queues.
- * But during the very first time, the queues will be empty and hence we perform a direct mapping
- * between ray-index and thread-index; From the next time onward, the queue will be filled and
- * we may start operating on queues.
- *
- * State of queue during the first time this kernel is called :
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.before and after this kernel
- *
- * State of queues during other times this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will have a mix of RAY_ACTIVE, RAY_UPDATE_BUFFER and RAY_REGENERATED rays;
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays ;
- * (The rays that are in the state RAY_UPDATE_BUFFER in both the queues are actually the same rays; These
- * are the rays that were in RAY_ACTIVE state during the initial enqueue but on further processing
- * , by different kernels, have turned into RAY_UPDATE_BUFFER rays. Since all kernel, even after fetching from
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS, proceed further based on ray state information, RAY_UPDATE_BUFFER rays
- * being present in QUEUE_ACTIVE_AND_REGENERATED_RAYS does not cause any logical issues)
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS - All RAY_REGENERATED rays will have been converted to RAY_ACTIVE and
- * Some rays in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue will move to state RAY_HIT_BACKGROUND
- * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS - no change
+ * This kernel determines the rays that have hit the background and changes
+ * their ray state to RAY_HIT_BACKGROUND.
  */
-
-ccl_device void kernel_scene_intersect(
-        KernelGlobals *kg,
-        ccl_global uint *rng_coop,
-        ccl_global Ray *Ray_coop,              /* Required for scene_intersect */
-        ccl_global PathState *PathState_coop,  /* Required for scene_intersect */
-        Intersection *Intersection_coop,       /* Required for scene_intersect */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int sw, int sh,
-        ccl_global char *use_queues_flag,      /* used to decide if this kernel should use
-                                                * queues to fetch ray index */
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int ray_index)
+ccl_device void kernel_scene_intersect(KernelGlobals *kg)
 {
-	/* All regenerated rays become active here */
-	if(IS_STATE(ray_state, ray_index, RAY_REGENERATED))
-		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE);
-
-	if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE))
-		return;
+	/* Fetch use_queues_flag */
+	char local_use_queues_flag = *kernel_split_params.use_queues_flag;
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
-#ifdef __KERNEL_DEBUG__
-	DebugData *debug_data = &debugdata_coop[ray_index];
-#endif
-	Intersection *isect = &Intersection_coop[ray_index];
-	PathState state = PathState_coop[ray_index];
-	Ray ray = Ray_coop[ray_index];
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(local_use_queues_flag) {
+		ray_index = get_ray_index(kg, ray_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          kernel_split_state.queue_data,
+		                          kernel_split_params.queue_size,
+		                          0);
 
-	/* intersect scene */
-	uint visibility = path_state_ray_visibility(kg, &state);
-
-#ifdef __HAIR__
-	float difl = 0.0f, extmax = 0.0f;
-	uint lcg_state = 0;
-	RNG rng = rng_coop[ray_index];
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	}
 
-	if(kernel_data.bvh.have_curves) {
-		if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {
-			float3 pixdiff = ray.dD.dx + ray.dD.dy;
-			/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
-			difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
+	/* All regenerated rays become active here */
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
+#ifdef __BRANCHED_PATH__
+		if(kernel_split_state.branched_state[ray_index].waiting_on_shared_samples) {
+			kernel_split_path_end(kg, ray_index);
+		}
+		else
+#endif  /* __BRANCHED_PATH__ */
+		{
+			ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
 		}
+	}
 
-		extmax = kernel_data.curve.maximum_width;
-		lcg_state = lcg_state_init(&rng, &state, 0x51633e2d);
+	if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+		return;
 	}
 
-	bool hit = scene_intersect(kg, ray, visibility, isect, &lcg_state, difl, extmax);
-#else
-	bool hit = scene_intersect(kg, ray, visibility, isect, NULL, 0.0f, 0.0f);
-#endif
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	Ray ray = kernel_split_state.ray[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 
-#ifdef __KERNEL_DEBUG__
-	if(state.flag & PATH_RAY_CAMERA) {
-		debug_data->num_bvh_traversed_nodes += isect->num_traversed_nodes;
-		debug_data->num_bvh_traversed_instances += isect->num_traversed_instances;
-		debug_data->num_bvh_intersections += isect->num_intersections;
-	}
-	debug_data->num_ray_bounces++;
-#endif
+	Intersection isect;
+	bool hit = kernel_path_scene_intersect(kg, state, &ray, &isect, L);
+	kernel_split_state.isect[ray_index] = isect;
 
 	if(!hit) {
 		/* Change the state of rays that hit the background;
 		 * These rays undergo special processing in the
 		 * background_bufferUpdate kernel.
 		 */
-		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
+		ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND);
 	}
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
index cef64bf5f36..2bc2d300699 100644
--- a/intern/cycles/kernel/split/kernel_shader_eval.h
+++ b/intern/cycles/kernel/split/kernel_shader_eval.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011-2015 Blender Foundation
+ * Copyright 2011-2017 Blender Foundation
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,57 +14,53 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_shader_eval kernel
- * This kernel is the 5th kernel in the ray tracing logic. This is
- * the 4rd kernel in path iteration. This kernel sets up the ShaderData
- * structure from the values computed by the previous kernels. It also identifies
- * the rays of state RAY_TO_REGENERATE and enqueues them in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
- *
- * The input and output of the kernel is as follows,
- * rng_coop -------------------------------------------|--- kernel_shader_eval --|--- sd
- * Ray_coop -------------------------------------------|                         |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * PathState_coop -------------------------------------|                         |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Intersection_coop ----------------------------------|                         |
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS)-------|                         |
- * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)---|                         |
- * ray_state ------------------------------------------|                         |
- * kg (globals) ---------------------------------------|                         |
- * queuesize ------------------------------------------|                         |
- *
- * Note on Queues :
- * This kernel reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
- * only the rays of state RAY_ACTIVE;
- * State of queues when this kernel is called,
- * at entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
- * at exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays
+/* This kernel evaluates the surface shader on the ShaderData structure
+ * that was set up by the previous kernels.
  */
-ccl_device void kernel_shader_eval(
-        KernelGlobals *kg,
-        ShaderData *sd,                        /* Output ShaderData structure to be filled */
-        ccl_global uint *rng_coop,             /* Required for rbsdf calculation */
-        ccl_global Ray *Ray_coop,              /* Required for setting up shader from ray */
-        ccl_global PathState *PathState_coop,  /* Required for all functions in this kernel */
-        Intersection *Intersection_coop,       /* Required for setting up shader from ray */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int ray_index)
+ccl_device void kernel_shader_eval(KernelGlobals *kg)
 {
+	/* Flatten the 2D global work-item id into a linear queue slot. */
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	/* Sorting is not implemented for the CUDA split kernel, so the unsorted queue is read there. */
+#ifdef __KERNEL_CUDA__
+	int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
+#else
+	int queue_index = kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS];
+#endif
+	if(ray_index >= queue_index) {  /* This work-item is past the end of the queue. */
+		return;
+	}
+	ray_index = get_ray_index(kg, ray_index,
+#ifdef __KERNEL_CUDA__
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+#else
+	                          QUEUE_SHADER_SORTED_RAYS,
+#endif
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+	if(ray_index == QUEUE_EMPTY_SLOT) {  /* The queue slot holds no ray. */
+		return;
+	}
+
+	ccl_global char *ray_state = kernel_split_state.ray_state;
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		Intersection *isect = &Intersection_coop[ray_index];
-		ccl_global uint *rng = &rng_coop[ray_index];
-		ccl_global PathState *state = &PathState_coop[ray_index];
-		Ray ray = Ray_coop[ray_index];
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 
-		shader_setup_from_ray(kg,
-		                      sd,
-		                      isect,
-		                      &ray);
-		float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF);
-		shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
+		shader_eval_surface(kg, kernel_split_sd(sd, ray_index), state, state->flag);
+#ifdef __BRANCHED_PATH__
+		if(kernel_data.integrator.branched) {  /* Branched integrator: merge the closures instead. */
+			shader_merge_closures(kernel_split_sd(sd, ray_index));
+		}
+		else
+#endif
+		{
+			shader_prepare_closures(kernel_split_sd(sd, ray_index), state);
+		}
 	}
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_setup.h b/intern/cycles/kernel/split/kernel_shader_setup.h
new file mode 100644
index 00000000000..ea3ec2ec83f
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shader_setup.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel sets up the ShaderData structure from the values computed
+ * by the previous kernels.
+ *
+ * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them
+ * in the QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+ */
+ccl_device void kernel_shader_setup(KernelGlobals *kg,
+                                    ccl_local_param unsigned int *local_queue_atomics)
+{
+	/* Enqueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
+	/* Do not return early: enqueue_ray_index_local() below shares local_queue_atomics
+	 * across the work-group, so every work-item must reach it, even without a ray. */
+	if(ray_index < queue_index) {
+		ray_index = get_ray_index(kg, ray_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          kernel_split_state.queue_data,
+		                          kernel_split_params.queue_size,
+		                          0);
+	}
+	else {
+		ray_index = QUEUE_EMPTY_SLOT;
+	}
+
+	char enqueue_flag = (ray_index != QUEUE_EMPTY_SLOT && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+
+	/* Continue on with shader evaluation; work-items without a ray have nothing left to do. */
+	if(ray_index != QUEUE_EMPTY_SLOT && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+		Intersection isect = kernel_split_state.isect[ray_index];
+		Ray ray = kernel_split_state.ray[ray_index];
+		ShaderData *sd = kernel_split_sd(sd, ray_index);
+
+		shader_setup_from_ray(kg,
+		                      sd,
+		                      &isect,
+		                      &ray);
+
+#ifdef __VOLUME__
+		if(sd->flag & SD_HAS_ONLY_VOLUME) {
+			ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HAS_ONLY_VOLUME);
+		}
+#endif
+	}
+
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h
new file mode 100644
index 00000000000..2132c42220f
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shader_sort.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Copy the active rays of QUEUE_ACTIVE_AND_REGENERATED_RAYS into QUEUE_SHADER_SORTED_RAYS, keyed by shader per SHADER_SORT_BLOCK_SIZE block (sorting runs on OpenCL only; the body is empty on CUDA). */
+ccl_device void kernel_shader_sort(KernelGlobals *kg,
+                                   ccl_local_param ShaderSortLocals *locals)
+{
+#ifndef __KERNEL_CUDA__
+	int tid = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	uint qsize = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
+	if(tid == 0) {  /* A single work-item publishes the length of the sorted queue. */
+		kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS] = qsize;
+	}
+
+	uint offset = (tid/SHADER_SORT_LOCAL_SIZE)*SHADER_SORT_BLOCK_SIZE;  /* first queue slot of this block */
+	if(offset >= qsize) {
+		return;
+	}
+
+	int lid = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
+	uint input = QUEUE_ACTIVE_AND_REGENERATED_RAYS * (kernel_split_params.queue_size);  /* base of the source queue in queue_data */
+	uint output = QUEUE_SHADER_SORTED_RAYS * (kernel_split_params.queue_size);  /* base of the destination queue in queue_data */
+	ccl_local uint *local_value = &locals->local_value[0];
+	ccl_local ushort *local_index = &locals->local_index[0];
+
+	/* copy to local memory */
+	for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+		uint idx = offset + i + lid;
+		uint add = input + idx;
+		uint value = (~0);  /* sentinel key for slots without an active ray */
+		if(idx < qsize) {
+			int ray_index = kernel_split_state.queue_data[add];
+			bool valid = (ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
+			if(valid) {
+				value = kernel_split_sd(sd, ray_index)->shader & SHADER_MASK;
+			}
+		}
+		local_value[i + lid] = value;
+		local_index[i + lid] = i + lid;  /* start from the identity permutation */
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	/* skip sorting for cpu split kernel */
+#  ifdef __KERNEL_OPENCL__
+
+	/* bitonic sort of local_index by the keys in local_value; the keys themselves are never moved */
+	for(uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) {
+		for(uint inc = length; inc > 0; inc >>= 1) {
+			for(uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) {
+				uint i = lid + ii;
+				bool direction = ((i & (length << 1)) != 0);
+				uint j = i ^ inc;
+				ushort ioff = local_index[i];
+				ushort joff = local_index[j];
+				uint iKey = local_value[ioff];
+				uint jKey = local_value[joff];
+				bool smaller = (jKey < iKey) || (jKey == iKey && j < i);  /* equal keys are ordered by position */
+				bool swap = smaller ^ (j < i) ^ direction;
+				ccl_barrier(CCL_LOCAL_MEM_FENCE);
+				local_index[i] = (swap) ? joff : ioff;
+				local_index[j] = (swap) ? ioff : joff;
+				ccl_barrier(CCL_LOCAL_MEM_FENCE);
+			}
+		}
+	}
+#  endif /* __KERNEL_OPENCL__ */
+
+	/* copy to destination */
+	for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+		uint idx = offset + i + lid;
+		uint lidx = local_index[i + lid];
+		uint outi = output + idx;
+		uint ini = input + offset + lidx;
+		uint value = local_value[lidx];
+		if(idx < qsize) {  /* slots that carry the sentinel key become QUEUE_EMPTY_SLOT */
+			kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT : kernel_split_state.queue_data[ini];
+		}
+	}
+#endif /* __KERNEL_CUDA__ */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked.h b/intern/cycles/kernel/split/kernel_shadow_blocked.h
deleted file mode 100644
index 6153af47f96..00000000000
--- a/intern/cycles/kernel/split/kernel_shadow_blocked.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_split_common.h"
-
-/* Note on kernel_shadow_blocked kernel.
- * This is the ninth kernel in the ray tracing logic. This is the eighth
- * of the path iteration kernels. This kernel takes care of "shadow ray cast"
- * logic of the direct lighting and AO  part of ray tracing.
- *
- * The input and output are as follows,
- *
- * PathState_coop ----------------------------------|--- kernel_shadow_blocked --|
- * LightRay_dl_coop --------------------------------|                            |--- LightRay_dl_coop
- * LightRay_ao_coop --------------------------------|                            |--- LightRay_ao_coop
- * ray_state ---------------------------------------|                            |--- ray_state
- * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS &       |                            |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_AO_RAYS)
-              QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                            |
- * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS&
-              QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                            |
- * kg (globals) ------------------------------------|                            |
- * queuesize ---------------------------------------|                            |
- *
- * Note on sd_shadow : sd_shadow is neither input nor output to this kernel. sd_shadow is filled and consumed in this kernel itself.
- * Note on queues :
- * The kernel fetches from QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS queues. We will empty
- * these queues this kernel.
- * State of queues when this kernel is called :
- * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
- * before and after this kernel call.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS will be filled with rays marked with flags RAY_SHADOW_RAY_CAST_AO
- * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit.
- */
-ccl_device void kernel_shadow_blocked(
-        KernelGlobals *kg,
-        ccl_global PathState *PathState_coop,  /* Required for shadow blocked */
-        ccl_global Ray *LightRay_dl_coop,      /* Required for direct lighting's shadow blocked */
-        ccl_global Ray *LightRay_ao_coop,      /* Required for AO's shadow blocked */
-        ccl_global char *ray_state,
-        char shadow_blocked_type,
-        int ray_index)
-{
-	/* Flag determining if we need to update L. */
-	char update_path_radiance = 0;
-
-	if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
-	   IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
-	{
-		ccl_global PathState *state = &PathState_coop[ray_index];
-		ccl_global Ray *light_ray_dl_global = &LightRay_dl_coop[ray_index];
-		ccl_global Ray *light_ray_ao_global = &LightRay_ao_coop[ray_index];
-
-		ccl_global Ray *light_ray_global =
-		        shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO
-		                ? light_ray_ao_global
-		                : light_ray_dl_global;
-
-		float3 shadow;
-		update_path_radiance = !(shadow_blocked(kg,
-		                                        kg->sd_input,
-		                                        state,
-		                                        light_ray_global,
-		                                        &shadow));
-
-		/* We use light_ray_global's P and t to store shadow and
-		 * update_path_radiance.
-		 */
-		light_ray_global->P = shadow;
-		light_ray_global->t = update_path_radiance;
-	}
-}
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
new file mode 100644
index 00000000000..a4cffd77eff
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Shadow ray cast for AO: runs the AO pass for each ray queued in QUEUE_SHADOW_RAY_CAST_AO_RAYS. */
+ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
+{
+	unsigned int ao_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+	/* Work-items past the end of the queue keep QUEUE_EMPTY_SLOT and exit below. */
+	int ray_index = QUEUE_EMPTY_SLOT;
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(thread_index < ao_queue_length) {
+		ray_index = get_ray_index(kg, thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS,
+		                          kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
+	}
+
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+
+	ShaderData *sd = kernel_split_sd(sd, ray_index);
+	ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	float3 throughput = kernel_split_state.throughput[ray_index];
+	/* Use the regular AO path unless the integrator is branched and the ray is not flagged RAY_BRANCHED_INDIRECT. */
+#ifdef __BRANCHED_PATH__
+	if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#endif
+		kernel_path_ao(kg, sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, sd));
+#ifdef __BRANCHED_PATH__
+	}
+	else {
+		kernel_branched_path_ao(kg, sd, emission_sd, L, state, throughput);
+	}
+#endif
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
new file mode 100644
index 00000000000..da072fd5f1a
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Shadow ray cast for direct lighting: traces the light ray of each ray queued in QUEUE_SHADOW_RAY_CAST_DL_RAYS and accumulates the result into its PathRadiance. */
+ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
+{
+	unsigned int dl_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = QUEUE_EMPTY_SLOT;  /* stays empty for work-items past the end of the queue */
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(thread_index < dl_queue_length) {
+		ray_index = get_ray_index(kg, thread_index, QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+		                          kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
+	}
+
+#ifdef __BRANCHED_PATH__
+	/* TODO(mai): move this somewhere else? */
+	if(thread_index == 0) {
+		/* Clear QUEUE_INACTIVE_RAYS before next kernel. */
+		kernel_split_params.queue_index[QUEUE_INACTIVE_RAYS] = 0;
+	}
+#endif  /* __BRANCHED_PATH__ */
+
+	if(ray_index == QUEUE_EMPTY_SLOT)
+		return;
+
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	Ray ray = kernel_split_state.light_ray[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ShaderData *sd = kernel_split_sd(sd, ray_index);
+	float3 throughput = kernel_split_state.throughput[ray_index];
+
+	BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
+	ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
+	bool is_lamp = kernel_split_state.is_lamp[ray_index];
+
+#  if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)
+	bool use_branched = false;
+	int all = 0;  /* passed as the last argument of the branched light connection below */
+
+	if(state->flag & PATH_RAY_SHADOW_CATCHER) {  /* shadow catcher rays always take the branched connection */
+		use_branched = true;
+		all = 1;
+	}
+#    if defined(__BRANCHED_PATH__)
+	else if(kernel_data.integrator.branched) {
+		use_branched = true;
+
+		if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+			all = (kernel_data.integrator.sample_all_lights_indirect);
+		}
+		else
+		{
+			all = (kernel_data.integrator.sample_all_lights_direct);
+		}
+	}
+#    endif  /* __BRANCHED_PATH__ */
+
+	if(use_branched) {
+		kernel_branched_path_surface_connect_light(kg,
+		                                           sd,
+		                                           emission_sd,
+		                                           state,
+		                                           throughput,
+		                                           1.0f,
+		                                           L,
+		                                           all);
+	}
+	else
+#  endif  /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)*/
+	{
+		/* trace shadow ray */
+		float3 shadow;
+
+		if(!shadow_blocked(kg,
+		                   sd,
+		                   emission_sd,
+		                   state,
+		                   &ray,
+		                   &shadow))
+		{
+			/* accumulate */
+			path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
+		}
+		else {  /* blocked: only the total-light accumulation is updated */
+			path_radiance_accum_total_light(L, state, throughput, &L_light);
+		}
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
index 2135ee22b2e..b52e7bddc82 100644
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -17,48 +17,83 @@
 #ifndef  __KERNEL_SPLIT_H__
 #define  __KERNEL_SPLIT_H__
 
-#include "kernel_compat_opencl.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_image_opencl.h"
-
-#include "util_atomic.h"
-
-#include "kernel_random.h"
-#include "kernel_projection.h"
-#include "kernel_montecarlo.h"
-#include "kernel_differential.h"
-#include "kernel_camera.h"
-
-#include "geom/geom.h"
-#include "bvh/bvh.h"
-
-#include "kernel_accumulate.h"
-#include "kernel_shader.h"
-#include "kernel_light.h"
-#include "kernel_passes.h"
-
-#ifdef __SUBSURFACE__
-#include "kernel_subsurface.h"
+#include "kernel/kernel_math.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/split/kernel_split_data.h"
+
+#include "kernel/kernel_globals.h"
+
+#ifdef __OSL__
+#  include "kernel/osl/osl_shader.h"
 #endif
 
-#ifdef __VOLUME__
-#include "kernel_volume.h"
+#ifdef __KERNEL_OPENCL__
+#  include "kernel/kernels/opencl/kernel_opencl_image.h"
+#endif
+#ifdef __KERNEL_CUDA__
+#  include "kernel/kernels/cuda/kernel_cuda_image.h"
+#endif
+#ifdef __KERNEL_CPU__
+#  include "kernel/kernels/cpu/kernel_cpu_image.h"
 #endif
 
-#include "kernel_path_state.h"
-#include "kernel_shadow.h"
-#include "kernel_emission.h"
-#include "kernel_path_common.h"
-#include "kernel_path_surface.h"
-#include "kernel_path_volume.h"
+#include "util/util_atomic.h"
+
+#include "kernel/kernel_path.h"
+#ifdef __BRANCHED_PATH__
+#  include "kernel/kernel_path_branched.h"
+#endif
+
+#include "kernel/kernel_queues.h"
+#include "kernel/kernel_work_stealing.h"
+
+#ifdef __BRANCHED_PATH__
+#  include "kernel/split/kernel_branched.h"
+#endif
+
+CCL_NAMESPACE_BEGIN
+/* End the path of ray_index by moving it to the ray state that matches its branched-path flags. */
+ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index)
+{
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+
+#ifdef __BRANCHED_PATH__
+#  ifdef __SUBSURFACE__
+	/* ss_rays is only part of the split state when __SUBSURFACE__ is enabled. */
+	ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+	if(ss_indirect->num_rays) {
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+	}
+	else
+#  endif  /* __SUBSURFACE__ */
+	if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT_SHARED)) {
+		int orig_ray = kernel_split_state.branched_state[ray_index].original_ray;
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+		PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray];
+		path_radiance_sum_indirect(L);
+		path_radiance_accum_sample(orig_ray_L, L);
+		atomic_fetch_and_dec_uint32((ccl_global uint*)&kernel_split_state.branched_state[orig_ray].shared_sample_count);
 
-#ifdef __KERNEL_DEBUG__
-#include "kernel_debug.h"
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+	}
+	else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) {
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER);
+	}
+	else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) {
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER);
+	}
+	else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) {
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER);
+	}
+	else {
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+	}
+#else
+	ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
 #endif
+}
 
-#include "kernel_queues.h"
-#include "kernel_work_stealing.h"
+CCL_NAMESPACE_END
 
 #endif  /* __KERNEL_SPLIT_H__ */
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
new file mode 100644
index 00000000000..9297e1e0ad5
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_split_data.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_SPLIT_DATA_H__
+#define __KERNEL_SPLIT_DATA_H__
+
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
+
+CCL_NAMESPACE_BEGIN
+/* Return the number of bytes needed for the split state buffer holding num_elements elements of every entry. */
+ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements)
+{
+	(void)kg;  /* Unused on CPU. */
+
+	uint64_t size = 0;
+#define SPLIT_DATA_ENTRY(type, name, num) + align_up(num_elements * num * sizeof(type), 16)
+	size = size SPLIT_DATA_ENTRIES;  /* expands to one "+ align_up(...)" term per entry */
+#undef SPLIT_DATA_ENTRY
+	/* Extra bytes reserved per ShaderData for max_closures-1 ShaderClosure entries. */
+	uint64_t closure_size = sizeof(ShaderClosure) * (kernel_data.integrator.max_closures-1);
+
+#ifdef __BRANCHED_PATH__
+	size += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);  /* second ShaderData array, branched path only */
+#endif
+
+	size += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);  /* main ShaderData array */
+
+	return size;
+}
+/* Point each member of split_data at its slice of the buffer at data, using the same layout as split_data_buffer_size(). */
+ccl_device_inline void split_data_init(KernelGlobals *kg,
+                                       ccl_global SplitData *split_data,
+                                       size_t num_elements,
+                                       ccl_global void *data,
+                                       ccl_global char *ray_state)
+{
+	(void)kg;  /* Unused on CPU. */
+
+	ccl_global char *p = (ccl_global char*)data;  /* running write position inside the buffer */
+
+#define SPLIT_DATA_ENTRY(type, name, num) \
+	split_data->name = (type*)p; p += align_up(num_elements * num * sizeof(type), 16);
+	SPLIT_DATA_ENTRIES;
+#undef SPLIT_DATA_ENTRY
+	/* The ShaderData arrays come last; each element carries closure_size extra bytes. */
+	uint64_t closure_size = sizeof(ShaderClosure) * (kernel_data.integrator.max_closures-1);
+
+#ifdef __BRANCHED_PATH__
+	split_data->_branched_state_sd = (ShaderData*)p;
+	p += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
+#endif
+
+	split_data->_sd = (ShaderData*)p;
+	p += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
+
+	split_data->ray_state = ray_state;  /* ray_state is passed in separately, not carved from data */
+}
+
+CCL_NAMESPACE_END
+
+#endif  /* __KERNEL_SPLIT_DATA_H__ */
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
new file mode 100644
index 00000000000..56194d9f857
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_SPLIT_DATA_TYPES_H__
+#define __KERNEL_SPLIT_DATA_TYPES_H__
+
+CCL_NAMESPACE_BEGIN
+
+/* Parameters used by the split kernels; a single struct is used to avoid passing these to each kernel. */
+
+typedef struct SplitParams {
+	WorkTile tile;
+	uint total_work_size;
+
+	ccl_global unsigned int *work_pools;
+
+	ccl_global int *queue_index;  /* per-queue entry count, indexed by queue id */
+	int queue_size;  /* stride between consecutive queues inside queue_data */
+	ccl_global char *use_queues_flag;
+
+	/* Place for storing sd->flag. AMD GPU OpenCL compiler workaround */
+	int dummy_sd_flag;
+} SplitParams;
+
+/* Global memory variables [porting]: this memory is used for
+ * co-operation between different kernels. Data written by one
+ * kernel is made available to other kernels via this global
+ * memory.
+ */
+
+/* SPLIT_DATA_ENTRY(type, name, num) */
+
+#ifdef __BRANCHED_PATH__
+
+typedef ccl_global struct SplitBranchedState {
+	/* various state that must be kept and restored after an indirect loop */
+	PathState path_state;
+	float3 throughput;
+	Ray ray;
+
+	Intersection isect;
+
+	char ray_state;
+
+	/* indirect loop state */
+	int next_closure;
+	int next_sample;
+	/* subsurface indirect loop state */
+#ifdef __SUBSURFACE__
+	int ss_next_closure;  /* closure index at which the subsurface indirect loop resumes */
+	int ss_next_sample;
+	int next_hit;
+	int num_hits;
+
+	uint lcg_state;
+	LocalIntersection ss_isect;
+#endif  /*__SUBSURFACE__ */
+
+	int shared_sample_count; /* number of branched samples shared with other threads */
+	int original_ray; /* index of original ray when sharing branched samples */
+	bool waiting_on_shared_samples;
+} SplitBranchedState;
+
+#define SPLIT_DATA_BRANCHED_ENTRIES \
+	SPLIT_DATA_ENTRY( SplitBranchedState, branched_state, 1) \
+	SPLIT_DATA_ENTRY(ShaderData, _branched_state_sd, 0)
+#else
+#define SPLIT_DATA_BRANCHED_ENTRIES
+#endif  /* __BRANCHED_PATH__ */
+
+#ifdef __SUBSURFACE__
+#  define SPLIT_DATA_SUBSURFACE_ENTRIES \
+	SPLIT_DATA_ENTRY(ccl_global SubsurfaceIndirectRays, ss_rays, 1)
+#else
+#  define SPLIT_DATA_SUBSURFACE_ENTRIES
+#endif /* __SUBSURFACE__ */
+
+#ifdef __VOLUME__
+#  define SPLIT_DATA_VOLUME_ENTRIES \
+	SPLIT_DATA_ENTRY(ccl_global PathState, state_shadow, 1)
+#else
+#  define SPLIT_DATA_VOLUME_ENTRIES
+#endif /* __VOLUME__ */
+
+#define SPLIT_DATA_ENTRIES \
+	SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
+	SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
+	SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
+	SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
+	SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
+	SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \
+	SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \
+	SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \
+	SPLIT_DATA_SUBSURFACE_ENTRIES \
+	SPLIT_DATA_VOLUME_ENTRIES \
+	SPLIT_DATA_BRANCHED_ENTRIES \
+	SPLIT_DATA_ENTRY(ShaderData, _sd, 0)
+
+/* entries to be copied to inactive rays when sharing branched samples (TODO: which are actually needed?) */
+#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \
+	SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
+	SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
+	SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
+	SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
+	SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
+	SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \
+	SPLIT_DATA_SUBSURFACE_ENTRIES \
+	SPLIT_DATA_VOLUME_ENTRIES \
+	SPLIT_DATA_BRANCHED_ENTRIES \
+	SPLIT_DATA_ENTRY(ShaderData, _sd, 0)
+
+/* Struct that holds pointers to data in the shared state buffer. */
+typedef struct SplitData {
+#define SPLIT_DATA_ENTRY(type, name, num) type *name;
+	SPLIT_DATA_ENTRIES  /* expands to one pointer member per SPLIT_DATA_ENTRY */
+#undef SPLIT_DATA_ENTRY
+
+	/* this is actually in a separate buffer from the rest of the split state data (so it can be read back from
+	 * the host easily) but is still used the same as the other data so we have it here in this struct as well
+	 */
+	ccl_global char *ray_state;
+} SplitData;
+
+#ifndef __KERNEL_CUDA__
+#  define kernel_split_state (kg->split_data)
+#  define kernel_split_params (kg->split_param_data)
+#else
+__device__ SplitData __split_data;
+#  define kernel_split_state (__split_data)
+__device__ SplitParams __split_param_data;
+#  define kernel_split_params (__split_param_data)
+#endif  /* __KERNEL_CUDA__ */
+
+#define kernel_split_sd(sd, ray_index) ((ShaderData*) \
+	( \
+		((ccl_global char*)kernel_split_state._##sd) + \
+		(sizeof(ShaderData) + sizeof(ShaderClosure)*(kernel_data.integrator.max_closures-1)) * (ray_index) \
+	))
+
+/* Local storage for queue_enqueue kernel. */
+typedef struct QueueEnqueueLocals {
+	uint queue_atomics[2];
+} QueueEnqueueLocals;
+
+/* Local storage for holdout_emission_blurring_pathtermination_ao kernel. */
+typedef struct BackgroundAOLocals {
+	uint queue_atomics_bg;
+	uint queue_atomics_ao;
+} BackgroundAOLocals;
+/* Local storage for shader_sort kernel. */
+typedef struct ShaderSortLocals {
+	uint local_value[SHADER_SORT_BLOCK_SIZE];  /* sort key of each slot in the block */
+	ushort local_index[SHADER_SORT_BLOCK_SIZE];  /* permutation of the block's slots that the sort reorders */
+} ShaderSortLocals;
+
+CCL_NAMESPACE_END
+
+#endif  /* __KERNEL_SPLIT_DATA_TYPES_H__ */
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
new file mode 100644
index 00000000000..af0303d8608
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
@@ -0,0 +1,287 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__)
+/* Resets the per-ray subsurface loop cursors in branched_state and sets RAY_BRANCHED_SUBSURFACE_INDIRECT on the ray. */
+ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg, int ray_index)
+{
+	kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+	branched_state->ss_next_closure = 0;  /* start index of the closure loop in ..._indirect_light_iter */
+	branched_state->ss_next_sample = 0;  /* start index of the sample loop in ..._indirect_light_iter */
+
+	branched_state->num_hits = 0;
+	branched_state->next_hit = 0;  /* start index of the hit loop in ..._indirect_light_iter */
+
+	ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT);
+}
+/* Runs/resumes the closure, sample and hit loops for one ray. Returns true on early exit (cursors saved, or waiting on shared samples), else false. */
+ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(KernelGlobals *kg, int ray_index)
+{
+	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+	ShaderData *sd = kernel_split_sd(branched_state_sd, ray_index);
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
+
+	for(int i = branched_state->ss_next_closure; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
+
+		if(!CLOSURE_IS_BSSRDF(sc->type))
+			continue;
+
+		/* set up random number generator, only while the sample, hit and inner closure/sample cursors are all zero */
+		if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 &&
+		   branched_state->next_closure == 0 && branched_state->next_sample == 0)
+		{
+			branched_state->lcg_state = lcg_state_init_addrspace(&branched_state->path_state,
+			                                                     0x68bc21eb);
+		}
+		int num_samples = kernel_data.integrator.subsurface_samples * 3;
+		float num_samples_inv = 1.0f/num_samples;
+		uint bssrdf_rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
+
+		/* do subsurface scatter step with copy of shader data, this will
+		 * replace the BSSRDF with a diffuse BSDF closure */
+		for(int j = branched_state->ss_next_sample; j < num_samples; j++) {
+			ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index];
+			*hit_state = branched_state->path_state;
+			hit_state->rng_hash = bssrdf_rng_hash;
+			path_state_branch(hit_state, j, num_samples);
+
+			ccl_global LocalIntersection *ss_isect = &branched_state->ss_isect;
+			float bssrdf_u, bssrdf_v;
+			path_branched_rng_2D(kg,
+			                     bssrdf_rng_hash,
+			                     hit_state,
+			                     j,
+			                     num_samples,
+			                     PRNG_BSDF_U,
+			                     &bssrdf_u,
+			                     &bssrdf_v);
+
+			/* intersection is expensive so avoid doing multiple times for the same input */
+			if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+				uint lcg_state = branched_state->lcg_state;
+				LocalIntersection ss_isect_private;
+
+				branched_state->num_hits = subsurface_scatter_multi_intersect(kg,
+				                                                              &ss_isect_private,
+				                                                              sd,
+				                                                              hit_state,
+				                                                              sc,
+				                                                              &lcg_state,
+				                                                              bssrdf_u, bssrdf_v,
+				                                                              true);
+
+				branched_state->lcg_state = lcg_state;
+				*ss_isect = ss_isect_private;  /* store the hits in branched_state so a later call can reuse them */
+			}
+
+			hit_state->rng_offset += PRNG_BOUNCE_NUM;
+
+#ifdef __VOLUME__
+			Ray volume_ray = branched_state->ray;
+			bool need_update_volume_stack =
+			        kernel_data.integrator.use_volumes &&
+			        sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
+#endif  /* __VOLUME__ */
+
+			/* compute lighting with the BSDF closure */
+			for(int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) {
+				ShaderData *bssrdf_sd = kernel_split_sd(sd, ray_index);
+				*bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is
+				                   * important as the indirect path will write into bssrdf_sd */
+
+				LocalIntersection ss_isect_private = *ss_isect;
+				subsurface_scatter_multi_setup(kg,
+				                               &ss_isect_private,
+				                               hit,
+				                               bssrdf_sd,
+				                               hit_state,
+				                               sc);
+				*ss_isect = ss_isect_private;
+
+#ifdef __VOLUME__
+				if(need_update_volume_stack) {
+					/* Setup ray from previous surface point to the new one. */
+					float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng);
+					volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t);
+
+					for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+						hit_state->volume_stack[k] = branched_state->path_state.volume_stack[k];
+					}
+
+					kernel_volume_stack_update_for_subsurface(kg,
+					                                          emission_sd,
+					                                          &volume_ray,
+					                                          hit_state->volume_stack);
+				}
+#endif  /* __VOLUME__ */
+
+#ifdef __EMISSION__
+				if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+					/* direct light */
+					if(kernel_data.integrator.use_direct_light) {
+						int all = (kernel_data.integrator.sample_all_lights_direct) ||
+							      (hit_state->flag & PATH_RAY_SHADOW_CATCHER);
+						kernel_branched_path_surface_connect_light(kg,
+						                                           bssrdf_sd,
+						                                           emission_sd,
+						                                           hit_state,
+						                                           branched_state->throughput,
+						                                           num_samples_inv,
+						                                           L,
+						                                           all);
+					}
+				}
+#endif  /* __EMISSION__ */
+
+				/* indirect light */
+				if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+				                                                          ray_index,
+				                                                          num_samples_inv,
+				                                                          bssrdf_sd,
+				                                                          false,
+				                                                          false))
+				{
+					branched_state->ss_next_closure = i;  /* save the three loop cursors so the next call resumes here */
+					branched_state->ss_next_sample = j;
+					branched_state->next_hit = hit;
+
+					return true;
+				}
+
+				branched_state->next_closure = 0;
+			}
+
+			branched_state->next_hit = 0;  /* hit loop finished: the next sample starts at hit 0 */
+		}
+
+		branched_state->ss_next_sample = 0;  /* sample loop finished: the next closure starts at sample 0 */
+	}
+
+	branched_state->ss_next_closure = sd->num_closure;  /* a later call skips the closure loop entirely */
+
+	branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+	if(branched_state->waiting_on_shared_samples) {
+		return true;
+	}
+
+	kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+	return false;
+}
+
+#endif  /* __BRANCHED_PATH__ && __SUBSURFACE__ */
+/* Split kernel stage: scatters RAY_ACTIVE rays whose shader data has SD_BSSRDF, then resumes queued branched subsurface iterations. */
+ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
+{
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(thread_index == 0) {
+		/* We will empty both queues in this kernel. */
+		kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+		kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+	}
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);  /* same expression as thread_index */
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          1);
+	get_ray_index(kg, thread_index,
+	              QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	              kernel_split_state.queue_data,
+	              kernel_split_params.queue_size,
+	              1);  /* result unused; NOTE(review): presumably called only for its effect on the queue - confirm in get_ray_index */
+
+#ifdef __SUBSURFACE__
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+		ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+		ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+		ShaderData *sd = kernel_split_sd(sd, ray_index);
+		ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
+
+		if(sd->flag & SD_BSSRDF) {
+
+#ifdef __BRANCHED_PATH__
+			if(!kernel_data.integrator.branched ||
+			   IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT))
+			{
+#endif
+				if(kernel_path_subsurface_scatter(kg,
+				                                  sd,
+				                                  emission_sd,
+				                                  L,
+				                                  state,
+				                                  ray,
+				                                  throughput,
+				                                  ss_indirect))
+				{
+					kernel_split_path_end(kg, ray_index);
+				}
+#ifdef __BRANCHED_PATH__
+			}
+			else {
+				kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index);
+
+				if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);  /* iter returned true (early exit) */
+				}
+			}
+#endif
+		}
+	}
+
+#  ifdef __BRANCHED_PATH__
+	if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+		kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0;
+	}
+
+	/* iter loop: ray_index is re-fetched from the subsurface indirect iteration queue */
+	ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+	                          QUEUE_SUBSURFACE_INDIRECT_ITER,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          1);
+
+	if(IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) {
+		/* for render passes, sum and reset indirect light pass variables
+		 * for the next samples */
+		path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
+		path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
+
+		if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);  /* iter returned true (early exit) */
+		}
+	}
+#  endif  /* __BRANCHED_PATH__ */
+
+#endif  /* __SUBSURFACE__ */
+
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h
deleted file mode 100644
index a21e9b6a0b1..00000000000
--- a/intern/cycles/kernel/split/kernel_sum_all_radiance.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../kernel_compat_opencl.h"
-#include "../kernel_math.h"
-#include "../kernel_types.h"
-#include "../kernel_globals.h"
-
-/* Since we process various samples in parallel; The output radiance of different samples
- * are stored in different locations; This kernel combines the output radiance contributed
- * by all different samples and stores them in the RenderTile's output buffer.
- */
-ccl_device void kernel_sum_all_radiance(
-        ccl_constant KernelData *data,               /* To get pass_stride to offet into buffer */
-        ccl_global float *buffer,                    /* Output buffer of RenderTile */
-        ccl_global float *per_sample_output_buffer,  /* Radiance contributed by all samples */
-        int parallel_samples, int sw, int sh, int stride,
-        int buffer_offset_x,
-        int buffer_offset_y,
-        int buffer_stride,
-        int start_sample)
-{
-	int x = get_global_id(0);
-	int y = get_global_id(1);
-
-	if(x < sw && y < sh) {
-		buffer += ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * (data->film.pass_stride);
-		per_sample_output_buffer += ((x + y * stride) * parallel_samples) * (data->film.pass_stride);
-
-		int sample_stride = (data->film.pass_stride);
-
-		int sample_iterator = 0;
-		int pass_stride_iterator = 0;
-		int num_floats = data->film.pass_stride;
-
-		for(sample_iterator = 0; sample_iterator < parallel_samples; sample_iterator++) {
-			for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) {
-				*(buffer + pass_stride_iterator) =
-				        (start_sample == 0 && sample_iterator == 0)
-				                ? *(per_sample_output_buffer + pass_stride_iterator)
-				                : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator);
-			}
-			per_sample_output_buffer += sample_stride;
-		}
-	}
-}
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 88ec7fe6fcc..39cd5da7b12 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -30,8 +30,7 @@
  * in local memory on the GPU, as it would take too many register and indexes in
  * ways not known at compile time. This seems the only solution even though it
  * may be slow, with two positive factors. If the same shader is being executed,
- * memory access will be coalesced, and on fermi cards, memory will actually be
- * cached.
+ * memory access will be coalesced and cached.
  *
  * The result of shader execution will be a single closure. This means the
  * closure type, associated label, data and weight. Sampling from multiple
@@ -39,7 +38,7 @@
  * mostly taken care of in the SVM compiler.
  */
 
-#include "svm_types.h"
+#include "kernel/svm/svm_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -139,49 +138,53 @@ CCL_NAMESPACE_END
 
 /* Nodes */
 
-#include "svm_noise.h"
+#include "kernel/svm/svm_noise.h"
 #include "svm_texture.h"
 
-#include "svm_color_util.h"
-#include "svm_math_util.h"
-
-#include "svm_attribute.h"
-#include "svm_gradient.h"
-#include "svm_blackbody.h"
-#include "svm_closure.h"
-#include "svm_noisetex.h"
-#include "svm_convert.h"
-#include "svm_displace.h"
-#include "svm_fresnel.h"
-#include "svm_wireframe.h"
-#include "svm_wavelength.h"
-#include "svm_camera.h"
-#include "svm_geometry.h"
-#include "svm_hsv.h"
-#include "svm_image.h"
-#include "svm_gamma.h"
-#include "svm_brightness.h"
-#include "svm_invert.h"
-#include "svm_light_path.h"
-#include "svm_magic.h"
-#include "svm_mapping.h"
-#include "svm_normal.h"
-#include "svm_wave.h"
-#include "svm_math.h"
-#include "svm_mix.h"
-#include "svm_ramp.h"
-#include "svm_sepcomb_hsv.h"
-#include "svm_sepcomb_vector.h"
-#include "svm_musgrave.h"
-#include "svm_sky.h"
-#include "svm_tex_coord.h"
-#include "svm_value.h"
-#include "svm_voronoi.h"
-#include "svm_checker.h"
-#include "svm_brick.h"
-#include "svm_vector_transform.h"
-#include "svm_voxel.h"
-#include "svm_bump.h"
+#include "kernel/svm/svm_color_util.h"
+#include "kernel/svm/svm_math_util.h"
+
+#include "kernel/svm/svm_attribute.h"
+#include "kernel/svm/svm_gradient.h"
+#include "kernel/svm/svm_blackbody.h"
+#include "kernel/svm/svm_closure.h"
+#include "kernel/svm/svm_noisetex.h"
+#include "kernel/svm/svm_convert.h"
+#include "kernel/svm/svm_displace.h"
+#include "kernel/svm/svm_fresnel.h"
+#include "kernel/svm/svm_wireframe.h"
+#include "kernel/svm/svm_wavelength.h"
+#include "kernel/svm/svm_camera.h"
+#include "kernel/svm/svm_geometry.h"
+#include "kernel/svm/svm_hsv.h"
+#include "kernel/svm/svm_image.h"
+#include "kernel/svm/svm_gamma.h"
+#include "kernel/svm/svm_brightness.h"
+#include "kernel/svm/svm_invert.h"
+#include "kernel/svm/svm_light_path.h"
+#include "kernel/svm/svm_magic.h"
+#include "kernel/svm/svm_mapping.h"
+#include "kernel/svm/svm_normal.h"
+#include "kernel/svm/svm_wave.h"
+#include "kernel/svm/svm_math.h"
+#include "kernel/svm/svm_mix.h"
+#include "kernel/svm/svm_ramp.h"
+#include "kernel/svm/svm_sepcomb_hsv.h"
+#include "kernel/svm/svm_sepcomb_vector.h"
+#include "kernel/svm/svm_musgrave.h"
+#include "kernel/svm/svm_sky.h"
+#include "kernel/svm/svm_tex_coord.h"
+#include "kernel/svm/svm_value.h"
+#include "kernel/svm/svm_voronoi.h"
+#include "kernel/svm/svm_checker.h"
+#include "kernel/svm/svm_brick.h"
+#include "kernel/svm/svm_vector_transform.h"
+#include "kernel/svm/svm_voxel.h"
+#include "kernel/svm/svm_bump.h"
+
+#ifdef __SHADER_RAYTRACE__
+#  include "kernel/svm/svm_bevel.h"
+#endif
 
 CCL_NAMESPACE_BEGIN
 
@@ -192,7 +195,7 @@ CCL_NAMESPACE_BEGIN
 ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderType type, int path_flag)
 {
 	float stack[SVM_STACK_SIZE];
-	int offset = ccl_fetch(sd, shader) & SHADER_MASK;
+	int offset = sd->shader & SHADER_MASK;
 
 	while(1) {
 		uint4 node = read_node(kg, &offset);
@@ -207,7 +210,7 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_a
 				break;
 			}
 			case NODE_CLOSURE_BSDF:
-				svm_node_closure_bsdf(kg, sd, stack, node, path_flag, &offset);
+				svm_node_closure_bsdf(kg, sd, stack, node, type, path_flag, &offset);
 				break;
 			case NODE_CLOSURE_EMISSION:
 				svm_node_closure_emission(sd, stack, node);
@@ -263,6 +266,12 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_a
 			case NODE_SET_DISPLACEMENT:
 				svm_node_set_displacement(kg, sd, stack, node.y);
 				break;
+			case NODE_DISPLACEMENT:
+				svm_node_displacement(kg, sd, stack, node);
+				break;
+			case NODE_VECTOR_DISPLACEMENT:
+				svm_node_vector_displacement(kg, sd, stack, node, &offset);
+				break;
 #  endif  /* NODES_FEATURE(NODE_FEATURE_BUMP) */
 #  ifdef __TEXTURES__
 			case NODE_TEX_IMAGE:
@@ -325,7 +334,10 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_a
 				break;
 #  if NODES_FEATURE(NODE_FEATURE_VOLUME)
 			case NODE_CLOSURE_VOLUME:
-				svm_node_closure_volume(kg, sd, stack, node, path_flag);
+				svm_node_closure_volume(kg, sd, stack, node, type);
+				break;
+			case NODE_PRINCIPLED_VOLUME:
+				svm_node_principled_volume(kg, sd, stack, node, type, path_flag, &offset);
 				break;
 #  endif  /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
 #  ifdef __EXTRA_NODES__
@@ -460,6 +472,11 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_a
 				svm_node_tex_voxel(kg, sd, stack, node, &offset);
 				break;
 #  endif  /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
+#  ifdef __SHADER_RAYTRACE__
+			case NODE_BEVEL:
+				svm_node_bevel(kg, sd, state, stack, node);
+				break;
+#  endif  /* __SHADER_RAYTRACE__ */
 #endif  /* NODES_GROUP(NODE_GROUP_LEVEL_3) */
 			case NODE_END:
 				return;
diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h
index 0e55c99ae97..229a3f20421 100644
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@@ -27,7 +27,7 @@ ccl_device AttributeDescriptor svm_node_attr_init(KernelGlobals *kg, ShaderData
 
 	AttributeDescriptor desc;
 
-	if(ccl_fetch(sd, object) != OBJECT_NONE) {
+	if(sd->object != OBJECT_NONE) {
 		desc = find_attribute(kg, sd, node.y);
 		if(desc.offset == ATTR_STD_NOT_FOUND) {
 			desc = attribute_not_found();
diff --git a/intern/cycles/kernel/svm/svm_bevel.h b/intern/cycles/kernel/svm/svm_bevel.h
new file mode 100644
index 00000000000..89f4a98e846
--- /dev/null
+++ b/intern/cycles/kernel/svm/svm_bevel.h
@@ -0,0 +1,226 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Bevel shader averaging normals from nearby surfaces. Returns sd->N when sampling
+ * is skipped or when the weighted sum of sampled normals normalizes to zero.
+ * Sampling strategy from: BSSRDF Importance Sampling, SIGGRAPH 2013
+ * http://library.imageworks.com/pdfs/imageworks-library-BSSRDF-sampling.pdf
+ */
+
+ccl_device_noinline float3 svm_bevel(
+	KernelGlobals *kg,
+	ShaderData *sd,
+	ccl_addr_space PathState *state,
+	float radius,
+	int num_samples)
+{
+	/* Early out if no sampling needed. */
+	if(radius <= 0.0f || num_samples < 1 || sd->object == OBJECT_NONE) {
+		return sd->N;
+	}
+
+	/* Don't bevel for blurry indirect rays. */
+	if(state->min_ray_pdf < 8.0f) {
+		return sd->N;
+	}
+
+	/* Setup for multi intersection. */
+	LocalIntersection isect;
+	uint lcg_state = lcg_state_init_addrspace(state, 0x64c6a40e);
+
+	/* Sample normals from surrounding points on surface. */
+	float3 sum_N = make_float3(0.0f, 0.0f, 0.0f);
+
+	for(int sample = 0; sample < num_samples; sample++) {
+		float disk_u, disk_v;
+		path_branched_rng_2D(kg, state->rng_hash, state, sample, num_samples,
+		                     PRNG_BEVEL_U, &disk_u, &disk_v);
+
+		/* Pick random axis in local frame and point on disk. */
+		float3 disk_N, disk_T, disk_B;
+		float pick_pdf_N, pick_pdf_T, pick_pdf_B;
+
+		disk_N = sd->Ng;
+		make_orthonormals(disk_N, &disk_T, &disk_B);
+
+		float axisu = disk_u;
+
+		if(axisu < 0.5f) {  /* keep the geometric normal as disk axis, picked with probability 0.5 */
+			pick_pdf_N = 0.5f;
+			pick_pdf_T = 0.25f;
+			pick_pdf_B = 0.25f;
+			disk_u *= 2.0f;  /* remap [0, 0.5) back to [0, 1) */
+		}
+		else if(axisu < 0.75f) {  /* swap N and T, picked with probability 0.25 */
+			float3 tmp = disk_N;
+			disk_N = disk_T;
+			disk_T = tmp;
+			pick_pdf_N = 0.25f;
+			pick_pdf_T = 0.5f;
+			pick_pdf_B = 0.25f;
+			disk_u = (disk_u - 0.5f)*4.0f;  /* remap [0.5, 0.75) back to [0, 1) */
+		}
+		else {  /* swap N and B, picked with probability 0.25 */
+			float3 tmp = disk_N;
+			disk_N = disk_B;
+			disk_B = tmp;
+			pick_pdf_N = 0.25f;
+			pick_pdf_T = 0.25f;
+			pick_pdf_B = 0.5f;
+			disk_u = (disk_u - 0.75f)*4.0f;  /* remap [0.75, 1) back to [0, 1) */
+		}
+
+		/* Sample point on disk. */
+		float phi = M_2PI_F * disk_u;
+		float disk_r = disk_v;
+		float disk_height;
+
+		/* Perhaps find something better than Cubic BSSRDF, but happens to work well. */
+		bssrdf_cubic_sample(radius, 0.0f, disk_r, &disk_r, &disk_height);
+
+		float3 disk_P = (disk_r*cosf(phi)) * disk_T + (disk_r*sinf(phi)) * disk_B;
+
+		/* Create ray. */
+		Ray *ray = &isect.ray;
+		ray->P = sd->P + disk_N*disk_height + disk_P;
+		ray->D = -disk_N;
+		ray->t = 2.0f*disk_height;
+		ray->dP = sd->dP;
+		ray->dD = differential3_zero();
+		ray->time = sd->time;
+
+		/* Intersect with the same object. if multiple intersections are found it
+		 * will use at most LOCAL_MAX_HITS hits, a random subset of all hits. */
+		scene_intersect_local(kg,
+		                      *ray,
+		                      &isect,
+		                      sd->object,
+		                      &lcg_state,
+		                      LOCAL_MAX_HITS);
+
+		int num_eval_hits = min(isect.num_hits, LOCAL_MAX_HITS);
+
+		for(int hit = 0; hit < num_eval_hits; hit++) {
+			/* Quickly retrieve P and Ng without setting up ShaderData. */
+			float3 hit_P;  /* NOTE(review): stays unassigned if sd->type matches neither branch below - confirm this cannot happen */
+			if(sd->type & PRIMITIVE_TRIANGLE) {
+				hit_P = triangle_refine_local(kg,
+				                              sd,
+				                              &isect.hits[hit],
+				                              ray);
+			}
+#ifdef __OBJECT_MOTION__
+			else  if(sd->type & PRIMITIVE_MOTION_TRIANGLE) {
+				float3 verts[3];
+				motion_triangle_vertices(
+					kg,
+					sd->object,
+					kernel_tex_fetch(__prim_index, isect.hits[hit].prim),
+					sd->time,
+					verts);
+				hit_P = motion_triangle_refine_local(kg,
+				                                     sd,
+				                                     &isect.hits[hit],
+				                                     ray,
+				                                     verts);
+			}
+#endif  /* __OBJECT_MOTION__ */
+
+			float3 hit_Ng = isect.Ng[hit];
+
+			/* Compute smooth normal. */
+			float3 N = hit_Ng;
+			int prim = kernel_tex_fetch(__prim_index, isect.hits[hit].prim);
+			int shader = kernel_tex_fetch(__tri_shader, prim);
+
+			if (shader & SHADER_SMOOTH_NORMAL) {
+				float u = isect.hits[hit].u;
+				float v = isect.hits[hit].v;
+
+				if (sd->type & PRIMITIVE_TRIANGLE) {
+					N = triangle_smooth_normal(kg, N, prim, u, v);
+				}
+#ifdef __OBJECT_MOTION__
+				else if(sd->type & PRIMITIVE_MOTION_TRIANGLE) {
+					N = motion_triangle_smooth_normal(kg, N, sd->object, prim, u, v, sd->time);
+				}
+#endif  /* __OBJECT_MOTION__ */
+			}
+
+			/* Transform normals to world space. */
+			if(isect.hits[hit].object != OBJECT_NONE) {
+				object_normal_transform(kg, sd, &N);
+				object_normal_transform(kg, sd, &hit_Ng);
+			}
+
+			/* Probability densities for local frame axes. */
+			float pdf_N = pick_pdf_N * fabsf(dot(disk_N, hit_Ng));
+			float pdf_T = pick_pdf_T * fabsf(dot(disk_T, hit_Ng));
+			float pdf_B = pick_pdf_B * fabsf(dot(disk_B, hit_Ng));
+
+			/* Multiple importance sample between 3 axes, power heuristic
+			 * found to be slightly better than balance heuristic. pdf_N
+			 * in the MIS weight and denominator cancelled out. */
+			float w = pdf_N / (sqr(pdf_N) + sqr(pdf_T) + sqr(pdf_B));
+			if(isect.num_hits > LOCAL_MAX_HITS) {  /* only LOCAL_MAX_HITS of the hits are evaluated, scale the weight up */
+				w *= isect.num_hits/(float)LOCAL_MAX_HITS;
+			}
+
+			/* Real distance to sampled point. */
+			float r = len(hit_P - sd->P);
+
+			/* Compute weight. */
+			float pdf = bssrdf_cubic_pdf(radius, 0.0f, r);
+			float disk_pdf = bssrdf_cubic_pdf(radius, 0.0f, disk_r);
+
+			w *= pdf / disk_pdf;
+
+			/* Sum normal and weight. */
+			sum_N += w * N;
+		}
+	}
+
+	/* Normalize. */
+	float3 N = safe_normalize(sum_N);
+	return is_zero(N) ? sd->N : (sd->flag & SD_BACKFACING) ? -N : N;  /* zero sum: keep sd->N; backfacing: negate */
+}
+/* SVM node: decodes node.y, computes the bevel normal with svm_bevel() and stores it on the stack at out_offset. */
+ccl_device void svm_node_bevel(
+	KernelGlobals *kg,
+	ShaderData *sd,
+	ccl_addr_space PathState *state,
+	float *stack,
+	uint4 node)
+{
+	uint num_samples, radius_offset, normal_offset, out_offset;
+	decode_node_uchar4(node.y, &num_samples, &radius_offset, &normal_offset, &out_offset);
+
+	float radius = stack_load_float(stack, radius_offset);
+	float3 bevel_N = svm_bevel(kg, sd, state, radius, num_samples);
+
+	if(stack_valid(normal_offset)) {
+		/* Preserve input normal: add the bevel change (bevel_N - sd->N) to it and renormalize. */
+		float3 ref_N = stack_load_float3(stack, normal_offset);
+		bevel_N = normalize(ref_N + (bevel_N - sd->N));
+	}
+
+	stack_store_float3(stack, out_offset, bevel_N);
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/svm/svm_blackbody.h b/intern/cycles/kernel/svm/svm_blackbody.h
index b750ad87b7f..51590b18505 100644
--- a/intern/cycles/kernel/svm/svm_blackbody.h
+++ b/intern/cycles/kernel/svm/svm_blackbody.h
@@ -41,8 +41,7 @@ ccl_device void svm_node_blackbody(KernelGlobals *kg, ShaderData *sd, float *sta
 
 	float3 color_rgb = svm_math_blackbody_color(temperature);
 
-	if(stack_valid(col_offset))
-		stack_store_float3(stack, col_offset, color_rgb);
+	stack_store_float3(stack, col_offset, color_rgb);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_brick.h b/intern/cycles/kernel/svm/svm_brick.h
index 14245cf0522..90fa2a99b67 100644
--- a/intern/cycles/kernel/svm/svm_brick.h
+++ b/intern/cycles/kernel/svm/svm_brick.h
@@ -18,9 +18,9 @@ CCL_NAMESPACE_BEGIN
 
 /* Brick */
 
-ccl_device_noinline float brick_noise(int n) /* fast integer noise */
+ccl_device_noinline float brick_noise(uint n) /* fast integer noise */
 {
-	int nn;
+	uint nn;
 	n = (n + 1013) & 0x7fffffff;
 	n = (n >> 13) ^ n;
 	nn = (n * (n * n * 60493 + 19990303) + 1376312589) & 0x7fffffff;
diff --git a/intern/cycles/kernel/svm/svm_bump.h b/intern/cycles/kernel/svm/svm_bump.h
index 04a8c7b64e5..610d9af9e1f 100644
--- a/intern/cycles/kernel/svm/svm_bump.h
+++ b/intern/cycles/kernel/svm/svm_bump.h
@@ -21,9 +21,9 @@ CCL_NAMESPACE_BEGIN
 ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset)
 {
 	/* save state */
-	stack_store_float3(stack, offset+0, ccl_fetch(sd, P));
-	stack_store_float3(stack, offset+3, ccl_fetch(sd, dP).dx);
-	stack_store_float3(stack, offset+6, ccl_fetch(sd, dP).dy);
+	stack_store_float3(stack, offset+0, sd->P);
+	stack_store_float3(stack, offset+3, sd->dP.dx);
+	stack_store_float3(stack, offset+6, sd->dP.dy);
 
 	/* set state as if undisplaced */
 	const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_POSITION_UNDISPLACED);
@@ -36,18 +36,18 @@ ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, floa
 		object_dir_transform(kg, sd, &dPdx);
 		object_dir_transform(kg, sd, &dPdy);
 
-		ccl_fetch(sd, P) = P;
-		ccl_fetch(sd, dP).dx = dPdx;
-		ccl_fetch(sd, dP).dy = dPdy;
+		sd->P = P;
+		sd->dP.dx = dPdx;
+		sd->dP.dy = dPdy;
 	}
 }
 
 ccl_device void svm_node_leave_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset)
 {
 	/* restore state */
-	ccl_fetch(sd, P) = stack_load_float3(stack, offset+0);
-	ccl_fetch(sd, dP).dx = stack_load_float3(stack, offset+3);
-	ccl_fetch(sd, dP).dy = stack_load_float3(stack, offset+6);
+	sd->P = stack_load_float3(stack, offset+0);
+	sd->dP.dx = stack_load_float3(stack, offset+3);
+	sd->dP.dy = stack_load_float3(stack, offset+6);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h
index 00678a49d70..90249dfd978 100644
--- a/intern/cycles/kernel/svm/svm_camera.h
+++ b/intern/cycles/kernel/svm/svm_camera.h
@@ -23,7 +23,7 @@ ccl_device void svm_node_camera(KernelGlobals *kg, ShaderData *sd, float *stack,
 	float3 vector;
 
 	Transform tfm = kernel_data.cam.worldtocamera;
-	vector = transform_point(&tfm, ccl_fetch(sd, P));
+	vector = transform_point(&tfm, sd->P);
 	zdepth = vector.z;
 	distance = len(vector);
 
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 017d697f9f8..886a1333fa3 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -25,13 +25,13 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t
 			bsdf->alpha_y = 0.0f;
 			bsdf->alpha_x = 0.0f;
 			bsdf->ior = eta;
-			ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf);
+			sd->flag |= bsdf_refraction_setup(bsdf);
 		}
 		else {
 			bsdf->alpha_y = 0.0f;
 			bsdf->alpha_x = 0.0f;
 			bsdf->ior = 0.0f;
-			ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf);
+			sd->flag |= bsdf_reflection_setup(bsdf);
 		}
 	}
 	else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) {
@@ -40,9 +40,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t
 		bsdf->ior = eta;
 
 		if(refract)
-			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
+			sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
 		else
-			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf);
+			sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
 	}
 	else {
 		bsdf->alpha_x = roughness;
@@ -50,13 +50,13 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t
 		bsdf->ior = eta;
 
 		if(refract)
-			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+			sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
 		else
-			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf);
+			sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
 	}
 }
 
-ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag, int *offset)
+ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ShaderType shader_type, int path_flag, int *offset)
 {
 	uint type, param1_offset, param2_offset;
 
@@ -67,17 +67,359 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 	/* note we read this extra node before weight check, so offset is added */
 	uint4 data_node = read_node(kg, offset);
 
-	if(mix_weight == 0.0f)
+	/* Only compute BSDF for surfaces, transparent variable is shared with volume extinction. */
+	if(mix_weight == 0.0f || shader_type != SHADER_TYPE_SURFACE) {
+		if(type == CLOSURE_BSDF_PRINCIPLED_ID) {
+			/* Read all principled BSDF extra data to get the right offset. */
+			read_node(kg, offset);
+			read_node(kg, offset);
+			read_node(kg, offset);
+			read_node(kg, offset);
+		}
+
 		return;
+	}
 
-	float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): ccl_fetch(sd, N);
+	float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): sd->N;
 
 	float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z);
 	float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w);
 
 	switch(type) {
+#ifdef __PRINCIPLED__
+		case CLOSURE_BSDF_PRINCIPLED_ID: {
+			uint specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset, sheen_offset,
+				sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset, eta_offset, transmission_offset,
+				anisotropic_rotation_offset, transmission_roughness_offset;
+			uint4 data_node2 = read_node(kg, offset);
+
+			float3 T = stack_load_float3(stack, data_node.y);
+			decode_node_uchar4(data_node.z, &specular_offset, &roughness_offset, &specular_tint_offset, &anisotropic_offset);
+			decode_node_uchar4(data_node.w, &sheen_offset, &sheen_tint_offset, &clearcoat_offset, &clearcoat_roughness_offset);
+			decode_node_uchar4(data_node2.x, &eta_offset, &transmission_offset, &anisotropic_rotation_offset, &transmission_roughness_offset);
+
+			// get Disney principled parameters
+			float metallic = param1;
+			float subsurface = param2;
+			float specular = stack_load_float(stack, specular_offset);
+			float roughness = stack_load_float(stack, roughness_offset);
+			float specular_tint = stack_load_float(stack, specular_tint_offset);
+			float anisotropic = stack_load_float(stack, anisotropic_offset);
+			float sheen = stack_load_float(stack, sheen_offset);
+			float sheen_tint = stack_load_float(stack, sheen_tint_offset);
+			float clearcoat = stack_load_float(stack, clearcoat_offset);
+			float clearcoat_roughness = stack_load_float(stack, clearcoat_roughness_offset);
+			float transmission = stack_load_float(stack, transmission_offset);
+			float anisotropic_rotation = stack_load_float(stack, anisotropic_rotation_offset);
+			float transmission_roughness = stack_load_float(stack, transmission_roughness_offset);
+			float eta = fmaxf(stack_load_float(stack, eta_offset), 1e-5f);
+
+			ClosureType distribution = (ClosureType) data_node2.y;
+			ClosureType subsurface_method = (ClosureType) data_node2.z;
+
+			/* rotate tangent */
+			if(anisotropic_rotation != 0.0f)
+				T = rotate_around_axis(T, N, anisotropic_rotation * M_2PI_F);
+
+			/* calculate ior */
+			float ior = (sd->flag & SD_BACKFACING) ? 1.0f / eta : eta;
+
+			// calculate fresnel for refraction
+			float cosNO = dot(N, sd->I);
+			float fresnel = fresnel_dielectric_cos(cosNO, ior);
+
+			// calculate weights of the diffuse and specular part
+			float diffuse_weight = (1.0f - saturate(metallic)) * (1.0f - saturate(transmission));
+			
+			float final_transmission = saturate(transmission) * (1.0f - saturate(metallic));
+			float specular_weight = (1.0f - final_transmission);
+
+			// get the base color
+			uint4 data_base_color = read_node(kg, offset);
+			float3 base_color = stack_valid(data_base_color.x) ? stack_load_float3(stack, data_base_color.x) :
+				make_float3(__uint_as_float(data_base_color.y), __uint_as_float(data_base_color.z), __uint_as_float(data_base_color.w));
+
+			// get the additional clearcoat normal and subsurface scattering radius
+			uint4 data_cn_ssr = read_node(kg, offset);
+			float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ? stack_load_float3(stack, data_cn_ssr.x) : sd->N;
+			float3 subsurface_radius = stack_valid(data_cn_ssr.y) ? stack_load_float3(stack, data_cn_ssr.y) : make_float3(1.0f, 1.0f, 1.0f);
+
+			// get the subsurface color
+			uint4 data_subsurface_color = read_node(kg, offset);
+			float3 subsurface_color = stack_valid(data_subsurface_color.x) ? stack_load_float3(stack, data_subsurface_color.x) :
+				make_float3(__uint_as_float(data_subsurface_color.y), __uint_as_float(data_subsurface_color.z), __uint_as_float(data_subsurface_color.w));
+
+			float3 weight = sd->svm_closure_weight * mix_weight;
+
+#ifdef __SUBSURFACE__
+			float3 mixed_ss_base_color = subsurface_color * subsurface + base_color * (1.0f - subsurface);
+			float3 subsurf_weight = weight * mixed_ss_base_color * diffuse_weight;
+
+			/* disable in case of diffuse ancestor, can't see it well then and
+			 * adds considerable noise due to probabilities of continuing path
+			 * getting lower and lower */
+			if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR) {
+				subsurface = 0.0f;
+
+				/* need to set the base color in this case such that the
+				 * rays get the correctly mixed color after transmitting
+				 * the object */
+				base_color = mixed_ss_base_color;
+			}
+
+			/* diffuse */
+			if(fabsf(average(mixed_ss_base_color)) > CLOSURE_WEIGHT_CUTOFF) {
+				if(subsurface <= CLOSURE_WEIGHT_CUTOFF && diffuse_weight > CLOSURE_WEIGHT_CUTOFF) {
+					float3 diff_weight = weight * base_color * diffuse_weight;
+
+					PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight);
+
+					if(bsdf) {
+						bsdf->N = N;
+						bsdf->roughness = roughness;
+
+						/* setup bsdf */
+						sd->flag |= bsdf_principled_diffuse_setup(bsdf);
+					}
+				}
+				else if(subsurface > CLOSURE_WEIGHT_CUTOFF) {
+					Bssrdf *bssrdf = bssrdf_alloc(sd, subsurf_weight);
+
+					if(bssrdf) {
+						bssrdf->radius = subsurface_radius * subsurface;
+						bssrdf->albedo = (subsurface_method == CLOSURE_BSSRDF_PRINCIPLED_ID)? subsurface_color:  mixed_ss_base_color;
+						bssrdf->texture_blur = 0.0f;
+						bssrdf->sharpness = 0.0f;
+						bssrdf->N = N;
+						bssrdf->roughness = roughness;
+
+						/* setup bssrdf */
+						sd->flag |= bssrdf_setup(sd, bssrdf, subsurface_method);
+					}
+				}
+			}
+#else
+			/* diffuse */
+			if(diffuse_weight > CLOSURE_WEIGHT_CUTOFF) {
+				float3 diff_weight = weight * base_color * diffuse_weight;
+
+				PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight);
+
+				if(bsdf) {
+					bsdf->N = N;
+					bsdf->roughness = roughness;
+
+					/* setup bsdf */
+					sd->flag |= bsdf_principled_diffuse_setup(bsdf);
+				}
+			}
+#endif
+
+			/* sheen */
+			if(diffuse_weight > CLOSURE_WEIGHT_CUTOFF && sheen > CLOSURE_WEIGHT_CUTOFF) {
+				float m_cdlum = linear_rgb_to_gray(base_color);
+				float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(1.0f, 1.0f, 1.0f); // normalize lum. to isolate hue+sat
+
+				/* color of the sheen component */
+				float3 sheen_color = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - sheen_tint) + m_ctint * sheen_tint;
+
+				float3 sheen_weight = weight * sheen * sheen_color * diffuse_weight;
+
+				PrincipledSheenBsdf *bsdf = (PrincipledSheenBsdf*)bsdf_alloc(sd, sizeof(PrincipledSheenBsdf), sheen_weight);
+
+				if(bsdf) {
+					bsdf->N = N;
+
+					/* setup bsdf */
+					sd->flag |= bsdf_principled_sheen_setup(bsdf);
+				}
+			}
+
+			/* specular reflection */
+#ifdef __CAUSTICS_TRICKS__
+			if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) {
+#endif
+				if(specular_weight > CLOSURE_WEIGHT_CUTOFF && (specular > CLOSURE_WEIGHT_CUTOFF || metallic > CLOSURE_WEIGHT_CUTOFF)) {
+					float3 spec_weight = weight * specular_weight;
+
+					MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), spec_weight);
+					if(!bsdf){
+						break;
+					}
+
+					MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+					if(!extra) {
+						break;
+					}
+
+					bsdf->N = N;
+					bsdf->ior = (2.0f / (1.0f - safe_sqrtf(0.08f * specular))) - 1.0f;
+					bsdf->T = T;
+					bsdf->extra = extra;
+
+					float aspect = safe_sqrtf(1.0f - anisotropic * 0.9f);
+					float r2 = roughness * roughness;
+
+					bsdf->alpha_x = r2 / aspect;
+					bsdf->alpha_y = r2 * aspect;
+
+					float m_cdlum = 0.3f * base_color.x + 0.6f * base_color.y + 0.1f * base_color.z; // luminance approx.
+					float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(0.0f, 0.0f, 0.0f); // normalize lum. to isolate hue+sat
+					float3 tmp_col = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint) + m_ctint * specular_tint;
+
+					bsdf->extra->cspec0 = (specular * 0.08f * tmp_col) * (1.0f - metallic) + base_color * metallic;
+					bsdf->extra->color = base_color;
+					bsdf->extra->clearcoat = 0.0f;
+
+					/* setup bsdf */
+					if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID || roughness <= 0.075f) /* use single-scatter GGX */
+						sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd);
+					else /* use multi-scatter GGX */
+						sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd);
+				}
+#ifdef __CAUSTICS_TRICKS__
+			}
+#endif
+
+			/* transmission (glass reflection + refraction) */
+#ifdef __CAUSTICS_TRICKS__
+			if(kernel_data.integrator.caustics_reflective || kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0) {
+#endif
+				if(final_transmission > CLOSURE_WEIGHT_CUTOFF) {
+					float3 glass_weight = weight * final_transmission;
+					float3 cspec0 = base_color * specular_tint + make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint);
+
+					if(roughness <= 5e-2f || distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID) { /* use single-scatter GGX */
+						float refl_roughness = roughness;
+
+						/* reflection */
+#ifdef __CAUSTICS_TRICKS__
+						if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0)
+#endif
+						{
+							MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), glass_weight*fresnel);
+							if(!bsdf) {
+								break;
+							}
+
+							MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+							if(!extra) {
+								break;
+							}
+
+							bsdf->N = N;
+							bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+							bsdf->extra = extra;
+
+							bsdf->alpha_x = refl_roughness * refl_roughness;
+							bsdf->alpha_y = refl_roughness * refl_roughness;
+							bsdf->ior = ior;
+
+							bsdf->extra->color = base_color;
+							bsdf->extra->cspec0 = cspec0;
+							bsdf->extra->clearcoat = 0.0f;
+
+							/* setup bsdf */
+							sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf, sd);
+						}
+
+						/* refraction */
+#ifdef __CAUSTICS_TRICKS__
+						if(kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0)
+#endif
+						{
+							MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), base_color*glass_weight*(1.0f - fresnel));
+							if(!bsdf) {
+								break;
+							}
+
+							bsdf->N = N;
+							bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+							bsdf->extra = NULL;
+
+							if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID)
+								transmission_roughness = 1.0f - (1.0f - refl_roughness) * (1.0f - transmission_roughness);
+							else
+								transmission_roughness = refl_roughness;
+
+							bsdf->alpha_x = transmission_roughness * transmission_roughness;
+							bsdf->alpha_y = transmission_roughness * transmission_roughness;
+							bsdf->ior = ior;
+
+							/* setup bsdf */
+							sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+						}
+					}
+					else { /* use multi-scatter GGX */
+						MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), glass_weight);
+						if(!bsdf) {
+							break;
+						}
+
+						MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+						if(!extra) {
+							break;
+						}
+
+						bsdf->N = N;
+						bsdf->extra = extra;
+						bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+
+						bsdf->alpha_x = roughness * roughness;
+						bsdf->alpha_y = roughness * roughness;
+						bsdf->ior = ior;
+
+						bsdf->extra->color = base_color;
+						bsdf->extra->cspec0 = cspec0;
+						bsdf->extra->clearcoat = 0.0f;
+
+						/* setup bsdf */
+						sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd);
+					}
+				}
+#ifdef __CAUSTICS_TRICKS__
+			}
+#endif
+
+			/* clearcoat */
+#ifdef __CAUSTICS_TRICKS__
+			if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) {
+#endif
+				if(clearcoat > CLOSURE_WEIGHT_CUTOFF) {
+					MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
+					if(!bsdf) {
+						break;
+					}
+
+					MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+					if(!extra) {
+						break;
+					}
+
+					bsdf->N = clearcoat_normal;
+					bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+					bsdf->ior = 1.5f;
+					bsdf->extra = extra;
+
+					bsdf->alpha_x = clearcoat_roughness * clearcoat_roughness;
+					bsdf->alpha_y = clearcoat_roughness * clearcoat_roughness;
+
+					bsdf->extra->color = make_float3(0.0f, 0.0f, 0.0f);
+					bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f);
+					bsdf->extra->clearcoat = clearcoat;
+
+					/* setup bsdf */
+					sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd);
+				}
+#ifdef __CAUSTICS_TRICKS__
+			}
+#endif
+
+			break;
+		}
+#endif  /* __PRINCIPLED__ */
 		case CLOSURE_BSDF_DIFFUSE_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			OrenNayarBsdf *bsdf = (OrenNayarBsdf*)bsdf_alloc(sd, sizeof(OrenNayarBsdf), weight);
 
 			if(bsdf) {
@@ -86,32 +428,28 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				float roughness = param1;
 
 				if(roughness == 0.0f) {
-					ccl_fetch(sd, flag) |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf);
+					sd->flag |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf);
 				}
 				else {
 					bsdf->roughness = roughness;
-					ccl_fetch(sd, flag) |= bsdf_oren_nayar_setup(bsdf);
+					sd->flag |= bsdf_oren_nayar_setup(bsdf);
 				}
 			}
 			break;
 		}
 		case CLOSURE_BSDF_TRANSLUCENT_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
 
 			if(bsdf) {
 				bsdf->N = N;
-				ccl_fetch(sd, flag) |= bsdf_translucent_setup(bsdf);
+				sd->flag |= bsdf_translucent_setup(bsdf);
 			}
 			break;
 		}
 		case CLOSURE_BSDF_TRANSPARENT_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
-			ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
-
-			if(bsdf) {
-				ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf);
-			}
+			float3 weight = sd->svm_closure_weight * mix_weight;
+			bsdf_transparent_setup(sd, weight, path_flag);
 			break;
 		}
 		case CLOSURE_BSDF_REFLECTION_ID:
@@ -123,33 +461,41 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
 
-			if(bsdf) {
-				bsdf->N = N;
-				bsdf->alpha_x = param1;
-				bsdf->alpha_y = param1;
-				bsdf->ior = 0.0f;
-				bsdf->extra = NULL;
+			if(!bsdf) {
+				break;
+			}
 
-				/* setup bsdf */
-				if(type == CLOSURE_BSDF_REFLECTION_ID)
-					ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf);
-				else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID)
-					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf);
-				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID)
-					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf);
-				else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID) {
-					kernel_assert(stack_valid(data_node.z));
-					bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
-					if(bsdf->extra) {
-						bsdf->extra->color = stack_load_float3(stack, data_node.z);
-						ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_setup(bsdf);
-					}
+			float roughness = sqr(param1);
+
+			bsdf->N = N;
+			bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+			bsdf->alpha_x = roughness;
+			bsdf->alpha_y = roughness;
+			bsdf->ior = 0.0f;
+			bsdf->extra = NULL;
+
+			/* setup bsdf */
+			if(type == CLOSURE_BSDF_REFLECTION_ID)
+				sd->flag |= bsdf_reflection_setup(bsdf);
+			else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID)
+				sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
+			else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID)
+				sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
+			else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID) {
+				kernel_assert(stack_valid(data_node.z));
+				bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+				if(bsdf->extra) {
+					bsdf->extra->color = stack_load_float3(stack, data_node.z);
+					bsdf->extra->cspec0 = make_float3(0.0f, 0.0f, 0.0f);
+					bsdf->extra->clearcoat = 0.0f;
+					sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf);
 				}
-				else
-					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_setup(bsdf);
+			}
+			else {
+				sd->flag |= bsdf_ashikhmin_shirley_setup(bsdf);
 			}
 
 			break;
@@ -161,15 +507,16 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
 
 			if(bsdf) {
 				bsdf->N = N;
+				bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
 				bsdf->extra = NULL;
 
 				float eta = fmaxf(param2, 1e-5f);
-				eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+				eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
 
 				/* setup bsdf */
 				if(type == CLOSURE_BSDF_REFRACTION_ID) {
@@ -177,17 +524,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bsdf->alpha_y = 0.0f;
 					bsdf->ior = eta;
 
-					ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf);
+					sd->flag |= bsdf_refraction_setup(bsdf);
 				}
 				else {
-					bsdf->alpha_x = param1;
-					bsdf->alpha_y = param1;
+					float roughness = sqr(param1);
+					bsdf->alpha_x = roughness;
+					bsdf->alpha_y = roughness;
 					bsdf->ior = eta;
 
 					if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID)
-						ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
+						sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
 					else
-						ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+						sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
 				}
 			}
 
@@ -203,16 +551,16 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				break;
 			}
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 
 			/* index of refraction */
 			float eta = fmaxf(param2, 1e-5f);
-			eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+			eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
 
 			/* fresnel */
-			float cosNO = dot(N, ccl_fetch(sd, I));
+			float cosNO = dot(N, sd->I);
 			float fresnel = fresnel_dielectric_cos(cosNO, eta);
-			float roughness = param1;
+			float roughness = sqr(param1);
 
 			/* reflection */
 #ifdef __CAUSTICS_TRICKS__
@@ -223,6 +571,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 				if(bsdf) {
 					bsdf->N = N;
+					bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
 					bsdf->extra = NULL;
 					svm_node_glass_setup(sd, bsdf, type, eta, roughness, false);
 				}
@@ -237,6 +586,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 				if(bsdf) {
 					bsdf->N = N;
+					bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
 					bsdf->extra = NULL;
 					svm_node_glass_setup(sd, bsdf, type, eta, roughness, true);
 				}
@@ -249,27 +599,34 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_reflective && !kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
-			MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+			if(!bsdf) {
+				break;
+			}
 
-			if(bsdf && extra) {
-				bsdf->N = N;
-				bsdf->extra = extra;
-				bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+			MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+			if(!extra) {
+				break;
+			}
 
-				bsdf->alpha_x = param1;
-				bsdf->alpha_y = param1;
-				float eta = fmaxf(param2, 1e-5f);
-				bsdf->ior = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+			bsdf->N = N;
+			bsdf->extra = extra;
+			bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
 
-				kernel_assert(stack_valid(data_node.z));
-				bsdf->extra->color = stack_load_float3(stack, data_node.z);
+			float roughness = sqr(param1);
+			bsdf->alpha_x = roughness;
+			bsdf->alpha_y = roughness;
+			float eta = fmaxf(param2, 1e-5f);
+			bsdf->ior = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
 
-				/* setup bsdf */
-				ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_glass_setup(bsdf);
-			}
+			kernel_assert(stack_valid(data_node.z));
+			bsdf->extra->color = stack_load_float3(stack, data_node.z);
+			bsdf->extra->cspec0 = make_float3(0.0f, 0.0f, 0.0f);
+			bsdf->extra->clearcoat = 0.0f;
 
+			/* setup bsdf */
+			sd->flag |= bsdf_microfacet_multi_ggx_glass_setup(bsdf);
 			break;
 		}
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
@@ -280,7 +637,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
 
 			if(bsdf) {
@@ -295,7 +652,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bsdf->T = rotate_around_axis(bsdf->T, bsdf->N, rotation * M_2PI_F);
 
 				/* compute roughness */
-				float roughness = param1;
+				float roughness = sqr(param1);
 				float anisotropy = clamp(param2, -0.99f, 0.99f);
 
 				if(anisotropy < 0.0f) {
@@ -310,33 +667,35 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				bsdf->ior = 0.0f;
 
 				if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID) {
-					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_aniso_setup(bsdf);
+					sd->flag |= bsdf_microfacet_beckmann_aniso_setup(bsdf);
 				}
 				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID) {
-					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_setup(bsdf);
+					sd->flag |= bsdf_microfacet_ggx_aniso_setup(bsdf);
 				}
 				else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID) {
 					kernel_assert(stack_valid(data_node.w));
 					bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
 					if(bsdf->extra) {
 						bsdf->extra->color = stack_load_float3(stack, data_node.w);
-						ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf);
+						bsdf->extra->cspec0 = make_float3(0.0f, 0.0f, 0.0f);
+						bsdf->extra->clearcoat = 0.0f;
+						sd->flag |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf);
 					}
 				}
 				else
-					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_aniso_setup(bsdf);
+					sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(bsdf);
 			}
 			break;
 		}
 		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			VelvetBsdf *bsdf = (VelvetBsdf*)bsdf_alloc(sd, sizeof(VelvetBsdf), weight);
 
 			if(bsdf) {
 				bsdf->N = N;
 
 				bsdf->sigma = saturate(param1);
-				ccl_fetch(sd, flag) |= bsdf_ashikhmin_velvet_setup(bsdf);
+				sd->flag |= bsdf_ashikhmin_velvet_setup(bsdf);
 			}
 			break;
 		}
@@ -344,9 +703,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 #ifdef __CAUSTICS_TRICKS__
 			if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
 				break;
+			ATTR_FALLTHROUGH;
 #endif
 		case CLOSURE_BSDF_DIFFUSE_TOON_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			ToonBsdf *bsdf = (ToonBsdf*)bsdf_alloc(sd, sizeof(ToonBsdf), weight);
 
 			if(bsdf) {
@@ -355,34 +715,30 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				bsdf->smooth = param2;
 				
 				if(type == CLOSURE_BSDF_DIFFUSE_TOON_ID)
-					ccl_fetch(sd, flag) |= bsdf_diffuse_toon_setup(bsdf);
+					sd->flag |= bsdf_diffuse_toon_setup(bsdf);
 				else
-					ccl_fetch(sd, flag) |= bsdf_glossy_toon_setup(bsdf);
+					sd->flag |= bsdf_glossy_toon_setup(bsdf);
 			}
 			break;
 		}
 #ifdef __HAIR__
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			
-			if(ccl_fetch(sd, flag) & SD_BACKFACING && ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
-				ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
-
-				if(bsdf) {
-					/* todo: giving a fixed weight here will cause issues when
-					 * mixing multiple BSDFS. energy will not be conserved and
-					 * the throughput can blow up after multiple bounces. we
-					 * better figure out a way to skip backfaces from rays
-					 * spawned by transmission from the front */
-					bsdf->weight = make_float3(1.0f, 1.0f, 1.0f);
-					ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf);
-				}
+			if(sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) {
+				/* todo: giving a fixed weight here will cause issues when
+				 * mixing multiple BSDFS. energy will not be conserved and
+				 * the throughput can blow up after multiple bounces. we
+				 * better figure out a way to skip backfaces from rays
+				 * spawned by transmission from the front */
+				bsdf_transparent_setup(sd, make_float3(1.0f, 1.0f, 1.0f), path_flag);
 			}
 			else {
 				HairBsdf *bsdf = (HairBsdf*)bsdf_alloc(sd, sizeof(HairBsdf), weight);
 
 				if(bsdf) {
+					bsdf->N = N;
 					bsdf->roughness1 = param1;
 					bsdf->roughness2 = param2;
 					bsdf->offset = -stack_load_float(stack, data_node.z);
@@ -390,18 +746,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					if(stack_valid(data_node.y)) {
 						bsdf->T = normalize(stack_load_float3(stack, data_node.y));
 					}
-					else if(!(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)) {
-						bsdf->T = normalize(ccl_fetch(sd, dPdv));
+					else if(!(sd->type & PRIMITIVE_ALL_CURVE)) {
+						bsdf->T = normalize(sd->dPdv);
 						bsdf->offset = 0.0f;
 					}
 					else
-						bsdf->T = normalize(ccl_fetch(sd, dPdu));
+						bsdf->T = normalize(sd->dPdu);
 
 					if(type == CLOSURE_BSDF_HAIR_REFLECTION_ID) {
-						ccl_fetch(sd, flag) |= bsdf_hair_reflection_setup(bsdf);
+						sd->flag |= bsdf_hair_reflection_setup(bsdf);
 					}
 					else {
-						ccl_fetch(sd, flag) |= bsdf_hair_transmission_setup(bsdf);
+						sd->flag |= bsdf_hair_transmission_setup(bsdf);
 					}
 				}
 			}
@@ -413,58 +769,25 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 #ifdef __SUBSURFACE__
 		case CLOSURE_BSSRDF_CUBIC_ID:
 		case CLOSURE_BSSRDF_GAUSSIAN_ID:
-		case CLOSURE_BSSRDF_BURLEY_ID: {
-			float3 albedo = ccl_fetch(sd, svm_closure_weight);
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
-			float sample_weight = fabsf(average(weight));
-			
-			/* disable in case of diffuse ancestor, can't see it well then and
-			 * adds considerably noise due to probabilities of continuing path
-			 * getting lower and lower */
-			if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR)
-				param1 = 0.0f;
-
-			if(sample_weight > CLOSURE_WEIGHT_CUTOFF) {
-				/* radius * scale */
-				float3 radius = stack_load_float3(stack, data_node.z)*param1;
-				/* sharpness */
-				float sharpness = stack_load_float(stack, data_node.w);
-				/* texture color blur */
-				float texture_blur = param2;
-
-				/* create one closure per color channel */
-				Bssrdf *bssrdf = bssrdf_alloc(sd, make_float3(weight.x, 0.0f, 0.0f));
-				if(bssrdf) {
-					bssrdf->sample_weight = sample_weight;
-					bssrdf->radius = radius.x;
-					bssrdf->texture_blur = texture_blur;
-					bssrdf->albedo = albedo.x;
-					bssrdf->sharpness = sharpness;
-					bssrdf->N = N;
-					ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
-				}
-
-				bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f));
-				if(bssrdf) {
-					bssrdf->sample_weight = sample_weight;
-					bssrdf->radius = radius.y;
-					bssrdf->texture_blur = texture_blur;
-					bssrdf->albedo = albedo.y;
-					bssrdf->sharpness = sharpness;
-					bssrdf->N = N;
-					ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
-				}
-
-				bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z));
-				if(bssrdf) {
-					bssrdf->sample_weight = sample_weight;
-					bssrdf->radius = radius.z;
-					bssrdf->texture_blur = texture_blur;
-					bssrdf->albedo = albedo.z;
-					bssrdf->sharpness = sharpness;
-					bssrdf->N = N;
-					ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
-				}
+		case CLOSURE_BSSRDF_BURLEY_ID:
+		case CLOSURE_BSSRDF_RANDOM_WALK_ID: {
+			float3 weight = sd->svm_closure_weight * mix_weight;
+			Bssrdf *bssrdf = bssrdf_alloc(sd, weight);
+
+			if(bssrdf) {
+				/* disable in case of diffuse ancestor, can't see it well then and
+				 * adds considerable noise due to probabilities of continuing path
+				 * getting lower and lower */
+				if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR)
+					param1 = 0.0f;
+
+				bssrdf->radius = stack_load_float3(stack, data_node.z)*param1;
+				bssrdf->albedo = sd->svm_closure_weight;
+				bssrdf->texture_blur = param2;
+				bssrdf->sharpness = stack_load_float(stack, data_node.w);
+				bssrdf->N = N;
+				bssrdf->roughness = 0.0f;
+				sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type);
 			}
 
 			break;
@@ -475,44 +798,148 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 	}
 }
 
-ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag)
+ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ShaderType shader_type)
 {
 #ifdef __VOLUME__
-	uint type, param1_offset, param2_offset;
+	/* Only sum extinction for volumes, variable is shared with surface transparency. */
+	if(shader_type != SHADER_TYPE_VOLUME) {
+		return;
+	}
+
+	uint type, density_offset, anisotropy_offset;
 
 	uint mix_weight_offset;
-	decode_node_uchar4(node.y, &type, &param1_offset, &param2_offset, &mix_weight_offset);
+	decode_node_uchar4(node.y, &type, &density_offset, &anisotropy_offset, &mix_weight_offset);
 	float mix_weight = (stack_valid(mix_weight_offset)? stack_load_float(stack, mix_weight_offset): 1.0f);
 
-	if(mix_weight == 0.0f)
+	if(mix_weight == 0.0f) {
 		return;
+	}
 
-	float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z);
-	float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w);
-	float density = fmaxf(param1, 0.0f);
+	float density = (stack_valid(density_offset))? stack_load_float(stack, density_offset): __uint_as_float(node.z);
+	density = mix_weight * fmaxf(density, 0.0f);
 
-	switch(type) {
-		case CLOSURE_VOLUME_ABSORPTION_ID: {
-			float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - ccl_fetch(sd, svm_closure_weight)) * mix_weight * density;
-			ShaderClosure *sc = closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_NONE_ID, weight);
+	/* Compute scattering coefficient. */
+	float3 weight = sd->svm_closure_weight;
 
-			if(sc) {
-				ccl_fetch(sd, flag) |= volume_absorption_setup(sc);
-			}
-			break;
+	if(type == CLOSURE_VOLUME_ABSORPTION_ID) {
+		weight = make_float3(1.0f, 1.0f, 1.0f) - weight;
+	}
+
+	weight *= density;
+
+	/* Add closure for volume scattering. */
+	if(type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) {
+		HenyeyGreensteinVolume *volume = (HenyeyGreensteinVolume*)bsdf_alloc(sd, sizeof(HenyeyGreensteinVolume), weight);
+
+		if(volume) {
+			float anisotropy = (stack_valid(anisotropy_offset))? stack_load_float(stack, anisotropy_offset): __uint_as_float(node.w);
+			volume->g = anisotropy; /* g */
+			sd->flag |= volume_henyey_greenstein_setup(volume);
 		}
-		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight * density;
-			HenyeyGreensteinVolume *volume = (HenyeyGreensteinVolume*)bsdf_alloc(sd, sizeof(HenyeyGreensteinVolume), weight);
+	}
 
-			if(volume) {
-				volume->g = param2; /* g */
-				ccl_fetch(sd, flag) |= volume_henyey_greenstein_setup(volume);
-			}
-			break;
+	/* Sum total extinction weight. */
+	volume_extinction_setup(sd, weight);
+#endif
+}
+
+ccl_device void svm_node_principled_volume(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ShaderType shader_type, int path_flag, int *offset)
+{
+#ifdef __VOLUME__
+	uint4 value_node = read_node(kg, offset);
+	uint4 attr_node = read_node(kg, offset);
+
+	/* Only sum extinction for volumes, variable is shared with surface transparency. */
+	if(shader_type != SHADER_TYPE_VOLUME) {
+		return;
+	}
+
+	uint density_offset, anisotropy_offset, absorption_color_offset, mix_weight_offset;
+	decode_node_uchar4(node.y, &density_offset, &anisotropy_offset, &absorption_color_offset, &mix_weight_offset);
+	float mix_weight = (stack_valid(mix_weight_offset)? stack_load_float(stack, mix_weight_offset): 1.0f);
+
+	if(mix_weight == 0.0f) {
+		return;
+	}
+
+	/* Compute density. */
+	float primitive_density = 1.0f;
+	float density = (stack_valid(density_offset))? stack_load_float(stack, density_offset): __uint_as_float(value_node.x);
+	density = mix_weight * fmaxf(density, 0.0f);
+
+	if(density > CLOSURE_WEIGHT_CUTOFF) {
+		/* Density attribute lookup if available. */
+		const AttributeDescriptor attr_density = find_attribute(kg, sd, attr_node.x);
+		if(attr_density.offset != ATTR_STD_NOT_FOUND) {
+			primitive_density = primitive_attribute_float(kg, sd, attr_density, NULL, NULL);
+			density = fmaxf(density * primitive_density, 0.0f);
+		}
+	}
+
+	if(density > CLOSURE_WEIGHT_CUTOFF) {
+		/* Compute scattering color. */
+		float3 color = sd->svm_closure_weight;
+
+		const AttributeDescriptor attr_color = find_attribute(kg, sd, attr_node.y);
+		if(attr_color.offset != ATTR_STD_NOT_FOUND) {
+			color *= primitive_attribute_float3(kg, sd, attr_color, NULL, NULL);
+		}
+
+		/* Add closure for volume scattering. */
+		HenyeyGreensteinVolume *volume = (HenyeyGreensteinVolume*)bsdf_alloc(sd, sizeof(HenyeyGreensteinVolume), color * density);
+		if(volume) {
+			float anisotropy = (stack_valid(anisotropy_offset))? stack_load_float(stack, anisotropy_offset): __uint_as_float(value_node.y);
+			volume->g = anisotropy;
+			sd->flag |= volume_henyey_greenstein_setup(volume);
+		}
+
+		/* Add extinction weight. */
+		float3 zero = make_float3(0.0f, 0.0f, 0.0f);
+		float3 one = make_float3(1.0f, 1.0f, 1.0f);
+		float3 absorption_color = max(sqrt(stack_load_float3(stack, absorption_color_offset)), zero);
+		float3 absorption = max(one - color, zero) * max(one - absorption_color, zero);
+		volume_extinction_setup(sd, (color + absorption) * density);
+	}
+
+	/* Compute emission. */
+	if(path_flag & PATH_RAY_SHADOW) {
+		/* Don't need emission for shadows. */
+		return;
+	}
+
+	uint emission_offset, emission_color_offset, blackbody_offset, temperature_offset;
+	decode_node_uchar4(node.z, &emission_offset, &emission_color_offset, &blackbody_offset, &temperature_offset);
+	float emission = (stack_valid(emission_offset))? stack_load_float(stack, emission_offset): __uint_as_float(value_node.z);
+	float blackbody = (stack_valid(blackbody_offset))? stack_load_float(stack, blackbody_offset): __uint_as_float(value_node.w);
+
+	if(emission > CLOSURE_WEIGHT_CUTOFF) {
+		float3 emission_color = stack_load_float3(stack, emission_color_offset);
+		emission_setup(sd, emission * emission_color);
+	}
+
+	if(blackbody > CLOSURE_WEIGHT_CUTOFF) {
+		float T = stack_load_float(stack, temperature_offset);
+
+		/* Scale by flame temperature from attribute if available. */
+		const AttributeDescriptor attr_temperature = find_attribute(kg, sd, attr_node.z);
+		if(attr_temperature.offset != ATTR_STD_NOT_FOUND) {
+			float temperature = primitive_attribute_float(kg, sd, attr_temperature, NULL, NULL);
+			T *= fmaxf(temperature, 0.0f);
+		}
+
+		T = fmaxf(T, 0.0f);
+
+		/* Stefan-Boltzmann law. */
+		float T4 = sqr(sqr(T));
+		float sigma = 5.670373e-8f * 1e-6f / M_PI_F;
+		float intensity = sigma * mix(1.0f, T4, blackbody);
+
+		if(intensity > CLOSURE_WEIGHT_CUTOFF) {
+			float3 blackbody_tint = stack_load_float3(stack, node.w);
+			float3 bb = blackbody_tint * intensity * svm_math_blackbody_color(T);
+			emission_setup(sd, bb);
 		}
-		default:
-			break;
 	}
 #endif
 }
@@ -520,6 +947,7 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float
 ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 node)
 {
 	uint mix_weight_offset = node.y;
+	float3 weight = sd->svm_closure_weight;
 
 	if(stack_valid(mix_weight_offset)) {
 		float mix_weight = stack_load_float(stack, mix_weight_offset);
@@ -527,17 +955,16 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no
 		if(mix_weight == 0.0f)
 			return;
 
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+		weight *= mix_weight;
 	}
-	else
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight));
 
-	ccl_fetch(sd, flag) |= SD_EMISSION;
+	emission_setup(sd, weight);
 }
 
 ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
 {
 	uint mix_weight_offset = node.y;
+	float3 weight = sd->svm_closure_weight;
 
 	if(stack_valid(mix_weight_offset)) {
 		float mix_weight = stack_load_float(stack, mix_weight_offset);
@@ -545,10 +972,10 @@ ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4
 		if(mix_weight == 0.0f)
 			return;
 
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+		weight *= mix_weight;
 	}
-	else
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight));
+
+	background_setup(sd, weight);
 }
 
 ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node)
@@ -561,12 +988,12 @@ ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 nod
 		if(mix_weight == 0.0f)
 			return;
 
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight * mix_weight);
 	}
 	else
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight));
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight);
 
-	ccl_fetch(sd, flag) |= SD_HOLDOUT;
+	sd->flag |= SD_HOLDOUT;
 }
 
 ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node)
@@ -579,19 +1006,19 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack,
 		if(mix_weight == 0.0f)
 			return;
 
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight * mix_weight);
 	}
 	else
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight));
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight);
 
-	ccl_fetch(sd, flag) |= SD_AO;
+	sd->flag |= SD_AO;
 }
 
 /* Closure Nodes */
 
 ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight)
 {
-	ccl_fetch(sd, svm_closure_weight) = weight;
+	sd->svm_closure_weight = weight;
 }
 
 ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b)
@@ -641,7 +1068,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
 ccl_device void svm_node_set_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
 {
 	float3 normal = stack_load_float3(stack, in_direction);
-	ccl_fetch(sd, N) = normal;
+	sd->N = normal;
 	stack_store_float3(stack, out_normal, normal);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index 890ab41aaaa..533b7f065e6 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -22,13 +22,13 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 {
 #ifdef __RAY_DIFFERENTIALS__
 	/* get normal input */
-	uint normal_offset, distance_offset, invert, use_object_space;
-	decode_node_uchar4(node.y, &normal_offset, &distance_offset, &invert, &use_object_space);
+	uint normal_offset, scale_offset, invert, use_object_space;
+	decode_node_uchar4(node.y, &normal_offset, &scale_offset, &invert, &use_object_space);
 
-	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
+	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
 
-	float3 dPdx = ccl_fetch(sd, dP).dx;
-	float3 dPdy = ccl_fetch(sd, dP).dy;
+	float3 dPdx = sd->dP.dx;
+	float3 dPdy = sd->dP.dy;
 
 	if(use_object_space) {
 		object_inverse_normal_transform(kg, sd, &normal_in);
@@ -55,16 +55,21 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 	float absdet = fabsf(det);
 
 	float strength = stack_load_float(stack, strength_offset);
-	float distance = stack_load_float(stack, distance_offset);
+	float scale = stack_load_float(stack, scale_offset);
 
 	if(invert)
-		distance *= -1.0f;
+		scale *= -1.0f;
 
 	strength = max(strength, 0.0f);
 
 	/* compute and output perturbed normal */
-	float3 normal_out = normalize(absdet*normal_in - distance*signf(det)*surfgrad);
-	normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
+	float3 normal_out = safe_normalize(absdet*normal_in - scale*signf(det)*surfgrad);
+	if(is_zero(normal_out)) {
+		normal_out = normal_in;
+	}
+	else {
+		normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
+	}
 
 	if(use_object_space) {
 		object_normal_transform(kg, sd, &normal_out);
@@ -78,16 +83,80 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 
 ccl_device void svm_node_set_displacement(KernelGlobals *kg, ShaderData *sd, float *stack, uint fac_offset)
 {
-	float d = stack_load_float(stack, fac_offset);
+	float3 dP = stack_load_float3(stack, fac_offset);
+	sd->P += dP;
+}
 
-	float3 dP = ccl_fetch(sd, N);
-	object_inverse_normal_transform(kg, sd, &dP);
+ccl_device void svm_node_displacement(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+{
+	uint height_offset, midlevel_offset, scale_offset, normal_offset;
+	decode_node_uchar4(node.y, &height_offset, &midlevel_offset, &scale_offset, &normal_offset);
+
+	float height = stack_load_float(stack, height_offset);
+	float midlevel = stack_load_float(stack, midlevel_offset);
+	float scale = stack_load_float(stack, scale_offset);
+	float3 normal = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
+	uint space = node.w;
+
+	float3 dP = normal;
+
+	if(space == NODE_NORMAL_MAP_OBJECT) {
+		/* Object space. */
+		object_inverse_normal_transform(kg, sd, &dP);
+		dP *= (height - midlevel) * scale;
+		object_dir_transform(kg, sd, &dP);
+	}
+	else {
+		/* World space. */
+		dP *= (height - midlevel) * scale;
+	}
 
-	dP *= d*0.1f; /* todo: get rid of this factor */
+	stack_store_float3(stack, node.z, dP);
+}
 
-	object_dir_transform(kg, sd, &dP);
+ccl_device void svm_node_vector_displacement(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+{
+	uint4 data_node = read_node(kg, offset);
+	uint space = data_node.x;
+
+	uint vector_offset, midlevel_offset,scale_offset, displacement_offset;
+	decode_node_uchar4(node.y, &vector_offset, &midlevel_offset, &scale_offset, &displacement_offset);
+
+	float3 vector = stack_load_float3(stack, vector_offset);
+	float midlevel = stack_load_float(stack, midlevel_offset);
+	float scale = stack_load_float(stack, scale_offset);
+	float3 dP = (vector - make_float3(midlevel, midlevel, midlevel)) * scale;
+
+	if(space == NODE_NORMAL_MAP_TANGENT) {
+		/* Tangent space. */
+		float3 normal = sd->N;
+		object_inverse_normal_transform(kg, sd, &normal);
+
+		const AttributeDescriptor attr = find_attribute(kg, sd, node.z);
+		float3 tangent;
+		if(attr.offset != ATTR_STD_NOT_FOUND) {
+			tangent = primitive_attribute_float3(kg, sd, attr, NULL, NULL);
+		}
+		else {
+			tangent = normalize(sd->dPdu);
+		}
+
+		float3 bitangent = normalize(cross(normal, tangent));
+		const AttributeDescriptor attr_sign = find_attribute(kg, sd, node.w);
+		if(attr_sign.offset != ATTR_STD_NOT_FOUND) {
+			float sign = primitive_attribute_float(kg, sd, attr_sign, NULL, NULL);
+			bitangent *= sign;
+		}
+
+		dP = tangent*dP.x + normal*dP.y + bitangent*dP.z;
+	}
+
+	if(space != NODE_NORMAL_MAP_WORLD) {
+		/* Tangent or object space. */
+		object_dir_transform(kg, sd, &dP);
+	}
 
-	ccl_fetch(sd, P) += dP;
+	stack_store_float3(stack, displacement_offset, dP);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h
index 23c97d80cb0..3703ec55015 100644
--- a/intern/cycles/kernel/svm/svm_fresnel.h
+++ b/intern/cycles/kernel/svm/svm_fresnel.h
@@ -23,12 +23,12 @@ ccl_device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset,
 	uint normal_offset, out_offset;
 	decode_node_uchar4(node, &normal_offset, &out_offset, NULL, NULL);
 	float eta = (stack_valid(ior_offset))? stack_load_float(stack, ior_offset): __uint_as_float(ior_value);
-	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
+	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
 	
 	eta = fmaxf(eta, 1e-5f);
-	eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+	eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
 
-	float f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);
+	float f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
 
 	stack_store_float(stack, out_offset, f);
 }
@@ -44,18 +44,18 @@ ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
 	decode_node_uchar4(node.w, &type, &normal_offset, &out_offset, NULL);
 
 	float blend = (stack_valid(blend_offset))? stack_load_float(stack, blend_offset): __uint_as_float(blend_value);
-	float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
+	float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): sd->N;
 
 	float f;
 
 	if(type == NODE_LAYER_WEIGHT_FRESNEL) {
 		float eta = fmaxf(1.0f - blend, 1e-5f);
-		eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? eta: 1.0f/eta;
+		eta = (sd->flag & SD_BACKFACING)? eta: 1.0f/eta;
 
-		f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);
+		f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
 	}
 	else {
-		f = fabsf(dot(ccl_fetch(sd, I), normal_in));
+		f = fabsf(dot(sd->I, normal_in));
 
 		if(blend != 0.5f) {
 			blend = clamp(blend, 0.0f, 1.0f-1e-5f);
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index 7d512f7ff4d..81308d6f12b 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -27,16 +27,17 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg,
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = ccl_fetch(sd, P); break;
-		case NODE_GEOM_N: data = ccl_fetch(sd, N); break;
+		case NODE_GEOM_P: data = sd->P; break;
+		case NODE_GEOM_N: data = sd->N; break;
 #ifdef __DPDU__
 		case NODE_GEOM_T: data = primitive_tangent(kg, sd); break;
 #endif
-		case NODE_GEOM_I: data = ccl_fetch(sd, I); break;
-		case NODE_GEOM_Ng: data = ccl_fetch(sd, Ng); break;
+		case NODE_GEOM_I: data = sd->I; break;
+		case NODE_GEOM_Ng: data = sd->Ng; break;
 #ifdef __UV__
-		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u), ccl_fetch(sd, v), 0.0f); break;
+		case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
 #endif
+		default: data = make_float3(0.0f, 0.0f, 0.0f);
 	}
 
 	stack_store_float3(stack, out_offset, data);
@@ -48,8 +49,8 @@ ccl_device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, flo
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; break;
-		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dx, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dx, 0.0f); break;
+		case NODE_GEOM_P: data = sd->P + sd->dP.dx; break;
+		case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dx, sd->v + sd->dv.dx, 0.0f); break;
 		default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
 	}
 
@@ -65,8 +66,8 @@ ccl_device void svm_node_geometry_bump_dy(KernelGlobals *kg, ShaderData *sd, flo
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; break;
-		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dy, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dy, 0.0f); break;
+		case NODE_GEOM_P: data = sd->P + sd->dP.dy; break;
+		case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dy, sd->v + sd->dv.dy, 0.0f); break;
 		default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
 	}
 
@@ -87,9 +88,17 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s
 			stack_store_float3(stack, out_offset, object_location(kg, sd));
 			return;
 		}
-		case NODE_INFO_OB_INDEX: data = object_pass_id(kg, ccl_fetch(sd, object)); break;
+		case NODE_INFO_OB_INDEX: data = object_pass_id(kg, sd->object); break;
 		case NODE_INFO_MAT_INDEX: data = shader_pass_id(kg, sd); break;
-		case NODE_INFO_OB_RANDOM: data = object_random_number(kg, ccl_fetch(sd, object)); break;
+		case NODE_INFO_OB_RANDOM: {
+			if(sd->lamp != LAMP_NONE) {
+				data = lamp_random_number(kg, sd->lamp);
+			}
+			else {
+				data = object_random_number(kg, sd->object);
+			}
+			break;
+		}
 		default: data = 0.0f; break;
 	}
 
@@ -106,44 +115,50 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg,
 {
 	switch(type) {
 		case NODE_INFO_PAR_INDEX: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_index(kg, particle_id));
 			break;
 		}
+		case NODE_INFO_PAR_RANDOM: {
+			int particle_id = object_particle_id(kg, sd->object);
+			float random = hash_int_01(particle_index(kg, particle_id));
+			stack_store_float(stack, out_offset, random);
+			break;
+		}
 		case NODE_INFO_PAR_AGE: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_age(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_LIFETIME: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_lifetime(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_LOCATION: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float3(stack, out_offset, particle_location(kg, particle_id));
 			break;
 		}
 #if 0	/* XXX float4 currently not supported in SVM stack */
 		case NODE_INFO_PAR_ROTATION: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float4(stack, out_offset, particle_rotation(kg, particle_id));
 			break;
 		}
 #endif
 		case NODE_INFO_PAR_SIZE: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_size(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_VELOCITY: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float3(stack, out_offset, particle_velocity(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_ANGULAR_VELOCITY: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float3(stack, out_offset, particle_angular_velocity(kg, particle_id));
 			break;
 		}
@@ -165,19 +180,21 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg,
 
 	switch(type) {
 		case NODE_INFO_CURVE_IS_STRAND: {
-			data = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) != 0;
+			data = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
 			stack_store_float(stack, out_offset, data);
 			break;
 		}
 		case NODE_INFO_CURVE_INTERCEPT:
 			break; /* handled as attribute */
+		case NODE_INFO_CURVE_RANDOM:
+			break; /* handled as attribute */
 		case NODE_INFO_CURVE_THICKNESS: {
 			data = curve_thickness(kg, sd);
 			stack_store_float(stack, out_offset, data);
 			break;
 		}
 		/*case NODE_INFO_CURVE_FADE: {
-			data = ccl_fetch(sd, curve_transparency);
+			data = sd->curve_transparency;
 			stack_store_float(stack, out_offset, data);
 			break;
 		}*/
diff --git a/intern/cycles/kernel/svm/svm_gradient.h b/intern/cycles/kernel/svm/svm_gradient.h
index 53d7b4f812c..74e36e70427 100644
--- a/intern/cycles/kernel/svm/svm_gradient.h
+++ b/intern/cycles/kernel/svm/svm_gradient.h
@@ -46,7 +46,10 @@ ccl_device float svm_gradient(float3 p, NodeGradientType type)
 		return atan2f(y, x) / M_2PI_F + 0.5f;
 	}
 	else {
-		float r = fmaxf(1.0f - sqrtf(x*x + y*y + z*z), 0.0f);
+		/* Bias a little bit for the case where p is a unit length vector,
+		 * to get exactly zero instead of a small random value depending
+		 * on float precision. */
+		float r = fmaxf(0.999999f - sqrtf(x*x + y*y + z*z), 0.0f);
 
 		if(type == NODE_BLEND_QUADRATIC_SPHERE)
 			return r*r;
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 0d6efb47223..4226e7adfe0 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -16,185 +16,25 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Float4 textures on various devices. */
-#if defined(__KERNEL_CPU__)
-#  define TEX_NUM_FLOAT4_IMAGES		TEX_NUM_FLOAT4_CPU
-#elif defined(__KERNEL_CUDA__)
-#  if __CUDA_ARCH__ < 300
-#    define TEX_NUM_FLOAT4_IMAGES	TEX_NUM_FLOAT4_CUDA
-#  else
-#    define TEX_NUM_FLOAT4_IMAGES	TEX_NUM_FLOAT4_CUDA_KEPLER
-#  endif
-#else
-#  define TEX_NUM_FLOAT4_IMAGES	TEX_NUM_FLOAT4_OPENCL
-#endif
-
 ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha)
 {
-#ifdef __KERNEL_CPU__
-#  ifdef __KERNEL_SSE2__
-	ssef r_ssef;
-	float4 &r = (float4 &)r_ssef;
-	r = kernel_tex_image_interp(id, x, y);
-#  else
-	float4 r = kernel_tex_image_interp(id, x, y);
-#  endif
-#elif defined(__KERNEL_OPENCL__)
 	float4 r = kernel_tex_image_interp(kg, id, x, y);
-#else
-	float4 r;
-
-#  if __CUDA_ARCH__ < 300
-	/* not particularly proud of this massive switch, what are the
-	 * alternatives?
-	 * - use a single big 1D texture, and do our own lookup/filtering
-	 * - group by size and use a 3d texture, performance impact
-	 * - group into larger texture with some padding for correct lerp
-	 *
-	 * also note that cuda has a textures limit (128 for Fermi, 256 for Kepler),
-	 * and we cannot use all since we still need some for other storage */
-
-	switch(id) {
-		case 0: r = kernel_tex_image_interp(__tex_image_float4_000, x, y); break;
-		case 1: r = kernel_tex_image_interp(__tex_image_float4_001, x, y); break;
-		case 2: r = kernel_tex_image_interp(__tex_image_float4_002, x, y); break;
-		case 3: r = kernel_tex_image_interp(__tex_image_float4_003, x, y); break;
-		case 4: r = kernel_tex_image_interp(__tex_image_float4_004, x, y); break;
-		case 5: r = kernel_tex_image_interp(__tex_image_byte4_005, x, y); break;
-		case 6: r = kernel_tex_image_interp(__tex_image_byte4_006, x, y); break;
-		case 7: r = kernel_tex_image_interp(__tex_image_byte4_007, x, y); break;
-		case 8: r = kernel_tex_image_interp(__tex_image_byte4_008, x, y); break;
-		case 9: r = kernel_tex_image_interp(__tex_image_byte4_009, x, y); break;
-		case 10: r = kernel_tex_image_interp(__tex_image_byte4_010, x, y); break;
-		case 11: r = kernel_tex_image_interp(__tex_image_byte4_011, x, y); break;
-		case 12: r = kernel_tex_image_interp(__tex_image_byte4_012, x, y); break;
-		case 13: r = kernel_tex_image_interp(__tex_image_byte4_013, x, y); break;
-		case 14: r = kernel_tex_image_interp(__tex_image_byte4_014, x, y); break;
-		case 15: r = kernel_tex_image_interp(__tex_image_byte4_015, x, y); break;
-		case 16: r = kernel_tex_image_interp(__tex_image_byte4_016, x, y); break;
-		case 17: r = kernel_tex_image_interp(__tex_image_byte4_017, x, y); break;
-		case 18: r = kernel_tex_image_interp(__tex_image_byte4_018, x, y); break;
-		case 19: r = kernel_tex_image_interp(__tex_image_byte4_019, x, y); break;
-		case 20: r = kernel_tex_image_interp(__tex_image_byte4_020, x, y); break;
-		case 21: r = kernel_tex_image_interp(__tex_image_byte4_021, x, y); break;
-		case 22: r = kernel_tex_image_interp(__tex_image_byte4_022, x, y); break;
-		case 23: r = kernel_tex_image_interp(__tex_image_byte4_023, x, y); break;
-		case 24: r = kernel_tex_image_interp(__tex_image_byte4_024, x, y); break;
-		case 25: r = kernel_tex_image_interp(__tex_image_byte4_025, x, y); break;
-		case 26: r = kernel_tex_image_interp(__tex_image_byte4_026, x, y); break;
-		case 27: r = kernel_tex_image_interp(__tex_image_byte4_027, x, y); break;
-		case 28: r = kernel_tex_image_interp(__tex_image_byte4_028, x, y); break;
-		case 29: r = kernel_tex_image_interp(__tex_image_byte4_029, x, y); break;
-		case 30: r = kernel_tex_image_interp(__tex_image_byte4_030, x, y); break;
-		case 31: r = kernel_tex_image_interp(__tex_image_byte4_031, x, y); break;
-		case 32: r = kernel_tex_image_interp(__tex_image_byte4_032, x, y); break;
-		case 33: r = kernel_tex_image_interp(__tex_image_byte4_033, x, y); break;
-		case 34: r = kernel_tex_image_interp(__tex_image_byte4_034, x, y); break;
-		case 35: r = kernel_tex_image_interp(__tex_image_byte4_035, x, y); break;
-		case 36: r = kernel_tex_image_interp(__tex_image_byte4_036, x, y); break;
-		case 37: r = kernel_tex_image_interp(__tex_image_byte4_037, x, y); break;
-		case 38: r = kernel_tex_image_interp(__tex_image_byte4_038, x, y); break;
-		case 39: r = kernel_tex_image_interp(__tex_image_byte4_039, x, y); break;
-		case 40: r = kernel_tex_image_interp(__tex_image_byte4_040, x, y); break;
-		case 41: r = kernel_tex_image_interp(__tex_image_byte4_041, x, y); break;
-		case 42: r = kernel_tex_image_interp(__tex_image_byte4_042, x, y); break;
-		case 43: r = kernel_tex_image_interp(__tex_image_byte4_043, x, y); break;
-		case 44: r = kernel_tex_image_interp(__tex_image_byte4_044, x, y); break;
-		case 45: r = kernel_tex_image_interp(__tex_image_byte4_045, x, y); break;
-		case 46: r = kernel_tex_image_interp(__tex_image_byte4_046, x, y); break;
-		case 47: r = kernel_tex_image_interp(__tex_image_byte4_047, x, y); break;
-		case 48: r = kernel_tex_image_interp(__tex_image_byte4_048, x, y); break;
-		case 49: r = kernel_tex_image_interp(__tex_image_byte4_049, x, y); break;
-		case 50: r = kernel_tex_image_interp(__tex_image_byte4_050, x, y); break;
-		case 51: r = kernel_tex_image_interp(__tex_image_byte4_051, x, y); break;
-		case 52: r = kernel_tex_image_interp(__tex_image_byte4_052, x, y); break;
-		case 53: r = kernel_tex_image_interp(__tex_image_byte4_053, x, y); break;
-		case 54: r = kernel_tex_image_interp(__tex_image_byte4_054, x, y); break;
-		case 55: r = kernel_tex_image_interp(__tex_image_byte4_055, x, y); break;
-		case 56: r = kernel_tex_image_interp(__tex_image_byte4_056, x, y); break;
-		case 57: r = kernel_tex_image_interp(__tex_image_byte4_057, x, y); break;
-		case 58: r = kernel_tex_image_interp(__tex_image_byte4_058, x, y); break;
-		case 59: r = kernel_tex_image_interp(__tex_image_byte4_059, x, y); break;
-		case 60: r = kernel_tex_image_interp(__tex_image_byte4_060, x, y); break;
-		case 61: r = kernel_tex_image_interp(__tex_image_byte4_061, x, y); break;
-		case 62: r = kernel_tex_image_interp(__tex_image_byte4_062, x, y); break;
-		case 63: r = kernel_tex_image_interp(__tex_image_byte4_063, x, y); break;
-		case 64: r = kernel_tex_image_interp(__tex_image_byte4_064, x, y); break;
-		case 65: r = kernel_tex_image_interp(__tex_image_byte4_065, x, y); break;
-		case 66: r = kernel_tex_image_interp(__tex_image_byte4_066, x, y); break;
-		case 67: r = kernel_tex_image_interp(__tex_image_byte4_067, x, y); break;
-		case 68: r = kernel_tex_image_interp(__tex_image_byte4_068, x, y); break;
-		case 69: r = kernel_tex_image_interp(__tex_image_byte4_069, x, y); break;
-		case 70: r = kernel_tex_image_interp(__tex_image_byte4_070, x, y); break;
-		case 71: r = kernel_tex_image_interp(__tex_image_byte4_071, x, y); break;
-		case 72: r = kernel_tex_image_interp(__tex_image_byte4_072, x, y); break;
-		case 73: r = kernel_tex_image_interp(__tex_image_byte4_073, x, y); break;
-		case 74: r = kernel_tex_image_interp(__tex_image_byte4_074, x, y); break;
-		case 75: r = kernel_tex_image_interp(__tex_image_byte4_075, x, y); break;
-		case 76: r = kernel_tex_image_interp(__tex_image_byte4_076, x, y); break;
-		case 77: r = kernel_tex_image_interp(__tex_image_byte4_077, x, y); break;
-		case 78: r = kernel_tex_image_interp(__tex_image_byte4_078, x, y); break;
-		case 79: r = kernel_tex_image_interp(__tex_image_byte4_079, x, y); break;
-		case 80: r = kernel_tex_image_interp(__tex_image_byte4_080, x, y); break;
-		case 81: r = kernel_tex_image_interp(__tex_image_byte4_081, x, y); break;
-		case 82: r = kernel_tex_image_interp(__tex_image_byte4_082, x, y); break;
-		case 83: r = kernel_tex_image_interp(__tex_image_byte4_083, x, y); break;
-		case 84: r = kernel_tex_image_interp(__tex_image_byte4_084, x, y); break;
-		case 85: r = kernel_tex_image_interp(__tex_image_byte4_085, x, y); break;
-		case 86: r = kernel_tex_image_interp(__tex_image_byte4_086, x, y); break;
-		case 87: r = kernel_tex_image_interp(__tex_image_byte4_087, x, y); break;
-		case 88: r = kernel_tex_image_interp(__tex_image_byte4_088, x, y); break;
-		default:
-			kernel_assert(0);
-			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-	}
-#  else
-	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
-	/* float4, byte4 and half4 */
-	if(id < TEX_START_FLOAT_CUDA_KEPLER)
-		r = kernel_tex_image_interp_float4(tex, x, y);
-	/* float, byte and half */
-	else {
-		float f = kernel_tex_image_interp_float(tex, x, y);
-		r = make_float4(f, f, f, 1.0f);
-	}
-#  endif
-#endif
-
-#ifdef __KERNEL_SSE2__
-	float alpha = r.w;
+	const float alpha = r.w;
 
 	if(use_alpha && alpha != 1.0f && alpha != 0.0f) {
-		r_ssef = r_ssef / ssef(alpha);
-		if(id >= TEX_NUM_FLOAT4_IMAGES)
-			r_ssef = min(r_ssef, ssef(1.0f));
-		r.w = alpha;
-	}
-
-	if(srgb) {
-		r_ssef = color_srgb_to_scene_linear(r_ssef);
-		r.w = alpha;
-	}
-#else
-	if(use_alpha && r.w != 1.0f && r.w != 0.0f) {
-		float invw = 1.0f/r.w;
-		r.x *= invw;
-		r.y *= invw;
-		r.z *= invw;
-
-		if(id >= TEX_NUM_FLOAT4_IMAGES) {
-			r.x = min(r.x, 1.0f);
-			r.y = min(r.y, 1.0f);
-			r.z = min(r.z, 1.0f);
+		r /= alpha;
+		const int texture_type = kernel_tex_type(id);
+		if(texture_type == IMAGE_DATA_TYPE_BYTE4 ||
+		   texture_type == IMAGE_DATA_TYPE_BYTE)
+		{
+			r = min(r, make_float4(1.0f, 1.0f, 1.0f, 1.0f));
 		}
+		r.w = alpha;
 	}
 
 	if(srgb) {
-		r.x = color_srgb_to_scene_linear(r.x);
-		r.y = color_srgb_to_scene_linear(r.y);
-		r.z = color_srgb_to_scene_linear(r.z);
+		r = color_srgb_to_scene_linear_v4(r);
 	}
-#endif
 
 	return r;
 }
@@ -237,12 +77,14 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta
 ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	/* get object space normal */
-	float3 N = ccl_fetch(sd, N);
+	float3 N = sd->N;
 
-	N = ccl_fetch(sd, N);
+	N = sd->N;
 	object_inverse_normal_transform(kg, sd, &N);
 
 	/* project from direction vector to barycentric coordinates in triangles */
+	float3 signed_N = N;
+
 	N.x = fabsf(N.x);
 	N.y = fabsf(N.y);
 	N.z = fabsf(N.z);
@@ -312,12 +154,19 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
 	float4 f = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 	uint use_alpha = stack_valid(alpha_offset);
 
-	if(weight.x > 0.0f)
-		f += weight.x*svm_image_texture(kg, id, co.y, co.z, srgb, use_alpha);
-	if(weight.y > 0.0f)
-		f += weight.y*svm_image_texture(kg, id, co.x, co.z, srgb, use_alpha);
-	if(weight.z > 0.0f)
-		f += weight.z*svm_image_texture(kg, id, co.y, co.x, srgb, use_alpha);
+	/* Map so that no textures are flipped, rotation is somewhat arbitrary. */
+	if(weight.x > 0.0f) {
+		float2 uv = make_float2((signed_N.x < 0.0f)? 1.0f - co.y: co.y, co.z);
+		f += weight.x*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha);
+	}
+	if(weight.y > 0.0f) {
+		float2 uv = make_float2((signed_N.y > 0.0f)? 1.0f - co.x: co.x, co.z);
+		f += weight.y*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha);
+	}
+	if(weight.z > 0.0f) {
+		float2 uv = make_float2((signed_N.z > 0.0f)? 1.0f - co.y: co.y, co.x);
+		f += weight.z*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha);
+	}
 
 	if(stack_valid(out_offset))
 		stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));
@@ -336,8 +185,8 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, floa
 	float3 co = stack_load_float3(stack, co_offset);
 	float2 uv;
 
-	co = normalize(co);
-	
+	co = safe_normalize(co);
+
 	if(projection == 0)
 		uv = direction_to_equirectangular(co);
 	else
diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h
index 04f6f623f18..1492e358608 100644
--- a/intern/cycles/kernel/svm/svm_light_path.h
+++ b/intern/cycles/kernel/svm/svm_light_path.h
@@ -31,8 +31,8 @@ ccl_device void svm_node_light_path(ShaderData *sd, ccl_addr_space PathState *st
 		case NODE_LP_reflection: info = (path_flag & PATH_RAY_REFLECT)? 1.0f: 0.0f; break;
 		case NODE_LP_transmission: info = (path_flag & PATH_RAY_TRANSMIT)? 1.0f: 0.0f; break;
 		case NODE_LP_volume_scatter: info = (path_flag & PATH_RAY_VOLUME_SCATTER)? 1.0f: 0.0f; break;
-		case NODE_LP_backfacing: info = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f: 0.0f; break;
-		case NODE_LP_ray_length: info = ccl_fetch(sd, ray_length); break;
+		case NODE_LP_backfacing: info = (sd->flag & SD_BACKFACING)? 1.0f: 0.0f; break;
+		case NODE_LP_ray_length: info = sd->ray_length; break;
 		case NODE_LP_ray_depth: info = (float)state->bounce; break;
 		case NODE_LP_ray_diffuse: info = (float)state->diffuse_bounce; break;
 		case NODE_LP_ray_glossy: info = (float)state->glossy_bounce; break;
@@ -56,14 +56,14 @@ ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
 
 	switch(type) {
 		case NODE_LIGHT_FALLOFF_QUADRATIC: break;
-		case NODE_LIGHT_FALLOFF_LINEAR: strength *= ccl_fetch(sd, ray_length); break;
-		case NODE_LIGHT_FALLOFF_CONSTANT: strength *= ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); break;
+		case NODE_LIGHT_FALLOFF_LINEAR: strength *= sd->ray_length; break;
+		case NODE_LIGHT_FALLOFF_CONSTANT: strength *= sd->ray_length*sd->ray_length; break;
 	}
 
 	float smooth = stack_load_float(stack, smooth_offset);
 
 	if(smooth > 0.0f) {
-		float squared = ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length);
+		float squared = sd->ray_length*sd->ray_length;
 		/* Distant lamps set the ray length to FLT_MAX, which causes squared to overflow. */
 		if(isfinite(squared)) {
 			strength *= squared/(smooth + squared);
diff --git a/intern/cycles/kernel/svm/svm_mapping.h b/intern/cycles/kernel/svm/svm_mapping.h
index 0a890545af4..42a7ae9946f 100644
--- a/intern/cycles/kernel/svm/svm_mapping.h
+++ b/intern/cycles/kernel/svm/svm_mapping.h
@@ -26,7 +26,6 @@ ccl_device void svm_node_mapping(KernelGlobals *kg, ShaderData *sd, float *stack
 	tfm.x = read_node_float(kg, offset);
 	tfm.y = read_node_float(kg, offset);
 	tfm.z = read_node_float(kg, offset);
-	tfm.w = read_node_float(kg, offset);
 
 	float3 r = transform_point(&tfm, v);
 	stack_store_float3(stack, out_offset, r);
diff --git a/intern/cycles/kernel/svm/svm_math_util.h b/intern/cycles/kernel/svm/svm_math_util.h
index a7f15de7325..1ce7777aac3 100644
--- a/intern/cycles/kernel/svm/svm_math_util.h
+++ b/intern/cycles/kernel/svm/svm_math_util.h
@@ -100,71 +100,64 @@ ccl_device float svm_math(NodeMath type, float Fac1, float Fac2)
 	return Fac;
 }
 
-ccl_device float3 svm_math_blackbody_color(float t) {
-	/* Calculate color in range 800..12000 using an approximation
-	 * a/x+bx+c for R and G and ((at + b)t + c)t + d) for B
-	 * Max absolute error for RGB is (0.00095, 0.00077, 0.00057),
-	 * which is enough to get the same 8 bit/channel color.
-	 */
-
-	const float rc[6][3] = {
-		{  2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f },
-		{  3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f },
-		{  4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f },
-		{  4.66849800e+03f,  2.85655028e-05f, 1.29075375e-01f },
-		{  4.60124770e+03f,  2.89727618e-05f, 1.48001316e-01f },
-		{  3.78765709e+03f,  9.36026367e-06f, 3.98995841e-01f },
-	};
-
-	const float gc[6][3] = {
-		{ -7.50343014e+02f,  3.15679613e-04f, 4.73464526e-01f },
-		{ -1.00402363e+03f,  1.29189794e-04f, 9.08181524e-01f },
-		{ -1.22075471e+03f,  2.56245413e-05f, 1.20753416e+00f },
-		{ -1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f },
-		{ -1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f },
-		{ -5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f },
-	};
-
-	const float bc[6][4] = {
-		{ 0.0f, 0.0f, 0.0f, 0.0f }, /* zeros should be optimized by compiler */
-		{ 0.0f, 0.0f, 0.0f, 0.0f },
-		{ 0.0f, 0.0f, 0.0f, 0.0f },
-		{ -2.02524603e-11f,  1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f },
-		{ -2.22463426e-13f, -1.55078698e-08f,  3.81675160e-04f, -7.30646033e-01f },
-		{  6.72595954e-13f, -2.73059993e-08f,  4.24068546e-04f, -7.52204323e-01f },
-	};
-
-	int i;
+/* Calculate color in range 800..12000 using an approximation
+ * a/t + bt + c for R and G and ((at + b)t + c)t + d for B
+ * Max absolute error for RGB is (0.00095, 0.00077, 0.00057),
+ * which is enough to get the same 8 bit/channel color.
+ */
+
+ccl_static_constant float blackbody_table_r[6][3] = {
+	{  2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f },
+	{  3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f },
+	{  4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f },
+	{  4.66849800e+03f,  2.85655028e-05f, 1.29075375e-01f },
+	{  4.60124770e+03f,  2.89727618e-05f, 1.48001316e-01f },
+	{  3.78765709e+03f,  9.36026367e-06f, 3.98995841e-01f },
+};
+
+ccl_static_constant float blackbody_table_g[6][3] = {
+	{ -7.50343014e+02f,  3.15679613e-04f, 4.73464526e-01f },
+	{ -1.00402363e+03f,  1.29189794e-04f, 9.08181524e-01f },
+	{ -1.22075471e+03f,  2.56245413e-05f, 1.20753416e+00f },
+	{ -1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f },
+	{ -1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f },
+	{ -5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f },
+};
+
+ccl_static_constant float blackbody_table_b[6][4] = {
+	{ 0.0f, 0.0f, 0.0f, 0.0f }, /* zeros should be optimized by compiler */
+	{ 0.0f, 0.0f, 0.0f, 0.0f },
+	{ 0.0f, 0.0f, 0.0f, 0.0f },
+	{ -2.02524603e-11f,  1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f },
+	{ -2.22463426e-13f, -1.55078698e-08f,  3.81675160e-04f, -7.30646033e-01f },
+	{  6.72595954e-13f, -2.73059993e-08f,  4.24068546e-04f, -7.52204323e-01f },
+};
+
+
+ccl_device float3 svm_math_blackbody_color(float t)
+{
 	if(t >= 12000.0f) {
 		return make_float3(0.826270103f, 0.994478524f, 1.56626022f);
 	}
-	else if(t >= 6365.0f) {
-		i = 5;
-	}
-	else if(t >= 3315.0f) {
-		i = 4;
-	}
-	else if(t >= 1902.0f) {
-		i = 3;
-	}
-	else if(t >= 1449.0f) {
-		i = 2;
-	}
-	else if(t >= 1167.0f) {
-		i = 1;
-	}
-	else if(t >= 965.0f) {
-		i = 0;
-	}
-	else {
+	else if(t < 965.0f) {
 		/* For 800 <= t < 965 color does not change in OSL implementation, so keep color the same */
 		return make_float3(4.70366907f, 0.0f, 0.0f);
 	}
 
+	int i = (t >= 6365.0f)? 5:
+		(t >= 3315.0f)? 4:
+		(t >= 1902.0f)? 3:
+		(t >= 1449.0f)? 2:
+		(t >= 1167.0f)? 1: 0;
+
+	ccl_constant float *r = blackbody_table_r[i];
+	ccl_constant float *g = blackbody_table_g[i];
+	ccl_constant float *b = blackbody_table_b[i];
+
 	const float t_inv = 1.0f / t;
-	return make_float3(rc[i][0] * t_inv + rc[i][1] * t + rc[i][2],
-	                   gc[i][0] * t_inv + gc[i][1] * t + gc[i][2],
-	                   ((bc[i][0] * t + bc[i][1]) * t + bc[i][2]) * t + bc[i][3]);
+	return make_float3(r[0] * t_inv + r[1] * t + r[2],
+	                   g[0] * t_inv + g[1] * t + g[2],
+	                   ((b[0] * t + b[1]) * t + b[2]) * t + b[3]);
 }
 
 ccl_device_inline float3 svm_math_gamma_color(float3 color, float gamma)
diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h
index c0b01262212..6ff39e5f587 100644
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -31,9 +31,9 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = ccl_fetch(sd, P);
+			data = sd->P;
 			if(node.w == 0) {
-				if(ccl_fetch(sd, object) != OBJECT_NONE) {
+				if(sd->object != OBJECT_NONE) {
 					object_inverse_position_transform(kg, sd, &data);
 				}
 			}
@@ -42,53 +42,52 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,
 				tfm.x = read_node_float(kg, offset);
 				tfm.y = read_node_float(kg, offset);
 				tfm.z = read_node_float(kg, offset);
-				tfm.w = read_node_float(kg, offset);
 				data = transform_point(&tfm, data);
 			}
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = ccl_fetch(sd, N);
+			data = sd->N;
 			object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = transform_point(&tfm, ccl_fetch(sd, P));
+			if(sd->object != OBJECT_NONE)
+				data = transform_point(&tfm, sd->P);
 			else
-				data = transform_point(&tfm, ccl_fetch(sd, P) + camera_position(kg));
+				data = transform_point(&tfm, sd->P + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P));
+			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, sd->ray_P);
 			else
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P));
+				data = camera_world_to_ndc(kg, sd, sd->P);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
+			if(sd->object != OBJECT_NONE)
+				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
 			else
-				data = ccl_fetch(sd, I);
+				data = sd->I;
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, ccl_fetch(sd, object));
+			data = object_dupli_generated(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, ccl_fetch(sd, object));
+			data = object_dupli_uv(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = ccl_fetch(sd, P);
+			data = sd->P;
 
 #ifdef __VOLUME__
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
+			if(sd->object != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -112,9 +111,9 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;
+			data = sd->P + sd->dP.dx;
 			if(node.w == 0) {
-				if(ccl_fetch(sd, object) != OBJECT_NONE) {
+				if(sd->object != OBJECT_NONE) {
 					object_inverse_position_transform(kg, sd, &data);
 				}
 			}
@@ -123,53 +122,52 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
 				tfm.x = read_node_float(kg, offset);
 				tfm.y = read_node_float(kg, offset);
 				tfm.z = read_node_float(kg, offset);
-				tfm.w = read_node_float(kg, offset);
 				data = transform_point(&tfm, data);
 			}
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = ccl_fetch(sd, N);
+			data = sd->N;
 			object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
+			if(sd->object != OBJECT_NONE)
+				data = transform_point(&tfm, sd->P + sd->dP.dx);
 			else
-				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx + camera_position(kg));
+				data = transform_point(&tfm, sd->P + sd->dP.dx + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dx);
+			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx);
 			else
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
+				data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
+			if(sd->object != OBJECT_NONE)
+				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
 			else
-				data = ccl_fetch(sd, I);
+				data = sd->I;
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, ccl_fetch(sd, object));
+			data = object_dupli_generated(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, ccl_fetch(sd, object));
+			data = object_dupli_uv(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;
+			data = sd->P + sd->dP.dx;
 
 #ifdef __VOLUME__
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
+			if(sd->object != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -196,9 +194,9 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;
+			data = sd->P + sd->dP.dy;
 			if(node.w == 0) {
-				if(ccl_fetch(sd, object) != OBJECT_NONE) {
+				if(sd->object != OBJECT_NONE) {
 					object_inverse_position_transform(kg, sd, &data);
 				}
 			}
@@ -207,53 +205,52 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
 				tfm.x = read_node_float(kg, offset);
 				tfm.y = read_node_float(kg, offset);
 				tfm.z = read_node_float(kg, offset);
-				tfm.w = read_node_float(kg, offset);
 				data = transform_point(&tfm, data);
 			}
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = ccl_fetch(sd, N);
+			data = sd->N;
 			object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
+			if(sd->object != OBJECT_NONE)
+				data = transform_point(&tfm, sd->P + sd->dP.dy);
 			else
-				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy + camera_position(kg));
+				data = transform_point(&tfm, sd->P + sd->dP.dy + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dy);
+			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy);
 			else
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
+				data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
+			if(sd->object != OBJECT_NONE)
+				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
 			else
-				data = ccl_fetch(sd, I);
+				data = sd->I;
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, ccl_fetch(sd, object));
+			data = object_dupli_generated(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, ccl_fetch(sd, object));
+			data = object_dupli_uv(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;
+			data = sd->P + sd->dP.dy;
 
 #ifdef __VOLUME__
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
+			if(sd->object != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -274,12 +271,12 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 	float3 color = stack_load_float3(stack, color_offset);
 	color = 2.0f*make_float3(color.x - 0.5f, color.y - 0.5f, color.z - 0.5f);
 
-	bool is_backfacing = (ccl_fetch(sd, flag) & SD_BACKFACING) != 0;
+	bool is_backfacing = (sd->flag & SD_BACKFACING) != 0;
 	float3 N;
 
 	if(space == NODE_NORMAL_MAP_TANGENT) {
 		/* tangent space */
-		if(ccl_fetch(sd, object) == OBJECT_NONE) {
+		if(sd->object == OBJECT_NONE) {
 			stack_store_float3(stack, normal_offset, make_float3(0.0f, 0.0f, 0.0f));
 			return;
 		}
@@ -299,11 +296,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 		float sign = primitive_attribute_float(kg, sd, attr_sign, NULL, NULL);
 		float3 normal;
 
-		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
+		if(sd->shader & SHADER_SMOOTH_NORMAL) {
 			normal = primitive_attribute_float3(kg, sd, attr_normal, NULL, NULL);
 		}
 		else {
-			normal = ccl_fetch(sd, Ng);
+			normal = sd->Ng;
 
 			/* the normal is already inverted, which is too soon for the math here */
 			if(is_backfacing) {
@@ -345,11 +342,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 
 	if(strength != 1.0f) {
 		strength = max(strength, 0.0f);
-		N = safe_normalize(ccl_fetch(sd, N) + (N - ccl_fetch(sd, N))*strength);
+		N = safe_normalize(sd->N + (N - sd->N)*strength);
 	}
 
 	if(is_zero(N)) {
-		N = ccl_fetch(sd, N);
+		N = sd->N;
 	}
 
 	stack_store_float3(stack, normal_offset, N);
@@ -377,7 +374,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
 		float3 generated;
 
 		if(desc.offset == ATTR_STD_NOT_FOUND)
-			generated = ccl_fetch(sd, P);
+			generated = sd->P;
 		else
 			generated = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 
@@ -390,7 +387,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
 	}
 
 	object_normal_transform(kg, sd, &tangent);
-	tangent = cross(ccl_fetch(sd, N), normalize(cross(tangent, ccl_fetch(sd, N))));
+	tangent = cross(sd->N, normalize(cross(tangent, sd->N)));
 	stack_store_float3(stack, tangent_offset, tangent);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index 47209ddfbab..4c3a5975fb8 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -132,6 +132,10 @@ typedef enum ShaderNodeType {
 	NODE_TEX_VOXEL,
 	NODE_ENTER_BUMP_EVAL,
 	NODE_LEAVE_BUMP_EVAL,
+	NODE_BEVEL,
+	NODE_DISPLACEMENT,
+	NODE_VECTOR_DISPLACEMENT,
+	NODE_PRINCIPLED_VOLUME,
 } ShaderNodeType;
 
 typedef enum NodeAttributeType {
@@ -158,6 +162,7 @@ typedef enum NodeObjectInfo {
 
 typedef enum NodeParticleInfo {
 	NODE_INFO_PAR_INDEX,
+	NODE_INFO_PAR_RANDOM,
 	NODE_INFO_PAR_AGE,
 	NODE_INFO_PAR_LIFETIME,
 	NODE_INFO_PAR_LOCATION,
@@ -173,7 +178,8 @@ typedef enum NodeHairInfo {
 	NODE_INFO_CURVE_THICKNESS,
 	/*fade for minimum hair width transpency*/
 	/*NODE_INFO_CURVE_FADE,*/
-	NODE_INFO_CURVE_TANGENT_NORMAL
+	NODE_INFO_CURVE_TANGENT_NORMAL,
+	NODE_INFO_CURVE_RANDOM,
 } NodeHairInfo;
 
 typedef enum NodeLightPath {
@@ -397,17 +403,23 @@ typedef enum ClosureType {
 	CLOSURE_BSDF_DIFFUSE_ID,
 	CLOSURE_BSDF_OREN_NAYAR_ID,
 	CLOSURE_BSDF_DIFFUSE_RAMP_ID,
+	CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID,
+	CLOSURE_BSDF_PRINCIPLED_SHEEN_ID,
 	CLOSURE_BSDF_DIFFUSE_TOON_ID,
 
 	/* Glossy */
-	CLOSURE_BSDF_GLOSSY_ID,
 	CLOSURE_BSDF_REFLECTION_ID,
 	CLOSURE_BSDF_MICROFACET_GGX_ID,
+	CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID,
+	CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID,
 	CLOSURE_BSDF_MICROFACET_BECKMANN_ID,
 	CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID,
+	CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID,
 	CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID,
 	CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID,
+	CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID,
 	CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID,
+	CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_FRESNEL_ID,
 	CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID,
 	CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID,
 	CLOSURE_BSDF_ASHIKHMIN_VELVET_ID,
@@ -416,29 +428,31 @@ typedef enum ClosureType {
 	CLOSURE_BSDF_HAIR_REFLECTION_ID,
 
 	/* Transmission */
-	CLOSURE_BSDF_TRANSMISSION_ID,
 	CLOSURE_BSDF_TRANSLUCENT_ID,
 	CLOSURE_BSDF_REFRACTION_ID,
 	CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID,
 	CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID,
+	CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
 	CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID,
 	CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID,
-	CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
+	CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID,
 	CLOSURE_BSDF_SHARP_GLASS_ID,
 	CLOSURE_BSDF_HAIR_TRANSMISSION_ID,
 
 	/* Special cases */
 	CLOSURE_BSDF_BSSRDF_ID,
+	CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID,
 	CLOSURE_BSDF_TRANSPARENT_ID,
 
 	/* BSSRDF */
 	CLOSURE_BSSRDF_CUBIC_ID,
 	CLOSURE_BSSRDF_GAUSSIAN_ID,
+	CLOSURE_BSSRDF_PRINCIPLED_ID,
 	CLOSURE_BSSRDF_BURLEY_ID,
+	CLOSURE_BSSRDF_RANDOM_WALK_ID,
+	CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID,
 
 	/* Other */
-	CLOSURE_EMISSION_ID,
-	CLOSURE_BACKGROUND_ID,
 	CLOSURE_HOLDOUT_ID,
 	CLOSURE_AMBIENT_OCCLUSION_ID,
 
@@ -447,28 +461,38 @@ typedef enum ClosureType {
 	CLOSURE_VOLUME_ABSORPTION_ID,
 	CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID,
 
+	CLOSURE_BSDF_PRINCIPLED_ID,
+
 	NBUILTIN_CLOSURES
 } ClosureType;
 
 /* watch this, being lazy with memory usage */
 #define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID)
 #define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_DIFFUSE_TOON_ID)
-#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
-#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
-#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID)
-#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID)
+#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
+#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
+#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID)
+#define CLOSURE_IS_BSDF_SINGULAR(type) (type == CLOSURE_BSDF_REFLECTION_ID || \
+                                        type == CLOSURE_BSDF_REFRACTION_ID || \
+                                        type == CLOSURE_BSDF_TRANSPARENT_ID)
+#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID)
 #define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\
                                             type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \
-											type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)
-#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID)
-#define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID)
+                                            type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)
+#define CLOSURE_IS_BSDF_MICROFACET(type) ((type >= CLOSURE_BSDF_MICROFACET_GGX_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) ||\
+                                          (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID && type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID) ||\
+                                          (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID))
+#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID)
+#define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID)
+#define CLOSURE_IS_DISK_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID)
 #define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
-#define CLOSURE_IS_EMISSION(type) (type == CLOSURE_EMISSION_ID)
+#define CLOSURE_IS_VOLUME_SCATTER(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
+#define CLOSURE_IS_VOLUME_ABSORPTION(type) (type == CLOSURE_VOLUME_ABSORPTION_ID)
 #define CLOSURE_IS_HOLDOUT(type) (type == CLOSURE_HOLDOUT_ID)
-#define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID)
 #define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID)
 #define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
-#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
+#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
+#define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID)
 
 #define CLOSURE_WEIGHT_CUTOFF 1e-5f
 
diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h
index 4c32130d06d..4e92f27acdb 100644
--- a/intern/cycles/kernel/svm/svm_vector_transform.h
+++ b/intern/cycles/kernel/svm/svm_vector_transform.h
@@ -33,7 +33,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo
 	NodeVectorTransformConvertSpace to = (NodeVectorTransformConvertSpace)ito;
 	
 	Transform tfm;
-	bool is_object = (ccl_fetch(sd, object) != OBJECT_NONE);
+	bool is_object = (sd->object != OBJECT_NONE);
 	bool is_direction = (type == NODE_VECTOR_TRANSFORM_TYPE_VECTOR || type == NODE_VECTOR_TRANSFORM_TYPE_NORMAL);
 	
 	/* From world */
diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h
index a8b3604a8a7..43b433683e0 100644
--- a/intern/cycles/kernel/svm/svm_voxel.h
+++ b/intern/cycles/kernel/svm/svm_voxel.h
@@ -39,27 +39,10 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg,
 		tfm.x = read_node_float(kg, offset);
 		tfm.y = read_node_float(kg, offset);
 		tfm.z = read_node_float(kg, offset);
-		tfm.w = read_node_float(kg, offset);
 		co = transform_point(&tfm, co);
 	}
-	float4 r;
-#  if defined(__KERNEL_CUDA__)
-#    if __CUDA_ARCH__ >= 300
-	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
-	if(id < 2048) /* TODO(dingto): Make this a variable */
-		r = kernel_tex_image_interp_3d_float4(tex, co.x, co.y, co.z);
-	else {
-		float f = kernel_tex_image_interp_3d_float(tex, co.x, co.y, co.z);
-		r = make_float4(f, f, f, 1.0f);
-	}
-#    else /* __CUDA_ARCH__ >= 300 */
-	r = volume_image_texture_3d(id, co.x, co.y, co.z);
-#    endif
-#  elif defined(__KERNEL_OPENCL__)
-	r = kernel_tex_image_interp_3d(kg, id, co.x, co.y, co.z);
-#  else
-	r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z);
-#  endif /* __KERNEL_CUDA__ */
+
+	float4 r = kernel_tex_image_interp_3d(kg, id, co.x, co.y, co.z, INTERPOLATION_NONE);
 #else
 	float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 #endif
diff --git a/intern/cycles/kernel/svm/svm_wavelength.h b/intern/cycles/kernel/svm/svm_wavelength.h
index 57030f3979d..855b356b397 100644
--- a/intern/cycles/kernel/svm/svm_wavelength.h
+++ b/intern/cycles/kernel/svm/svm_wavelength.h
@@ -34,44 +34,44 @@ CCL_NAMESPACE_BEGIN
 
 /* Wavelength to RGB */
 
+// CIE colour matching functions xBar, yBar, and zBar for
+//	 wavelengths from 380 through 780 nanometers, every 5
+//	 nanometers.  For a wavelength lambda in this range:
+//		  cie_colour_match[(lambda - 380) / 5][0] = xBar
+//		  cie_colour_match[(lambda - 380) / 5][1] = yBar
+//		  cie_colour_match[(lambda - 380) / 5][2] = zBar
+ccl_static_constant float cie_colour_match[81][3] = {
+	{0.0014f,0.0000f,0.0065f}, {0.0022f,0.0001f,0.0105f}, {0.0042f,0.0001f,0.0201f},
+	{0.0076f,0.0002f,0.0362f}, {0.0143f,0.0004f,0.0679f}, {0.0232f,0.0006f,0.1102f},
+	{0.0435f,0.0012f,0.2074f}, {0.0776f,0.0022f,0.3713f}, {0.1344f,0.0040f,0.6456f},
+	{0.2148f,0.0073f,1.0391f}, {0.2839f,0.0116f,1.3856f}, {0.3285f,0.0168f,1.6230f},
+	{0.3483f,0.0230f,1.7471f}, {0.3481f,0.0298f,1.7826f}, {0.3362f,0.0380f,1.7721f},
+	{0.3187f,0.0480f,1.7441f}, {0.2908f,0.0600f,1.6692f}, {0.2511f,0.0739f,1.5281f},
+	{0.1954f,0.0910f,1.2876f}, {0.1421f,0.1126f,1.0419f}, {0.0956f,0.1390f,0.8130f},
+	{0.0580f,0.1693f,0.6162f}, {0.0320f,0.2080f,0.4652f}, {0.0147f,0.2586f,0.3533f},
+	{0.0049f,0.3230f,0.2720f}, {0.0024f,0.4073f,0.2123f}, {0.0093f,0.5030f,0.1582f},
+	{0.0291f,0.6082f,0.1117f}, {0.0633f,0.7100f,0.0782f}, {0.1096f,0.7932f,0.0573f},
+	{0.1655f,0.8620f,0.0422f}, {0.2257f,0.9149f,0.0298f}, {0.2904f,0.9540f,0.0203f},
+	{0.3597f,0.9803f,0.0134f}, {0.4334f,0.9950f,0.0087f}, {0.5121f,1.0000f,0.0057f},
+	{0.5945f,0.9950f,0.0039f}, {0.6784f,0.9786f,0.0027f}, {0.7621f,0.9520f,0.0021f},
+	{0.8425f,0.9154f,0.0018f}, {0.9163f,0.8700f,0.0017f}, {0.9786f,0.8163f,0.0014f},
+	{1.0263f,0.7570f,0.0011f}, {1.0567f,0.6949f,0.0010f}, {1.0622f,0.6310f,0.0008f},
+	{1.0456f,0.5668f,0.0006f}, {1.0026f,0.5030f,0.0003f}, {0.9384f,0.4412f,0.0002f},
+	{0.8544f,0.3810f,0.0002f}, {0.7514f,0.3210f,0.0001f}, {0.6424f,0.2650f,0.0000f},
+	{0.5419f,0.2170f,0.0000f}, {0.4479f,0.1750f,0.0000f}, {0.3608f,0.1382f,0.0000f},
+	{0.2835f,0.1070f,0.0000f}, {0.2187f,0.0816f,0.0000f}, {0.1649f,0.0610f,0.0000f},
+	{0.1212f,0.0446f,0.0000f}, {0.0874f,0.0320f,0.0000f}, {0.0636f,0.0232f,0.0000f},
+	{0.0468f,0.0170f,0.0000f}, {0.0329f,0.0119f,0.0000f}, {0.0227f,0.0082f,0.0000f},
+	{0.0158f,0.0057f,0.0000f}, {0.0114f,0.0041f,0.0000f}, {0.0081f,0.0029f,0.0000f},
+	{0.0058f,0.0021f,0.0000f}, {0.0041f,0.0015f,0.0000f}, {0.0029f,0.0010f,0.0000f},
+	{0.0020f,0.0007f,0.0000f}, {0.0014f,0.0005f,0.0000f}, {0.0010f,0.0004f,0.0000f},
+	{0.0007f,0.0002f,0.0000f}, {0.0005f,0.0002f,0.0000f}, {0.0003f,0.0001f,0.0000f},
+	{0.0002f,0.0001f,0.0000f}, {0.0002f,0.0001f,0.0000f}, {0.0001f,0.0000f,0.0000f},
+	{0.0001f,0.0000f,0.0000f}, {0.0001f,0.0000f,0.0000f}, {0.0000f,0.0000f,0.0000f}
+};
+
 ccl_device void svm_node_wavelength(ShaderData *sd, float *stack, uint wavelength, uint color_out)
 {	
-	// CIE colour matching functions xBar, yBar, and zBar for
-	//	 wavelengths from 380 through 780 nanometers, every 5
-	//	 nanometers.  For a wavelength lambda in this range:
-	//		  cie_colour_match[(lambda - 380) / 5][0] = xBar
-	//		  cie_colour_match[(lambda - 380) / 5][1] = yBar
-	//		  cie_colour_match[(lambda - 380) / 5][2] = zBar
-	const float cie_colour_match[81][3] = {
-		{0.0014f,0.0000f,0.0065f}, {0.0022f,0.0001f,0.0105f}, {0.0042f,0.0001f,0.0201f},
-		{0.0076f,0.0002f,0.0362f}, {0.0143f,0.0004f,0.0679f}, {0.0232f,0.0006f,0.1102f},
-		{0.0435f,0.0012f,0.2074f}, {0.0776f,0.0022f,0.3713f}, {0.1344f,0.0040f,0.6456f},
-		{0.2148f,0.0073f,1.0391f}, {0.2839f,0.0116f,1.3856f}, {0.3285f,0.0168f,1.6230f},
-		{0.3483f,0.0230f,1.7471f}, {0.3481f,0.0298f,1.7826f}, {0.3362f,0.0380f,1.7721f},
-		{0.3187f,0.0480f,1.7441f}, {0.2908f,0.0600f,1.6692f}, {0.2511f,0.0739f,1.5281f},
-		{0.1954f,0.0910f,1.2876f}, {0.1421f,0.1126f,1.0419f}, {0.0956f,0.1390f,0.8130f},
-		{0.0580f,0.1693f,0.6162f}, {0.0320f,0.2080f,0.4652f}, {0.0147f,0.2586f,0.3533f},
-		{0.0049f,0.3230f,0.2720f}, {0.0024f,0.4073f,0.2123f}, {0.0093f,0.5030f,0.1582f},
-		{0.0291f,0.6082f,0.1117f}, {0.0633f,0.7100f,0.0782f}, {0.1096f,0.7932f,0.0573f},
-		{0.1655f,0.8620f,0.0422f}, {0.2257f,0.9149f,0.0298f}, {0.2904f,0.9540f,0.0203f},
-		{0.3597f,0.9803f,0.0134f}, {0.4334f,0.9950f,0.0087f}, {0.5121f,1.0000f,0.0057f},
-		{0.5945f,0.9950f,0.0039f}, {0.6784f,0.9786f,0.0027f}, {0.7621f,0.9520f,0.0021f},
-		{0.8425f,0.9154f,0.0018f}, {0.9163f,0.8700f,0.0017f}, {0.9786f,0.8163f,0.0014f},
-		{1.0263f,0.7570f,0.0011f}, {1.0567f,0.6949f,0.0010f}, {1.0622f,0.6310f,0.0008f},
-		{1.0456f,0.5668f,0.0006f}, {1.0026f,0.5030f,0.0003f}, {0.9384f,0.4412f,0.0002f},
-		{0.8544f,0.3810f,0.0002f}, {0.7514f,0.3210f,0.0001f}, {0.6424f,0.2650f,0.0000f},
-		{0.5419f,0.2170f,0.0000f}, {0.4479f,0.1750f,0.0000f}, {0.3608f,0.1382f,0.0000f},
-		{0.2835f,0.1070f,0.0000f}, {0.2187f,0.0816f,0.0000f}, {0.1649f,0.0610f,0.0000f},
-		{0.1212f,0.0446f,0.0000f}, {0.0874f,0.0320f,0.0000f}, {0.0636f,0.0232f,0.0000f},
-		{0.0468f,0.0170f,0.0000f}, {0.0329f,0.0119f,0.0000f}, {0.0227f,0.0082f,0.0000f},
-		{0.0158f,0.0057f,0.0000f}, {0.0114f,0.0041f,0.0000f}, {0.0081f,0.0029f,0.0000f},
-		{0.0058f,0.0021f,0.0000f}, {0.0041f,0.0015f,0.0000f}, {0.0029f,0.0010f,0.0000f},
-		{0.0020f,0.0007f,0.0000f}, {0.0014f,0.0005f,0.0000f}, {0.0010f,0.0004f,0.0000f},
-		{0.0007f,0.0002f,0.0000f}, {0.0005f,0.0002f,0.0000f}, {0.0003f,0.0001f,0.0000f},
-		{0.0002f,0.0001f,0.0000f}, {0.0002f,0.0001f,0.0000f}, {0.0001f,0.0000f,0.0000f},
-		{0.0001f,0.0000f,0.0000f}, {0.0001f,0.0000f,0.0000f}, {0.0000f,0.0000f,0.0000f}
-	};
-
 	float lambda_nm = stack_load_float(stack, wavelength);
 	float ii = (lambda_nm-380.0f) * (1.0f/5.0f);  // scaled 0..80
 	int i = float_to_int(ii);
@@ -82,7 +82,7 @@ ccl_device void svm_node_wavelength(ShaderData *sd, float *stack, uint wavelengt
 	}
 	else {
 		ii -= i;
-		const float *c = cie_colour_match[i];
+		ccl_constant float *c = cie_colour_match[i];
 		color = interp(make_float3(c[0], c[1], c[2]), make_float3(c[3], c[4], c[5]), ii);
 	}
 	
@@ -92,8 +92,7 @@ ccl_device void svm_node_wavelength(ShaderData *sd, float *stack, uint wavelengt
 	/* Clamp to zero if values are smaller */
 	color = max(color, make_float3(0.0f, 0.0f, 0.0f));
 
-	if(stack_valid(color_out))
-		stack_store_float3(stack, color_out, color);
+	stack_store_float3(stack, color_out, color);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h
index 87e40791333..3c6353c8001 100644
--- a/intern/cycles/kernel/svm/svm_wireframe.h
+++ b/intern/cycles/kernel/svm/svm_wireframe.h
@@ -41,9 +41,9 @@ ccl_device_inline float wireframe(KernelGlobals *kg,
                                   float3 *P)
 {
 #ifdef __HAIR__
-	if(ccl_fetch(sd, prim) != PRIM_NONE && ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)
+	if(sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE)
 #else
-	if(ccl_fetch(sd, prim) != PRIM_NONE)
+	if(sd->prim != PRIM_NONE)
 #endif
 	{
 		float3 Co[3];
@@ -52,12 +52,12 @@ ccl_device_inline float wireframe(KernelGlobals *kg,
 		/* Triangles */
 		int np = 3;
 
-		if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE)
-			triangle_vertices(kg, ccl_fetch(sd, prim), Co);
+		if(sd->type & PRIMITIVE_TRIANGLE)
+			triangle_vertices(kg, sd->prim, Co);
 		else
-			motion_triangle_vertices(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), Co);
+			motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co);
 
-		if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) {
+		if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 			object_position_transform(kg, sd, &Co[0]);
 			object_position_transform(kg, sd, &Co[1]);
 			object_position_transform(kg, sd, &Co[2]);
@@ -66,8 +66,8 @@ ccl_device_inline float wireframe(KernelGlobals *kg,
 		if(pixel_size) {
 			// Project the derivatives of P to the viewing plane defined
 			// by I so we have a measure of how big is a pixel at this point
-			float pixelwidth_x = len(ccl_fetch(sd, dP).dx - dot(ccl_fetch(sd, dP).dx, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
-			float pixelwidth_y = len(ccl_fetch(sd, dP).dy - dot(ccl_fetch(sd, dP).dy, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
+			float pixelwidth_x = len(sd->dP.dx - dot(sd->dP.dx, sd->I) * sd->I);
+			float pixelwidth_y = len(sd->dP.dy - dot(sd->dP.dy, sd->I) * sd->I);
 			// Take the average of both axis' length
 			pixelwidth = (pixelwidth_x + pixelwidth_y) * 0.5f;
 		}
@@ -113,20 +113,20 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg,
 	 * With OpenCL 2.0 it's possible to avoid this change, but for until
 	 * then we'll be living with such an exception.
 	 */
-	float3 P = ccl_fetch(sd, P);
+	float3 P = sd->P;
 	float f = wireframe(kg, sd, size, pixel_size, &P);
 #else
-	float f = wireframe(kg, sd, size, pixel_size, &ccl_fetch(sd, P));
+	float f = wireframe(kg, sd, size, pixel_size, &sd->P);
 #endif
 
 	/* TODO(sergey): Think of faster way to calculate derivatives. */
 	if(bump_offset == NODE_BUMP_OFFSET_DX) {
-		float3 Px = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dx;
-		f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(ccl_fetch(sd, dP).dx);
+		float3 Px = sd->P - sd->dP.dx;
+		f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(sd->dP.dx);
 	}
 	else if(bump_offset == NODE_BUMP_OFFSET_DY) {
-		float3 Py = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dy;
-		f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(ccl_fetch(sd, dP).dy);
+		float3 Py = sd->P - sd->dP.dy;
+		f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(sd->dP.dy);
 	}
 
 	if(stack_valid(out_fac))
diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt
index 8eaa9de3874..b7248354abd 100644
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -1,14 +1,6 @@
 
 set(INC
-	.
-	../device
-	../graph
-	../kernel
-	../kernel/svm
-	../kernel/osl
-	../bvh
-	../subd
-	../util
+	..
 	../../glew-mx
 )
 
@@ -31,6 +23,7 @@ set(SRC
 	mesh.cpp
 	mesh_displace.cpp
 	mesh_subdivision.cpp
+	mesh_volume.cpp
 	nodes.cpp
 	object.cpp
 	osl.cpp
@@ -79,4 +72,4 @@ include_directories(SYSTEM ${INC_SYS})
 
 add_definitions(${GL_DEFINITIONS})
 
-add_library(cycles_render ${SRC} ${SRC_HEADERS})
+cycles_add_library(cycles_render ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/render/attribute.cpp b/intern/cycles/render/attribute.cpp
index c0d429a583c..8c77687d9cc 100644
--- a/intern/cycles/render/attribute.cpp
+++ b/intern/cycles/render/attribute.cpp
@@ -14,13 +14,12 @@
  * limitations under the License.
  */
 
-#include "image.h"
-#include "mesh.h"
-#include "attribute.h"
+#include "render/image.h"
+#include "render/mesh.h"
+#include "render/attribute.h"
 
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_transform.h"
+#include "util/util_foreach.h"
+#include "util/util_transform.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -268,6 +267,8 @@ const char *Attribute::standard_name(AttributeStandard std)
 			return "particle";
 		case ATTR_STD_CURVE_INTERCEPT:
 			return "curve_intercept";
+		case ATTR_STD_CURVE_RANDOM:
+			return "curve_random";
 		case ATTR_STD_PTEX_FACE_ID:
 			return "ptex_face_id";
 		case ATTR_STD_PTEX_UV:
@@ -280,6 +281,8 @@ const char *Attribute::standard_name(AttributeStandard std)
 			return "flame";
 		case ATTR_STD_VOLUME_HEAT:
 			return "heat";
+		case ATTR_STD_VOLUME_TEMPERATURE:
+			return "temperature";
 		case ATTR_STD_VOLUME_VELOCITY:
 			return "velocity";
 		case ATTR_STD_POINTINESS:
@@ -295,9 +298,13 @@ const char *Attribute::standard_name(AttributeStandard std)
 
 AttributeStandard Attribute::name_standard(const char *name)
 {
-	for(int std = ATTR_STD_NONE; std < ATTR_STD_NUM; std++)
-		if(strcmp(name, Attribute::standard_name((AttributeStandard)std)) == 0)
-			return (AttributeStandard)std;
+	if(name) {
+		for(int std = ATTR_STD_NONE; std < ATTR_STD_NUM; std++) {
+			if(strcmp(name, Attribute::standard_name((AttributeStandard)std)) == 0) {
+				return (AttributeStandard)std;
+			}
+		}
+	}
 
 	return ATTR_STD_NONE;
 }
@@ -424,6 +431,7 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
 			case ATTR_STD_VOLUME_DENSITY:
 			case ATTR_STD_VOLUME_FLAME:
 			case ATTR_STD_VOLUME_HEAT:
+			case ATTR_STD_VOLUME_TEMPERATURE:
 				attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_VOXEL);
 				break;
 			case ATTR_STD_VOLUME_COLOR:
@@ -452,6 +460,9 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
 			case ATTR_STD_CURVE_INTERCEPT:
 				attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE_KEY);
 				break;
+			case ATTR_STD_CURVE_RANDOM:
+				attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE);
+				break;
 			case ATTR_STD_GENERATED_TRANSFORM:
 				attr = add(name, TypeDesc::TypeMatrix, ATTR_ELEMENT_MESH);
 				break;
@@ -502,6 +513,16 @@ Attribute *AttributeSet::find(AttributeRequest& req)
 		return find(req.std);
 }
 
+void AttributeSet::remove(Attribute *attribute)
+{
+	/* Dispatch to the by-standard overload unless the attribute has no standard id. */
+	if(attribute->std != ATTR_STD_NONE) {
+		remove(attribute->std);
+		return;
+	}
+	remove(attribute->name);
+}
+
 void AttributeSet::resize(bool reserve_only)
 {
 	foreach(Attribute& attr, attributes) {
@@ -514,9 +535,23 @@ void AttributeSet::resize(bool reserve_only)
 	}
 }
 
-void AttributeSet::clear()
+void AttributeSet::clear(bool preserve_voxel_data)
 {
-	attributes.clear();
+	if(!preserve_voxel_data) {
+		attributes.clear();
+		return;
+	}
+
+	/* Erase everything except voxel attributes and the generated transform. */
+	list<Attribute>::iterator next = attributes.begin();
+	while(next != attributes.end()) {
+		list<Attribute>::iterator current = next++;
+		const bool keep = (current->element == ATTR_ELEMENT_VOXEL) ||
+		                  (current->std == ATTR_STD_GENERATED_TRANSFORM);
+		if(!keep) {
+			attributes.erase(current);
+		}
+	}
 }
 
 /* AttributeRequest */
@@ -598,9 +633,11 @@ bool AttributeRequestSet::modified(const AttributeRequestSet& other)
 
 void AttributeRequestSet::add(ustring name)
 {
-	foreach(AttributeRequest& req, requests)
-		if(req.name == name)
+	foreach(AttributeRequest& req, requests) {
+		if(req.name == name) {
 			return;
+		}
+	}
 
 	requests.push_back(AttributeRequest(name));
 }
@@ -624,6 +661,22 @@ void AttributeRequestSet::add(AttributeRequestSet& reqs)
 	}
 }
 
+void AttributeRequestSet::add_standard(ustring name)
+{
+	/* Bail out when the name tests false (presumably empty -- confirm against ustring). */
+	if(!name) {
+		return;
+	}
+
+	/* Request by standard id when name_standard() yields a non-zero one, by name otherwise. */
+	const AttributeStandard standard = Attribute::name_standard(name.c_str());
+	if(!standard) {
+		add(name);
+		return;
+	}
+	add(standard);
+}
+
 bool AttributeRequestSet::find(ustring name)
 {
 	foreach(AttributeRequest& req, requests)
diff --git a/intern/cycles/render/attribute.h b/intern/cycles/render/attribute.h
index f4538c76369..5cb6c75aab2 100644
--- a/intern/cycles/render/attribute.h
+++ b/intern/cycles/render/attribute.h
@@ -17,12 +17,12 @@
 #ifndef __ATTRIBUTE_H__
 #define __ATTRIBUTE_H__
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "util_list.h"
-#include "util_param.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_list.h"
+#include "util/util_param.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -120,8 +120,10 @@ public:
 
 	Attribute *find(AttributeRequest& req);
 
+	void remove(Attribute *attribute);
+
 	void resize(bool reserve_only = false);
-	void clear();
+	void clear(bool preserve_voxel_data = false);
 };
 
 /* AttributeRequest
@@ -157,6 +159,7 @@ public:
 	void add(ustring name);
 	void add(AttributeStandard std);
 	void add(AttributeRequestSet& reqs);
+	void add_standard(ustring name);
 
 	bool find(ustring name);
 	bool find(AttributeStandard std);
diff --git a/intern/cycles/render/background.cpp b/intern/cycles/render/background.cpp
index 8d7d7b847fd..df3b65be110 100644
--- a/intern/cycles/render/background.cpp
+++ b/intern/cycles/render/background.cpp
@@ -14,17 +14,17 @@
  * limitations under the License.
  */
 
-#include "background.h"
-#include "device.h"
-#include "integrator.h"
-#include "graph.h"
-#include "nodes.h"
-#include "scene.h"
-#include "shader.h"
-
-#include "util_foreach.h"
-#include "util_math.h"
-#include "util_types.h"
+#include "render/background.h"
+#include "device/device.h"
+#include "render/integrator.h"
+#include "render/graph.h"
+#include "render/nodes.h"
+#include "render/scene.h"
+#include "render/shader.h"
+
+#include "util/util_foreach.h"
+#include "util/util_math.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -38,7 +38,10 @@ NODE_DEFINE(Background)
 	SOCKET_BOOLEAN(use_shader, "Use Shader", true);
 	SOCKET_BOOLEAN(use_ao, "Use AO", false);
 	SOCKET_UINT(visibility, "Visibility", PATH_RAY_ALL_VISIBILITY);
+
 	SOCKET_BOOLEAN(transparent, "Transparent", false);
+	SOCKET_BOOLEAN(transparent_glass, "Transparent Glass", false);
+	SOCKET_FLOAT(transparent_roughness_threshold, "Transparent Roughness Threshold", 0.0f);
 
 	SOCKET_NODE(shader, "Shader", &Shader::node_type);
 
@@ -74,18 +77,22 @@ void Background::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	/* set shader index and transparent option */
 	KernelBackground *kbackground = &dscene->data.background;
 
-	if(use_ao) {
-		kbackground->ao_factor = ao_factor;
-		kbackground->ao_distance = ao_distance;
-	}
-	else {
-		kbackground->ao_factor = 0.0f;
-		kbackground->ao_distance = FLT_MAX;
-	}
+	kbackground->ao_factor = (use_ao)? ao_factor: 0.0f;
+	kbackground->ao_bounces_factor = ao_factor;
+	kbackground->ao_distance = ao_distance;
 
 	kbackground->transparent = transparent;
 	kbackground->surface_shader = scene->shader_manager->get_shader_id(bg_shader);
 
+	if(transparent && transparent_glass) {
+		/* Square twice, once for principled BSDF convention, and once for
+		 * faster comparison in kernel with anisotropic roughness. */
+		kbackground->transparent_roughness_squared_threshold = sqr(sqr(transparent_roughness_threshold));
+	}
+	else {
+		kbackground->transparent_roughness_squared_threshold = -1.0f;
+	}
+
 	if(bg_shader->has_volume)
 		kbackground->volume_shader = kbackground->surface_shader;
 	else
diff --git a/intern/cycles/render/background.h b/intern/cycles/render/background.h
index 8029c6a9e80..145c05f1c18 100644
--- a/intern/cycles/render/background.h
+++ b/intern/cycles/render/background.h
@@ -17,9 +17,9 @@
 #ifndef __BACKGROUND_H__
 #define __BACKGROUND_H__
 
-#include "node.h"
+#include "graph/node.h"
 
-#include "util_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -30,7 +30,7 @@ class Shader;
 
 class Background : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	float ao_factor;
 	float ao_distance;
@@ -42,6 +42,9 @@ public:
 	Shader *shader;
 
 	bool transparent;
+	bool transparent_glass;
+	float transparent_roughness_threshold;
+
 	bool need_update;
 
 	Background();
diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp
index d9a297002c6..927e04abc7f 100644
--- a/intern/cycles/render/bake.cpp
+++ b/intern/cycles/render/bake.cpp
@@ -14,8 +14,13 @@
  * limitations under the License.
  */
 
-#include "bake.h"
-#include "integrator.h"
+#include "render/bake.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/shader.h"
+#include "render/integrator.h"
+
+#include "util/util_foreach.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -135,7 +140,7 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
 {
 	size_t num_pixels = bake_data->size();
 
-	int num_samples = is_aa_pass(shader_type)? scene->integrator->aa_samples : 1;
+	int num_samples = aa_samples(scene, bake_data, shader_type);
 
 	/* calculate the total pixel samples for the progress bar */
 	total_pixel_samples = 0;
@@ -146,12 +151,16 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
 	progress.reset_sample();
 	progress.set_total_pixel_samples(total_pixel_samples);
 
+	/* needs to be up to date for baking specific AA samples */
+	dscene->data.integrator.aa_samples = num_samples;
+	device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
+
 	for(size_t shader_offset = 0; shader_offset < num_pixels; shader_offset += m_shader_limit) {
 		size_t shader_size = (size_t)fminf(num_pixels - shader_offset, m_shader_limit);
 
 		/* setup input for device task */
-		device_vector<uint4> d_input;
-		uint4 *d_input_data = d_input.resize(shader_size * 2);
+		device_vector<uint4> d_input(device, "bake_input", MEM_READ_ONLY);
+		uint4 *d_input_data = d_input.alloc(shader_size * 2);
 		size_t d_input_size = 0;
 
 		for(size_t i = shader_offset; i < (shader_offset + shader_size); i++) {
@@ -165,15 +174,10 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
 		}
 
 		/* run device task */
-		device_vector<float4> d_output;
-		d_output.resize(shader_size);
-
-		/* needs to be up to data for attribute access */
-		device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
-
-		device->mem_alloc(d_input, MEM_READ_ONLY);
-		device->mem_copy_to(d_input);
-		device->mem_alloc(d_output, MEM_READ_WRITE);
+		device_vector<float4> d_output(device, "bake_output", MEM_READ_WRITE);
+		d_output.alloc(shader_size);
+		d_output.zero_to_device();
+		d_input.copy_to_device();
 
 		DeviceTask task(DeviceTask::SHADER);
 		task.shader_input = d_input.device_pointer;
@@ -191,20 +195,19 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
 		device->task_wait();
 
 		if(progress.get_cancel()) {
-			device->mem_free(d_input);
-			device->mem_free(d_output);
+			d_input.free();
+			d_output.free();
 			m_is_baking = false;
 			return false;
 		}
 
-		device->mem_copy_from(d_output, 0, 1, d_output.size(), sizeof(float4));
-		device->mem_free(d_input);
-		device->mem_free(d_output);
+		d_output.copy_from_device(0, 1, d_output.size());
+		d_input.free();
 
 		/* read result */
 		int k = 0;
 
-		float4 *offset = (float4*)d_output.data_pointer;
+		float4 *offset = d_output.data();
 
 		size_t depth = 4;
 		for(size_t i=shader_offset; i < (shader_offset + shader_size); i++) {
@@ -217,6 +220,8 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
 				}
 			}
 		}
+
+		d_output.free();
 	}
 
 	m_is_baking = false;
@@ -240,14 +245,27 @@ void BakeManager::device_free(Device * /*device*/, DeviceScene * /*dscene*/)
 {
 }
 
-bool BakeManager::is_aa_pass(ShaderEvalType type)
+int BakeManager::aa_samples(Scene *scene, BakeData *bake_data, ShaderEvalType type)
 {
-	switch(type) {
-		case SHADER_EVAL_UV:
-		case SHADER_EVAL_NORMAL:
-			return false;
-		default:
-			return true;
+	if(type == SHADER_EVAL_UV || type == SHADER_EVAL_ROUGHNESS) {
+		return 1;
+	}
+	else if(type == SHADER_EVAL_NORMAL) {
+		/* Only antialias normal if mesh has bump mapping. */
+		Object *object = scene->objects[bake_data->object()];
+
+		if(object->mesh) {
+			foreach(Shader *shader, object->mesh->used_shaders) {
+				if(shader->has_bump) {
+					return scene->integrator->aa_samples;
+				}
+			}
+		}
+
+		return 1;
+	}
+	else {
+		return scene->integrator->aa_samples;
 	}
 }
 
diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h
index aed9c5a8e75..fbb8686b8f6 100644
--- a/intern/cycles/render/bake.h
+++ b/intern/cycles/render/bake.h
@@ -17,11 +17,11 @@
 #ifndef __BAKE_H__
 #define __BAKE_H__
 
-#include "device.h"
-#include "scene.h"
+#include "device/device.h"
+#include "render/scene.h"
 
-#include "util_progress.h"
-#include "util_vector.h"
+#include "util/util_progress.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -69,7 +69,7 @@ public:
 	void device_free(Device *device, DeviceScene *dscene);
 
 	static int shader_type_to_pass_filter(ShaderEvalType type, const int pass_filter);
-	static bool is_aa_pass(ShaderEvalType type);
+	static int aa_samples(Scene *scene, BakeData *bake_data, ShaderEvalType type);
 
 	bool need_update;
 
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index f1692712d61..6f560380b40 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -16,17 +16,15 @@
 
 #include <stdlib.h>
 
-#include "buffers.h"
-#include "device.h"
-
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_hash.h"
-#include "util_image.h"
-#include "util_math.h"
-#include "util_opengl.h"
-#include "util_time.h"
-#include "util_types.h"
+#include "render/buffers.h"
+#include "device/device.h"
+
+#include "util/util_foreach.h"
+#include "util/util_hash.h"
+#include "util/util_math.h"
+#include "util/util_opengl.h"
+#include "util/util_time.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -42,6 +40,9 @@ BufferParams::BufferParams()
 	full_width = 0;
 	full_height = 0;
 
+	denoising_data_pass = false;
+	denoising_clean_pass = false;
+
 	Pass::add(PASS_COMBINED, passes);
 }
 
@@ -68,10 +69,25 @@ int BufferParams::get_passes_size()
 
 	for(size_t i = 0; i < passes.size(); i++)
 		size += passes[i].components;
-	
+
+	if(denoising_data_pass) {
+		size += DENOISING_PASS_SIZE_BASE;
+		if(denoising_clean_pass) size += DENOISING_PASS_SIZE_CLEAN;
+	}
+
 	return align_up(size, 4);
 }
 
+int BufferParams::get_denoising_offset()
+{
+	/* Sum of the component counts of every entry in `passes`. */
+	int num_components = 0;
+	for(size_t pass = 0; pass < passes.size(); pass++) {
+		num_components += passes[pass].components;
+	}
+	return num_components;
+}
+
 /* Render Buffer Task */
 
 RenderTile::RenderTile()
@@ -90,66 +106,124 @@ RenderTile::RenderTile()
 	stride = 0;
 
 	buffer = 0;
-	rng_state = 0;
 
 	buffers = NULL;
 }
 
 /* Render Buffers */
 
-RenderBuffers::RenderBuffers(Device *device_)
+RenderBuffers::RenderBuffers(Device *device)
+: buffer(device, "RenderBuffers", MEM_READ_WRITE),
+  map_neighbor_copied(false), render_time(0.0f)
 {
-	device = device_;
 }
 
 RenderBuffers::~RenderBuffers()
 {
-	device_free();
+	buffer.free();
 }
 
-void RenderBuffers::device_free()
+void RenderBuffers::reset(BufferParams& params_)
 {
-	if(buffer.device_pointer) {
-		device->mem_free(buffer);
-		buffer.clear();
-	}
+	params = params_;
 
-	if(rng_state.device_pointer) {
-		device->mem_free(rng_state);
-		rng_state.clear();
-	}
+	/* re-allocate buffer */
+	buffer.alloc(params.width*params.height*params.get_passes_size());
+	buffer.zero_to_device();
 }
 
-void RenderBuffers::reset(Device *device, BufferParams& params_)
+void RenderBuffers::zero()
 {
-	params = params_;
+	buffer.zero_to_device();
+}
 
-	/* free existing buffers */
-	device_free();
-	
-	/* allocate buffer */
-	buffer.resize(params.width*params.height*params.get_passes_size());
-	device->mem_alloc(buffer, MEM_READ_WRITE);
-	device->mem_zero(buffer);
+bool RenderBuffers::copy_from_device()
+{
+	if(!buffer.device_pointer)
+		return false;
 
-	/* allocate rng state */
-	rng_state.resize(params.width, params.height);
+	buffer.copy_from_device(0, params.width * params.get_passes_size(), params.height);
 
-	device->mem_alloc(rng_state, MEM_READ_WRITE);
+	return true;
 }
 
-bool RenderBuffers::copy_from_device()
+bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels)
 {
-	if(!buffer.device_pointer)
+	if(buffer.data() == NULL) {
 		return false;
+	}
+
+	float invsample = 1.0f/sample;
+	float scale = invsample;
+	bool variance = (offset == DENOISING_PASS_NORMAL_VAR) ||
+	                (offset == DENOISING_PASS_ALBEDO_VAR) ||
+	                (offset == DENOISING_PASS_DEPTH_VAR) ||
+	                (offset == DENOISING_PASS_COLOR_VAR);
+
+	if(offset == DENOISING_PASS_COLOR) {
+		scale *= exposure;
+	}
+	else if(offset == DENOISING_PASS_COLOR_VAR) {
+		scale *= exposure*exposure;
+	}
+
+	offset += params.get_denoising_offset();
+	int pass_stride = params.get_passes_size();
+	int size = params.width*params.height;
 
-	device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float));
+	if(variance) {
+		/* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
+		 * update does not work efficiently with atomics in the kernel. */
+		int mean_offset = offset - components;
+		float *mean = buffer.data() + mean_offset;
+		float *var = buffer.data() + offset;
+		assert(mean_offset >= 0);
+
+		if(components == 1) {
+			for(int i = 0; i < size; i++, mean += pass_stride, var += pass_stride, pixels++) {
+				pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale;
+			}
+		}
+		else if(components == 3) {
+			for(int i = 0; i < size; i++, mean += pass_stride, var += pass_stride, pixels += 3) {
+				pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale;
+				pixels[1] = max(0.0f, var[1] - mean[1]*mean[1]*invsample)*scale;
+				pixels[2] = max(0.0f, var[2] - mean[2]*mean[2]*invsample)*scale;
+			}
+		}
+		else {
+			return false;
+		}
+	}
+	else {
+		float *in = buffer.data() + offset;
+
+		if(components == 1) {
+			for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
+				pixels[0] = in[0]*scale;
+			}
+		}
+		else if(components == 3) {
+			for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
+				pixels[0] = in[0]*scale;
+				pixels[1] = in[1]*scale;
+				pixels[2] = in[2]*scale;
+			}
+		}
+		else {
+			return false;
+		}
+	}
 
 	return true;
 }
 
 bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels)
 {
+	if(buffer.data() == NULL) {
+		return false;
+	}
+
 	int pass_offset = 0;
 
 	for(size_t j = 0; j < params.passes.size(); j++) {
@@ -160,7 +234,7 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int
 			continue;
 		}
 
-		float *in = (float*)buffer.data_pointer + pass_offset;
+		float *in = buffer.data() + pass_offset;
 		int pass_stride = params.get_passes_size();
 
 		float scale = (pass.filter)? 1.0f/(float)sample: 1.0f;
@@ -168,10 +242,17 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int
 
 		int size = params.width*params.height;
 
-		if(components == 1) {
+		if(components == 1 && type == PASS_RENDER_TIME) {
+			/* Render time is not stored by kernel, but measured per tile; divide in double so width*height*sample cannot overflow int. */
+			float val = (float) (1000.0 * render_time/((double)params.width * params.height * sample));
+			for(int i = 0; i < size; i++, pixels++) {
+				pixels[0] = val;
+			}
+		}
+		else if(components == 1) {
 			assert(pass.components == components);
 
-			/* scalar */
+			/* Scalar */
 			if(type == PASS_DEPTH) {
 				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
 					float f = *in;
@@ -227,7 +308,7 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int
 					pass_offset += color_pass.components;
 				}
 
-				float *in_divide = (float*)buffer.data_pointer + pass_offset;
+				float *in_divide = buffer.data() + pass_offset;
 
 				for(int i = 0; i < size; i++, in += pass_stride, in_divide += pass_stride, pixels += 3) {
 					float3 f = make_float3(in[0], in[1], in[2]);
@@ -276,7 +357,7 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int
 					pass_offset += color_pass.components;
 				}
 
-				float *in_weight = (float*)buffer.data_pointer + pass_offset;
+				float *in_weight = buffer.data() + pass_offset;
 
 				for(int i = 0; i < size; i++, in += pass_stride, in_weight += pass_stride, pixels += 4) {
 					float4 f = make_float4(in[0], in[1], in[2], in[3]);
@@ -311,50 +392,35 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int
 
 /* Display Buffer */
 
-DisplayBuffer::DisplayBuffer(Device *device_, bool linear)
+DisplayBuffer::DisplayBuffer(Device *device, bool linear)
+: draw_width(0),
+  draw_height(0),
+  transparent(true), /* todo: determine from background */
+  half_float(linear),
+  rgba_byte(device, "display buffer byte"),
+  rgba_half(device, "display buffer half")
 {
-	device = device_;
-	draw_width = 0;
-	draw_height = 0;
-	transparent = true; /* todo: determine from background */
-	half_float = linear;
 }
 
 DisplayBuffer::~DisplayBuffer()
 {
-	device_free();
+	rgba_byte.free();
+	rgba_half.free();
 }
 
-void DisplayBuffer::device_free()
-{
-	if(rgba_byte.device_pointer) {
-		device->pixels_free(rgba_byte);
-		rgba_byte.clear();
-	}
-	if(rgba_half.device_pointer) {
-		device->pixels_free(rgba_half);
-		rgba_half.clear();
-	}
-}
-
-void DisplayBuffer::reset(Device *device, BufferParams& params_)
+void DisplayBuffer::reset(BufferParams& params_)
 {
 	draw_width = 0;
 	draw_height = 0;
 
 	params = params_;
 
-	/* free existing buffers */
-	device_free();
-
 	/* allocate display pixels */
 	if(half_float) {
-		rgba_half.resize(params.width, params.height);
-		device->pixels_alloc(rgba_half);
+		rgba_half.alloc_to_device(params.width, params.height);
 	}
 	else {
-		rgba_byte.resize(params.width, params.height);
-		device->pixels_alloc(rgba_byte);
+		rgba_byte.alloc_to_device(params.width, params.height);
 	}
 }
 
@@ -369,7 +435,8 @@ void DisplayBuffer::draw_set(int width, int height)
 void DisplayBuffer::draw(Device *device, const DeviceDrawParams& draw_params)
 {
 	if(draw_width != 0 && draw_height != 0) {
-		device_memory& rgba = rgba_data();
+		device_memory& rgba = (half_float)? (device_memory&)rgba_half:
+		                                    (device_memory&)rgba_byte;
 
 		device->draw_pixels(rgba, 0, draw_width, draw_height, params.full_x, params.full_y, params.width, params.height, transparent, draw_params);
 	}
@@ -380,47 +447,5 @@ bool DisplayBuffer::draw_ready()
 	return (draw_width != 0 && draw_height != 0);
 }
 
-void DisplayBuffer::write(Device *device, const string& filename)
-{
-	int w = draw_width;
-	int h = draw_height;
-
-	if(w == 0 || h == 0)
-		return;
-	
-	if(half_float)
-		return;
-
-	/* read buffer from device */
-	device_memory& rgba = rgba_data();
-	device->pixels_copy_from(rgba, 0, w, h);
-
-	/* write image */
-	ImageOutput *out = ImageOutput::create(filename);
-	ImageSpec spec(w, h, 4, TypeDesc::UINT8);
-	int scanlinesize = w*4*sizeof(uchar);
-
-	out->open(filename, spec);
-
-	/* conversion for different top/bottom convention */
-	out->write_image(TypeDesc::UINT8,
-		(uchar*)rgba.data_pointer + (h-1)*scanlinesize,
-		AutoStride,
-		-scanlinesize,
-		AutoStride);
-
-	out->close();
-
-	delete out;
-}
-
-device_memory& DisplayBuffer::rgba_data()
-{
-	if(half_float)
-		return rgba_half;
-	else
-		return rgba_byte;
-}
-
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h
index c9c2a21079a..dfc98fe2061 100644
--- a/intern/cycles/render/buffers.h
+++ b/intern/cycles/render/buffers.h
@@ -17,16 +17,16 @@
 #ifndef __BUFFERS_H__
 #define __BUFFERS_H__
 
-#include "device_memory.h"
+#include "device/device_memory.h"
 
-#include "film.h"
+#include "render/film.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "util_half.h"
-#include "util_string.h"
-#include "util_thread.h"
-#include "util_types.h"
+#include "util/util_half.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -51,6 +51,9 @@ public:
 
 	/* passes */
 	array<Pass> passes;
+	bool denoising_data_pass;
+	/* If only some light path types should be denoised, an additional pass is needed. */
+	bool denoising_clean_pass;
 
 	/* functions */
 	BufferParams();
@@ -59,6 +62,7 @@ public:
 	bool modified(const BufferParams& params);
 	void add_pass(PassType type);
 	int get_passes_size();
+	int get_denoising_offset();
 };
 
 /* Render Buffers */
@@ -70,21 +74,18 @@ public:
 
 	/* float buffer */
 	device_vector<float> buffer;
-	/* random number generator state */
-	device_vector<uint> rng_state;
+	bool map_neighbor_copied;
+	double render_time;
 
 	explicit RenderBuffers(Device *device);
 	~RenderBuffers();
 
-	void reset(Device *device, BufferParams& params);
+	void reset(BufferParams& params);
+	void zero();
 
 	bool copy_from_device();
 	bool get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels);
-
-protected:
-	void device_free();
-
-	Device *device;
+	bool get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels);
 };
 
 /* Display Buffer
@@ -105,25 +106,17 @@ public:
 	/* use half float? */
 	bool half_float;
 	/* byte buffer for converted result */
-	device_vector<uchar4> rgba_byte;
-	device_vector<half4> rgba_half;
+	device_pixels<uchar4> rgba_byte;
+	device_pixels<half4> rgba_half;
 
 	DisplayBuffer(Device *device, bool linear = false);
 	~DisplayBuffer();
 
-	void reset(Device *device, BufferParams& params);
-	void write(Device *device, const string& filename);
+	void reset(BufferParams& params);
 
 	void draw_set(int width, int height);
 	void draw(Device *device, const DeviceDrawParams& draw_params);
 	bool draw_ready();
-
-	device_memory& rgba_data();
-
-protected:
-	void device_free();
-
-	Device *device;
 };
 
 /* Render Tile
@@ -131,6 +124,9 @@ protected:
 
 class RenderTile {
 public:
+	typedef enum { PATH_TRACE, DENOISE } Task;
+
+	Task task;
 	int x, y, w, h;
 	int start_sample;
 	int num_samples;
@@ -138,9 +134,9 @@ public:
 	int resolution;
 	int offset;
 	int stride;
+	int tile_index;
 
 	device_ptr buffer;
-	device_ptr rng_state;
 
 	RenderBuffers *buffers;
 
diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp
index c8c51ec96d2..38936ffc094 100644
--- a/intern/cycles/render/camera.cpp
+++ b/intern/cycles/render/camera.cpp
@@ -14,18 +14,27 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "tables.h"
-
-#include "device.h"
-
-#include "util_foreach.h"
-#include "util_function.h"
-#include "util_math_cdf.h"
-#include "util_vector.h"
+#include "render/camera.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/tables.h"
+
+#include "device/device.h"
+
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_math_cdf.h"
+#include "util/util_vector.h"
+
+/* needed for calculating differentials */
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/split/kernel_split_data.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_differential.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_camera.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -73,6 +82,7 @@ NODE_DEFINE(Camera)
 	SOCKET_FLOAT(bladesrotation, "Blades Rotation", 0.0f);
 
 	SOCKET_TRANSFORM(matrix, "Matrix", transform_identity());
+	SOCKET_TRANSFORM_ARRAY(motion, "Motion", array<Transform>());
 
 	SOCKET_FLOAT(aperture_ratio, "Aperture Ratio", 1.0f);
 
@@ -128,6 +138,8 @@ NODE_DEFINE(Camera)
 	SOCKET_FLOAT(border.bottom, "Border Bottom", 0);
 	SOCKET_FLOAT(border.top, "Border Top", 0);
 
+	SOCKET_FLOAT(offscreen_dicing_scale, "Offscreen Dicing Scale", 1.0f);
+
 	return type;
 }
 
@@ -140,9 +152,6 @@ Camera::Camera()
 	height = 512;
 	resolution = 1;
 
-	motion.pre = transform_identity();
-	motion.post = transform_identity();
-	use_motion = false;
 	use_perspective_motion = false;
 
 	shutter_curve.resize(RAMP_TABLE_SIZE);
@@ -152,12 +161,12 @@ Camera::Camera()
 
 	compute_auto_viewplane();
 
-	screentoworld = transform_identity();
-	rastertoworld = transform_identity();
-	ndctoworld = transform_identity();
-	rastertocamera = transform_identity();
+	screentoworld = projection_identity();
+	rastertoworld = projection_identity();
+	ndctoworld = projection_identity();
+	rastertocamera = projection_identity();
 	cameratoworld = transform_identity();
-	worldtoraster = transform_identity();
+	worldtoraster = projection_identity();
 
 	dx = make_float3(0.0f, 0.0f, 0.0f);
 	dy = make_float3(0.0f, 0.0f, 0.0f);
@@ -166,6 +175,8 @@ Camera::Camera()
 	need_device_update = true;
 	need_flags_update = true;
 	previous_need_motion = -1;
+
+	memset(&kernel_camera, 0, sizeof(kernel_camera));
 }
 
 Camera::~Camera()
@@ -197,8 +208,17 @@ void Camera::compute_auto_viewplane()
 	}
 }
 
-void Camera::update()
+void Camera::update(Scene *scene)
 {
+	Scene::MotionType need_motion = scene->need_motion();
+
+	if(previous_need_motion != need_motion) {
+		/* The scene's motion model may have changed since the previous device
+		 * camera update, for example when one render layer has a motion pass
+		 * and another does not. */
+		need_device_update = true;
+	}
+
 	if(!need_update)
 		return;
 
@@ -219,18 +239,18 @@ void Camera::update()
 	Transform full_rastertoscreen = transform_inverse(full_screentoraster);
 
 	/* screen to camera */
-	Transform cameratoscreen;
+	ProjectionTransform cameratoscreen;
 	if(type == CAMERA_PERSPECTIVE)
-		cameratoscreen = transform_perspective(fov, nearclip, farclip);
+		cameratoscreen = projection_perspective(fov, nearclip, farclip);
 	else if(type == CAMERA_ORTHOGRAPHIC)
-		cameratoscreen = transform_orthographic(nearclip, farclip);
+		cameratoscreen = projection_orthographic(nearclip, farclip);
 	else
-		cameratoscreen = transform_identity();
+		cameratoscreen = projection_identity();
 	
-	Transform screentocamera = transform_inverse(cameratoscreen);
+	ProjectionTransform screentocamera = projection_inverse(cameratoscreen);
 
 	rastertocamera = screentocamera * rastertoscreen;
-	Transform full_rastertocamera = screentocamera * full_rastertoscreen;
+	ProjectionTransform full_rastertocamera = screentocamera * full_rastertoscreen;
 	cameratoraster = screentoraster * cameratoscreen;
 
 	cameratoworld = matrix;
@@ -248,10 +268,10 @@ void Camera::update()
 
 	/* differentials */
 	if(type == CAMERA_ORTHOGRAPHIC) {
-		dx = transform_direction(&rastertocamera, make_float3(1, 0, 0));
-		dy = transform_direction(&rastertocamera, make_float3(0, 1, 0));
-		full_dx = transform_direction(&full_rastertocamera, make_float3(1, 0, 0));
-		full_dy = transform_direction(&full_rastertocamera, make_float3(0, 1, 0));
+		dx = transform_perspective_direction(&rastertocamera, make_float3(1, 0, 0));
+		dy = transform_perspective_direction(&rastertocamera, make_float3(0, 1, 0));
+		full_dx = transform_perspective_direction(&full_rastertocamera, make_float3(1, 0, 0));
+		full_dy = transform_perspective_direction(&full_rastertocamera, make_float3(0, 1, 0));
 	}
 	else if(type == CAMERA_PERSPECTIVE) {
 		dx = transform_perspective(&rastertocamera, make_float3(1, 0, 0)) -
@@ -273,45 +293,15 @@ void Camera::update()
 	full_dx = transform_direction(&cameratoworld, full_dx);
 	full_dy = transform_direction(&cameratoworld, full_dy);
 
-	/* TODO(sergey): Support other types of camera. */
 	if(type == CAMERA_PERSPECTIVE) {
-		/* TODO(sergey): Move to an utility function and de-duplicate with
-		 * calculation above.
-		 */
-		Transform screentocamera_pre =
-		        transform_inverse(transform_perspective(fov_pre,
-		                                                nearclip,
-		                                                farclip));
-		Transform screentocamera_post =
-		        transform_inverse(transform_perspective(fov_post,
-		                                                nearclip,
-		                                                farclip));
-		perspective_motion.pre = screentocamera_pre * rastertoscreen;
-		perspective_motion.post = screentocamera_post * rastertoscreen;
-	}
+		float3 v = transform_perspective(&full_rastertocamera, make_float3(full_width, full_height, 1.0f));
 
-	need_update = false;
-	need_device_update = true;
-	need_flags_update = true;
-}
-
-void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene)
-{
-	Scene::MotionType need_motion = scene->need_motion(device->info.advanced_shading);
-
-	update();
-
-	if(previous_need_motion != need_motion) {
-		/* scene's motion model could have been changed since previous device
-		 * camera update this could happen for example in case when one render
-		 * layer has got motion pass and another not */
-		need_device_update = true;
+		frustum_right_normal = normalize(make_float3(v.z, 0.0f, -v.x));
+		frustum_top_normal = normalize(make_float3(0.0f, v.z, -v.y));
 	}
 
-	if(!need_device_update)
-		return;
-	
-	KernelCamera *kcam = &dscene->data.cam;
+	/* Compute kernel camera data. */
+	KernelCamera *kcam = &kernel_camera;
 
 	/* store matrices */
 	kcam->screentoworld = screentoworld;
@@ -322,46 +312,68 @@ void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 	kcam->worldtoscreen = worldtoscreen;
 	kcam->worldtoraster = worldtoraster;
 	kcam->worldtondc = worldtondc;
+	kcam->ndctoworld = ndctoworld;
 
 	/* camera motion */
-	kcam->have_motion = 0;
+	kcam->num_motion_steps = 0;
 	kcam->have_perspective_motion = 0;
+	kernel_camera_motion.clear();
+
+	/* Test if any of the transforms are actually different. */
+	bool have_motion = false;
+	for(size_t i = 0; i < motion.size(); i++) {
+		have_motion = have_motion || motion[i] != matrix;
+	}
 
 	if(need_motion == Scene::MOTION_PASS) {
 		/* TODO(sergey): Support perspective (zoom, fov) motion. */
 		if(type == CAMERA_PANORAMA) {
-			if(use_motion) {
-				kcam->motion.pre = transform_inverse(motion.pre);
-				kcam->motion.post = transform_inverse(motion.post);
+			if(have_motion) {
+				kcam->motion_pass_pre = transform_inverse(motion[0]);
+				kcam->motion_pass_post = transform_inverse(motion[motion.size()-1]);
 			}
 			else {
-				kcam->motion.pre = kcam->worldtocamera;
-				kcam->motion.post = kcam->worldtocamera;
+				kcam->motion_pass_pre = kcam->worldtocamera;
+				kcam->motion_pass_post = kcam->worldtocamera;
 			}
 		}
 		else {
-			if(use_motion) {
-				kcam->motion.pre = cameratoraster * transform_inverse(motion.pre);
-				kcam->motion.post = cameratoraster * transform_inverse(motion.post);
+			if(have_motion) {
+				kcam->perspective_pre = cameratoraster * transform_inverse(motion[0]);
+				kcam->perspective_post = cameratoraster * transform_inverse(motion[motion.size()-1]);
 			}
 			else {
-				kcam->motion.pre = worldtoraster;
-				kcam->motion.post = worldtoraster;
+				kcam->perspective_pre = worldtoraster;
+				kcam->perspective_post = worldtoraster;
 			}
 		}
 	}
-#ifdef __CAMERA_MOTION__
 	else if(need_motion == Scene::MOTION_BLUR) {
-		if(use_motion) {
-			transform_motion_decompose((DecompMotionTransform*)&kcam->motion, &motion, &matrix);
-			kcam->have_motion = 1;
+		if(have_motion) {
+			kernel_camera_motion.resize(motion.size());
+			transform_motion_decompose(kernel_camera_motion.data(), motion.data(), motion.size());
+			kcam->num_motion_steps = motion.size();
 		}
-		if(use_perspective_motion) {
-			kcam->perspective_motion = perspective_motion;
+
+		/* TODO(sergey): Support other types of camera. */
+		if(use_perspective_motion && type == CAMERA_PERSPECTIVE) {
+			/* TODO(sergey): Move to an utility function and de-duplicate with
+			 * calculation above.
+			 */
+			ProjectionTransform screentocamera_pre =
+					projection_inverse(projection_perspective(fov_pre,
+					                                          nearclip,
+					                                          farclip));
+			ProjectionTransform screentocamera_post =
+					projection_inverse(projection_perspective(fov_post,
+					                                          nearclip,
+					                                          farclip));
+
+			kcam->perspective_pre = screentocamera_pre * rastertoscreen;
+			kcam->perspective_post = screentocamera_post * rastertoscreen;
 			kcam->have_perspective_motion = 1;
 		}
 	}
-#endif
 
 	/* depth of field */
 	kcam->aperturesize = aperturesize;
@@ -370,26 +382,8 @@ void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 	kcam->bladesrotation = bladesrotation;
 
 	/* motion blur */
-#ifdef __CAMERA_MOTION__
 	kcam->shuttertime = (need_motion == Scene::MOTION_BLUR) ? shuttertime: -1.0f;
 
-	scene->lookup_tables->remove_table(&shutter_table_offset);
-	if(need_motion == Scene::MOTION_BLUR) {
-		vector<float> shutter_table;
-		util_cdf_inverted(SHUTTER_TABLE_SIZE,
-		                  0.0f,
-		                  1.0f,
-		                  function_bind(shutter_curve_eval, _1, shutter_curve),
-		                  false,
-		                  shutter_table);
-		shutter_table_offset = scene->lookup_tables->add_table(dscene,
-		                                                       shutter_table);
-		kcam->shutter_table_offset = (int)shutter_table_offset;
-	}
-#else
-	kcam->shuttertime = -1.0f;
-#endif
-
 	/* type */
 	kcam->type = type;
 
@@ -450,9 +444,49 @@ void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 	kcam->rolling_shutter_type = rolling_shutter_type;
 	kcam->rolling_shutter_duration = rolling_shutter_duration;
 
+	/* Set further update flags */
+	need_update = false;
+	need_device_update = true;
+	need_flags_update = true;
 	previous_need_motion = need_motion;
 }
 
+void Camera::device_update(Device * /* device */,
+                           DeviceScene *dscene,
+                           Scene *scene)
+{
+	update(scene); /* recompute host-side kernel_camera if needed; may set need_device_update */
+
+	if(!need_device_update)
+		return;
+
+	scene->lookup_tables->remove_table(&shutter_table_offset); /* always dropped; re-added below only if needed */
+	if(kernel_camera.shuttertime != -1.0f) { /* update() stores -1.0f unless motion blur is requested */
+		vector<float> shutter_table; /* filled by util_cdf_inverted(); presumably the inverted CDF of shutter_curve - confirm */
+		util_cdf_inverted(SHUTTER_TABLE_SIZE,
+		                  0.0f,
+		                  1.0f,
+		                  function_bind(shutter_curve_eval, _1, shutter_curve),
+		                  false,
+		                  shutter_table);
+		shutter_table_offset = scene->lookup_tables->add_table(dscene,
+		                                                       shutter_table);
+		kernel_camera.shutter_table_offset = (int)shutter_table_offset; /* must be set before the struct copy below */
+	}
+
+	dscene->data.cam = kernel_camera; /* copy the host-side camera struct into the device scene data */
+
+	size_t num_motion_steps = kernel_camera_motion.size(); /* non-zero only if update() decomposed motion transforms */
+	if(num_motion_steps) {
+		DecomposedTransform *camera_motion = dscene->camera_motion.alloc(num_motion_steps);
+		memcpy(camera_motion, kernel_camera_motion.data(), sizeof(*camera_motion) * num_motion_steps);
+		dscene->camera_motion.copy_to_device();
+	}
+	else {
+		dscene->camera_motion.free(); /* no motion steps: release the array in the device scene */
+	}
+}
+
 void Camera::device_update_volume(Device * /*device*/,
                                   DeviceScene *dscene,
                                   Scene *scene)
@@ -477,10 +511,11 @@ void Camera::device_update_volume(Device * /*device*/,
 }
 
 void Camera::device_free(Device * /*device*/,
-                         DeviceScene * /*dscene*/,
+                         DeviceScene *dscene,
                          Scene *scene)
 {
 	scene->lookup_tables->remove_table(&shutter_table_offset);
+	dscene->camera_motion.free();
 }
 
 bool Camera::modified(const Camera& cam)
@@ -491,7 +526,6 @@ bool Camera::modified(const Camera& cam)
 bool Camera::motion_modified(const Camera& cam)
 {
 	return !((motion == cam.motion) &&
-	         (use_motion == cam.use_motion) &&
 	         (use_perspective_motion == cam.use_perspective_motion));
 }
 
@@ -581,8 +615,27 @@ BoundBox Camera::viewplane_bounds_get()
 
 float Camera::world_to_raster_size(float3 P)
 {
+	float res = 1.0f;
+
 	if(type == CAMERA_ORTHOGRAPHIC) {
-		return min(len(full_dx), len(full_dy));
+		res = min(len(full_dx), len(full_dy));
+
+		if(offscreen_dicing_scale > 1.0f) {
+			float3 p = transform_point(&worldtocamera, P);
+			float3 v = transform_perspective(&rastertocamera, make_float3(width, height, 0.0f));
+
+			/* Create point clamped to frustum */
+			float3 c;
+			c.x = max(-v.x, min(v.x, p.x));
+			c.y = max(-v.y, min(v.y, p.y));
+			c.z = max(0.0f, p.z);
+
+			float f_dist = len(p - c) / sqrtf((v.x*v.x+v.y*v.y)*0.5f);
+
+			if(f_dist > 0.0f) {
+				res += res * f_dist * (offscreen_dicing_scale - 1.0f);
+			}
+		}
 	}
 	else if(type == CAMERA_PERSPECTIVE) {
 		/* Calculate as if point is directly ahead of the camera. */
@@ -597,14 +650,121 @@ float Camera::world_to_raster_size(float3 P)
 		/* dPdx */
 		float dist = len(transform_point(&worldtocamera, P));
 		float3 D = normalize(Ddiff);
-		return len(dist*dDdx - dot(dist*dDdx, D)*D);
+		res = len(dist*dDdx - dot(dist*dDdx, D)*D);
+
+		/* Approximate distance to the frustum (corners are not handled exactly, which is acceptable here) */
+		float f_dist = 0.0f;
+
+		if(offscreen_dicing_scale > 1.0f) {
+			float3 p = transform_point(&worldtocamera, P);
+
+			/* Distance from the four planes */
+			float r = dot(p, frustum_right_normal);
+			float t = dot(p, frustum_top_normal);
+			p = make_float3(-p.x, -p.y, p.z);
+			float l = dot(p, frustum_right_normal);
+			float b = dot(p, frustum_top_normal);
+			p = make_float3(-p.x, -p.y, p.z);
+
+			if(r <= 0.0f && l <= 0.0f && t <= 0.0f && b <= 0.0f) {
+				/* Point is inside frustum */
+				f_dist = 0.0f;
+			}
+			else if(r > 0.0f && l > 0.0f && t > 0.0f && b > 0.0f) {
+				/* Point is behind frustum */
+				f_dist = len(p);
+			}
+			else {
+				/* Point may be behind or off to the side, need to check */
+				float3 along_right = make_float3(-frustum_right_normal.z, 0.0f, frustum_right_normal.x);
+				float3 along_left = make_float3(frustum_right_normal.z, 0.0f, frustum_right_normal.x);
+				float3 along_top = make_float3(0.0f, -frustum_top_normal.z, frustum_top_normal.y);
+				float3 along_bottom = make_float3(0.0f, frustum_top_normal.z, frustum_top_normal.y);
+
+				float dist[] = {r, l, t, b};
+				float3 along[] = {along_right, along_left, along_top, along_bottom};
+
+				bool test_o = false;
+
+				float *d = dist;
+				float3 *a = along;
+				for(int i = 0; i < 4; i++, d++, a++) {
+					/* Test if we should check this side at all */
+					if(*d > 0.0f) {
+						if(dot(p, *a) >= 0.0f) {
+							/* We are in front of the back edge of this side of the frustum */
+							f_dist = max(f_dist, *d);
+						}
+						else {
+							/* Possibly far enough behind the frustum to use distance to origin instead of edge */
+							test_o = true;
+						}
+					}
+				}
+
+				if(test_o) {
+					f_dist = (f_dist > 0) ? min(f_dist, len(p)) : len(p);
+				}
+			}
+
+			if(f_dist > 0.0f) {
+				res += len(dDdx - dot(dDdx, D)*D) * f_dist * (offscreen_dicing_scale - 1.0f);
+			}
+		}
 	}
-	else {
-		// TODO(mai): implement for CAMERA_PANORAMA
-		assert(!"pixel width calculation for panoramic projection not implemented yet");
+	else if(type == CAMERA_PANORAMA) {
+		float3 D = transform_point(&worldtocamera, P);
+		float dist = len(D);
+
+		Ray ray;
+
+		/* Distortion can become so great that the results become meaningless. There
+		 * may be a better way to do this, but calculating differentials from the
+		 * point directly ahead seems to produce good enough results. */
+#if 0
+		float2 dir = direction_to_panorama(&kernel_camera, kernel_camera_motion.data(), normalize(D));
+		float3 raster = transform_perspective(&cameratoraster, make_float3(dir.x, dir.y, 0.0f));
+
+		ray.t = 1.0f;
+		camera_sample_panorama(&kernel_camera, kernel_camera_motion.data(), raster.x, raster.y, 0.0f, 0.0f, &ray);
+		if(ray.t == 0.0f) {
+			/* No differentials, just use from directly ahead. */
+			camera_sample_panorama(&kernel_camera, kernel_camera_motion.data(), 0.5f*width, 0.5f*height, 0.0f, 0.0f, &ray);
+		}
+#else
+		camera_sample_panorama(&kernel_camera, kernel_camera_motion.data(), 0.5f*width, 0.5f*height, 0.0f, 0.0f, &ray);
+#endif
+
+		differential_transfer(&ray.dP, ray.dP, ray.D, ray.dD, ray.D, dist);
+
+		return max(len(ray.dP.dx) * (float(width)/float(full_width)),
+		           len(ray.dP.dy) * (float(height)/float(full_height)));
+	}
+
+	return res;
+}
+
+bool Camera::use_motion() const
+{ /* Whether the camera carries motion data at all. */
+	return motion.size() > 1; /* true only when more than one motion transform is stored */
+}
+
+float Camera::motion_time(int step) const
+{ /* Time of a motion step, mapped linearly so step 0 is -1 and the last step is +1; step is not range-checked. */
+	return (use_motion()) ? 2.0f * step / (motion.size() - 1) - 1.0f : 0.0f; /* 0 when there is no motion */
+}
+
+int Camera::motion_step(float time) const
+{ /* Inverse of motion_time(): index of the step whose time equals `time`, or -1 if there is none. */
+	if(use_motion()) {
+		for(int step = 0; step < (int)motion.size(); step++) { /* cast avoids a signed/unsigned comparison */
+			if(time == motion_time(step)) { /* exact match only; no nearest-step search */
+				return step;
+			}
+		}
 	}
 
-	return 1.0f;
+	return -1; /* no motion, or time is not exactly one of the step times */
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h
index 141ef9cccef..37d05c01bd9 100644
--- a/intern/cycles/render/camera.h
+++ b/intern/cycles/render/camera.h
@@ -17,13 +17,14 @@
 #ifndef __CAMERA_H__
 #define __CAMERA_H__
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "node.h"
+#include "graph/node.h"
 
-#include "util_boundbox.h"
-#include "util_transform.h"
-#include "util_types.h"
+#include "util/util_boundbox.h"
+#include "util/util_projection.h"
+#include "util/util_transform.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -39,7 +40,7 @@ class Scene;
 
 class Camera : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	/* Specifies an offset for the shutter's time interval. */
 	enum MotionPosition {
@@ -129,6 +130,8 @@ public:
 	BoundBox2D viewplane;
 	/* width and height change during preview, so we need these for calculating dice rates. */
 	int full_width, full_height;
+	/* controls how fast the dicing rate falls off for geometry outside of the view */
+	float offscreen_dicing_scale;
 
 	/* border */
 	BoundBox2D border;
@@ -138,24 +141,23 @@ public:
 	Transform matrix;
 
 	/* motion */
-	MotionTransform motion;
-	bool use_motion, use_perspective_motion;
+	array<Transform> motion;
+	bool use_perspective_motion;
 	float fov_pre, fov_post;
-	PerspectiveMotionTransform perspective_motion;
 
 	/* computed camera parameters */
-	Transform screentoworld;
-	Transform rastertoworld;
-	Transform ndctoworld;
+	ProjectionTransform screentoworld;
+	ProjectionTransform rastertoworld;
+	ProjectionTransform ndctoworld;
 	Transform cameratoworld;
 
-	Transform worldtoraster;
-	Transform worldtoscreen;
-	Transform worldtondc;
+	ProjectionTransform worldtoraster;
+	ProjectionTransform worldtoscreen;
+	ProjectionTransform worldtondc;
 	Transform worldtocamera;
 
-	Transform rastertocamera;
-	Transform cameratoraster;
+	ProjectionTransform rastertocamera;
+	ProjectionTransform cameratoraster;
 
 	float3 dx;
 	float3 dy;
@@ -163,19 +165,26 @@ public:
 	float3 full_dx;
 	float3 full_dy;
 
+	float3 frustum_right_normal;
+	float3 frustum_top_normal;
+
 	/* update */
 	bool need_update;
 	bool need_device_update;
 	bool need_flags_update;
 	int previous_need_motion;
 
+	/* Kernel camera data, copied here for dicing. */
+	KernelCamera kernel_camera;
+	array<DecomposedTransform> kernel_camera_motion;
+
 	/* functions */
 	Camera();
 	~Camera();
 	
 	void compute_auto_viewplane();
 
-	void update();
+	void update(Scene *scene);
 
 	void device_update(Device *device, DeviceScene *dscene, Scene *scene);
 	void device_update_volume(Device *device, DeviceScene *dscene, Scene *scene);
@@ -191,6 +200,11 @@ public:
 	/* Calculates the width of a pixel at point in world space. */
 	float world_to_raster_size(float3 P);
 
+	/* Motion blur. */
+	float motion_time(int step) const;
+	int motion_step(float time) const;
+	bool use_motion() const;
+
 private:
 	/* Private utility functions. */
 	float3 transform_raster_to_world(float raster_x, float raster_y);
diff --git a/intern/cycles/render/constant_fold.cpp b/intern/cycles/render/constant_fold.cpp
index b7f25663bc3..943b218f0e4 100644
--- a/intern/cycles/render/constant_fold.cpp
+++ b/intern/cycles/render/constant_fold.cpp
@@ -14,11 +14,11 @@
  * limitations under the License.
  */
 
-#include "constant_fold.h"
-#include "graph.h"
+#include "render/constant_fold.h"
+#include "render/graph.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -160,6 +160,14 @@ bool ConstantFolder::try_bypass_or_make_constant(ShaderInput *input, bool clamp)
 		bypass(input->link);
 		return true;
 	}
+	else {
+		/* disconnect other inputs if we can't fully bypass due to clamp */
+		foreach(ShaderInput *other, node->inputs) {
+			if(other != input && other->link) {
+				graph->disconnect(other);
+			}
+		}
+	}
 
 	return false;
 }
diff --git a/intern/cycles/render/constant_fold.h b/intern/cycles/render/constant_fold.h
index 7962698319f..33f93b8c0ab 100644
--- a/intern/cycles/render/constant_fold.h
+++ b/intern/cycles/render/constant_fold.h
@@ -17,8 +17,8 @@
 #ifndef __CONSTANT_FOLD_H__
 #define __CONSTANT_FOLD_H__
 
-#include "util_types.h"
-#include "svm_types.h"
+#include "util/util_types.h"
+#include "kernel/svm/svm_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/curves.cpp b/intern/cycles/render/curves.cpp
index f671eb19cae..4c085b928fb 100644
--- a/intern/cycles/render/curves.cpp
+++ b/intern/cycles/render/curves.cpp
@@ -14,16 +14,16 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "curves.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-
-#include "util_foreach.h"
-#include "util_map.h"
-#include "util_progress.h"
-#include "util_vector.h"
+#include "device/device.h"
+#include "render/curves.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+
+#include "util/util_foreach.h"
+#include "util/util_map.h"
+#include "util/util_progress.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/curves.h b/intern/cycles/render/curves.h
index e41967eebf5..8834764bd63 100644
--- a/intern/cycles/render/curves.h
+++ b/intern/cycles/render/curves.h
@@ -17,8 +17,8 @@
 #ifndef __CURVES_H__
 #define __CURVES_H__
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 923252bb375..69828cc78da 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -14,19 +14,18 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "device.h"
-#include "film.h"
-#include "integrator.h"
-#include "mesh.h"
-#include "scene.h"
-#include "tables.h"
-
-#include "util_algorithm.h"
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_math.h"
-#include "util_math_cdf.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/film.h"
+#include "render/integrator.h"
+#include "render/mesh.h"
+#include "render/scene.h"
+#include "render/tables.h"
+
+#include "util/util_algorithm.h"
+#include "util/util_foreach.h"
+#include "util/util_math.h"
+#include "util/util_math_cdf.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -85,83 +84,81 @@ void Pass::add(PassType type, array<Pass>& passes)
 			pass.components = 1;
 			pass.filter = false;
 			break;
-		case PASS_DIFFUSE_COLOR:
-		case PASS_GLOSSY_COLOR:
-		case PASS_TRANSMISSION_COLOR:
-		case PASS_SUBSURFACE_COLOR:
-			pass.components = 4;
-			break;
-		case PASS_DIFFUSE_INDIRECT:
+
+		case PASS_EMISSION:
+		case PASS_BACKGROUND:
 			pass.components = 4;
 			pass.exposure = true;
-			pass.divide_type = PASS_DIFFUSE_COLOR;
 			break;
-		case PASS_GLOSSY_INDIRECT:
+		case PASS_AO:
 			pass.components = 4;
-			pass.exposure = true;
-			pass.divide_type = PASS_GLOSSY_COLOR;
 			break;
-		case PASS_TRANSMISSION_INDIRECT:
+		case PASS_SHADOW:
 			pass.components = 4;
-			pass.exposure = true;
-			pass.divide_type = PASS_TRANSMISSION_COLOR;
+			pass.exposure = false;
 			break;
-		case PASS_SUBSURFACE_INDIRECT:
+		case PASS_LIGHT:
+			/* This isn't a real pass, used by baking to see whether
+			 * light data is needed or not.
+			 *
+			 * Set components to 0 so pass sort below happens in a
+			 * deterministic way.
+			 */
+			pass.components = 0;
+			break;
+#ifdef WITH_CYCLES_DEBUG
+		case PASS_BVH_TRAVERSED_NODES:
+		case PASS_BVH_TRAVERSED_INSTANCES:
+		case PASS_BVH_INTERSECTIONS:
+		case PASS_RAY_BOUNCES:
+			pass.components = 1;
+			pass.exposure = false;
+			break;
+#endif
+		case PASS_RENDER_TIME:
+			/* This pass is handled entirely on the host side. */
+			pass.components = 0;
+			break;
+
+		case PASS_DIFFUSE_COLOR:
+		case PASS_GLOSSY_COLOR:
+		case PASS_TRANSMISSION_COLOR:
+		case PASS_SUBSURFACE_COLOR:
 			pass.components = 4;
-			pass.exposure = true;
-			pass.divide_type = PASS_SUBSURFACE_COLOR;
 			break;
 		case PASS_DIFFUSE_DIRECT:
+		case PASS_DIFFUSE_INDIRECT:
 			pass.components = 4;
 			pass.exposure = true;
 			pass.divide_type = PASS_DIFFUSE_COLOR;
 			break;
 		case PASS_GLOSSY_DIRECT:
+		case PASS_GLOSSY_INDIRECT:
 			pass.components = 4;
 			pass.exposure = true;
 			pass.divide_type = PASS_GLOSSY_COLOR;
 			break;
 		case PASS_TRANSMISSION_DIRECT:
+		case PASS_TRANSMISSION_INDIRECT:
 			pass.components = 4;
 			pass.exposure = true;
 			pass.divide_type = PASS_TRANSMISSION_COLOR;
 			break;
 		case PASS_SUBSURFACE_DIRECT:
+		case PASS_SUBSURFACE_INDIRECT:
 			pass.components = 4;
 			pass.exposure = true;
 			pass.divide_type = PASS_SUBSURFACE_COLOR;
 			break;
-
-		case PASS_EMISSION:
-		case PASS_BACKGROUND:
+		case PASS_VOLUME_DIRECT:
+		case PASS_VOLUME_INDIRECT:
 			pass.components = 4;
 			pass.exposure = true;
 			break;
-		case PASS_AO:
-			pass.components = 4;
-			break;
-		case PASS_SHADOW:
-			pass.components = 4;
-			pass.exposure = false;
-			break;
-		case PASS_LIGHT:
-			/* This isn't a real pass, used by baking to see whether
-			 * light data is needed or not.
-			 *
-			 * Set components to 0 so pass sort below happens in a
-			 * determined way.
-			 */
-			pass.components = 0;
-			break;
-#ifdef WITH_CYCLES_DEBUG
-		case PASS_BVH_TRAVERSED_NODES:
-		case PASS_BVH_TRAVERSED_INSTANCES:
-		case PASS_BVH_INTERSECTIONS:
-		case PASS_RAY_BOUNCES:
-			pass.components = 1;
-			pass.exposure = false;
+
+		default:
+			assert(false);
 			break;
-#endif
 	}
 
 	passes.push_back_slow(pass);
@@ -279,6 +276,10 @@ NODE_DEFINE(Film)
 
 	SOCKET_BOOLEAN(use_sample_clamp, "Use Sample Clamp", false);
 
+	SOCKET_BOOLEAN(denoising_data_pass,  "Generate Denoising Data Pass",  false);
+	SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
+	SOCKET_INT(denoising_flags, "Denoising Flags", 0);
+
 	return type;
 }
 
@@ -309,12 +310,25 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 	/* update __data */
 	kfilm->exposure = exposure;
 	kfilm->pass_flag = 0;
+	kfilm->light_pass_flag = 0;
 	kfilm->pass_stride = 0;
 	kfilm->use_light_pass = use_light_visibility || use_sample_clamp;
 
 	for(size_t i = 0; i < passes.size(); i++) {
 		Pass& pass = passes[i];
-		kfilm->pass_flag |= pass.type;
+
+		if(pass.type == PASS_NONE)
+			continue;
+
+		int pass_flag = (1 << (pass.type % 32));
+		if(pass.type <= PASS_CATEGORY_MAIN_END) {
+			kfilm->pass_flag |= pass_flag;
+		}
+		else {
+			assert(pass.type <= PASS_CATEGORY_LIGHT_END);
+			kfilm->use_light_pass = 1;
+			kfilm->light_pass_flag |= pass_flag;
+		}
 
 		switch(pass.type) {
 			case PASS_COMBINED:
@@ -323,10 +337,6 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 			case PASS_DEPTH:
 				kfilm->pass_depth = kfilm->pass_stride;
 				break;
-			case PASS_MIST:
-				kfilm->pass_mist = kfilm->pass_stride;
-				kfilm->use_light_pass = 1;
-				break;
 			case PASS_NORMAL:
 				kfilm->pass_normal = kfilm->pass_stride;
 				break;
@@ -345,74 +355,67 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 			case PASS_MATERIAL_ID:
 				kfilm->pass_material_id = kfilm->pass_stride;
 				break;
+
+			case PASS_MIST:
+				kfilm->pass_mist = kfilm->pass_stride;
+				break;
+			case PASS_EMISSION:
+				kfilm->pass_emission = kfilm->pass_stride;
+				break;
+			case PASS_BACKGROUND:
+				kfilm->pass_background = kfilm->pass_stride;
+				break;
+			case PASS_AO:
+				kfilm->pass_ao = kfilm->pass_stride;
+				break;
+			case PASS_SHADOW:
+				kfilm->pass_shadow = kfilm->pass_stride;
+				break;
+
+			case PASS_LIGHT:
+				break;
+
 			case PASS_DIFFUSE_COLOR:
 				kfilm->pass_diffuse_color = kfilm->pass_stride;
-				kfilm->use_light_pass = 1;
 				break;
 			case PASS_GLOSSY_COLOR:
 				kfilm->pass_glossy_color = kfilm->pass_stride;
-				kfilm->use_light_pass = 1;
 				break;
 			case PASS_TRANSMISSION_COLOR:
 				kfilm->pass_transmission_color = kfilm->pass_stride;
-				kfilm->use_light_pass = 1;
 				break;
 			case PASS_SUBSURFACE_COLOR:
 				kfilm->pass_subsurface_color = kfilm->pass_stride;
-				kfilm->use_light_pass = 1;
 				break;
 			case PASS_DIFFUSE_INDIRECT:
 				kfilm->pass_diffuse_indirect = kfilm->pass_stride;
-				kfilm->use_light_pass = 1;
 				break;
 			case PASS_GLOSSY_INDIRECT:
 				kfilm->pass_glossy_indirect = kfilm->pass_stride;
-				kfilm->use_light_pass = 1;
 				break;
 			case PASS_TRANSMISSION_INDIRECT:
 				kfilm->pass_transmission_indirect = kfilm->pass_stride;
-				kfilm->use_light_pass = 1;
 				break;
 			case PASS_SUBSURFACE_INDIRECT:
 				kfilm->pass_subsurface_indirect = kfilm->pass_stride;
-				kfilm->use_light_pass = 1;
+				break;
+			case PASS_VOLUME_INDIRECT:
+				kfilm->pass_volume_indirect = kfilm->pass_stride;
 				break;
 			case PASS_DIFFUSE_DIRECT:
 				kfilm->pass_diffuse_direct = kfilm->pass_stride;
-				kfilm->use_light_pass = 1;
 				break;
 			case PASS_GLOSSY_DIRECT:
 				kfilm->pass_glossy_direct = kfilm->pass_stride;
-				kfilm->use_light_pass = 1;
 				break;
 			case PASS_TRANSMISSION_DIRECT:
 				kfilm->pass_transmission_direct = kfilm->pass_stride;
-				kfilm->use_light_pass = 1;
 				break;
 			case PASS_SUBSURFACE_DIRECT:
 				kfilm->pass_subsurface_direct = kfilm->pass_stride;
-				kfilm->use_light_pass = 1;
-				break;
-
-			case PASS_EMISSION:
-				kfilm->pass_emission = kfilm->pass_stride;
-				kfilm->use_light_pass = 1;
 				break;
-			case PASS_BACKGROUND:
-				kfilm->pass_background = kfilm->pass_stride;
-				kfilm->use_light_pass = 1;
-				break;
-			case PASS_AO:
-				kfilm->pass_ao = kfilm->pass_stride;
-				kfilm->use_light_pass = 1;
-				break;
-			case PASS_SHADOW:
-				kfilm->pass_shadow = kfilm->pass_stride;
-				kfilm->use_light_pass = 1;
-				break;
-
-			case PASS_LIGHT:
-				kfilm->use_light_pass = 1;
+			case PASS_VOLUME_DIRECT:
+				kfilm->pass_volume_direct = kfilm->pass_stride;
 				break;
 
 #ifdef WITH_CYCLES_DEBUG
@@ -429,14 +432,31 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 				kfilm->pass_ray_bounces = kfilm->pass_stride;
 				break;
 #endif
+			case PASS_RENDER_TIME:
+				break;
 
-			case PASS_NONE:
+			default:
+				assert(false);
 				break;
 		}
 
 		kfilm->pass_stride += pass.components;
 	}
 
+	kfilm->pass_denoising_data = 0;
+	kfilm->pass_denoising_clean = 0;
+	kfilm->denoising_flags = 0;
+	if(denoising_data_pass) {
+		kfilm->pass_denoising_data = kfilm->pass_stride;
+		kfilm->pass_stride += DENOISING_PASS_SIZE_BASE;
+		kfilm->denoising_flags = denoising_flags;
+		if(denoising_clean_pass) {
+			kfilm->pass_denoising_clean = kfilm->pass_stride;
+			kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN;
+			kfilm->use_light_pass = 1;
+		}
+	}
+
 	kfilm->pass_stride = align_up(kfilm->pass_stride, 4);
 	kfilm->pass_alpha_threshold = pass_alpha_threshold;
 
@@ -451,6 +471,10 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 	kfilm->mist_inv_depth = (mist_depth > 0.0f)? 1.0f/mist_depth: 0.0f;
 	kfilm->mist_falloff = mist_falloff;
 
+	pass_stride = kfilm->pass_stride;
+	denoising_data_offset = kfilm->pass_denoising_data;
+	denoising_clean_offset = kfilm->pass_denoising_clean;
+
 	need_update = false;
 }
 
@@ -472,7 +496,7 @@ void Film::tag_passes_update(Scene *scene, const array<Pass>& passes_)
 		scene->mesh_manager->tag_update(scene);
 
 		foreach(Shader *shader, scene->shaders)
-			shader->need_update_attributes = true;
+			shader->need_update_mesh = true;
 	}
 	else if(Pass::contains(passes, PASS_MOTION) != Pass::contains(passes_, PASS_MOTION))
 		scene->mesh_manager->tag_update(scene);
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index 9fa51c51f52..29b1e7e9157 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -17,12 +17,12 @@
 #ifndef __FILM_H__
 #define __FILM_H__
 
-#include "util_string.h"
-#include "util_vector.h"
+#include "util/util_string.h"
+#include "util/util_vector.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "node.h"
+#include "graph/node.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -53,12 +53,19 @@ public:
 
 class Film : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	float exposure;
 	array<Pass> passes;
+	bool denoising_data_pass;
+	bool denoising_clean_pass;
+	int denoising_flags;
 	float pass_alpha_threshold;
 
+	int pass_stride;
+	int denoising_data_offset;
+	int denoising_clean_offset;
+
 	FilterType filter_type;
 	float filter_width;
 	size_t filter_table_offset;
diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp
index f6c83fb5c7e..096de878e51 100644
--- a/intern/cycles/render/graph.cpp
+++ b/intern/cycles/render/graph.cpp
@@ -14,17 +14,18 @@
  * limitations under the License.
  */
 
-#include "attribute.h"
-#include "graph.h"
-#include "nodes.h"
-#include "shader.h"
-#include "constant_fold.h"
-
-#include "util_algorithm.h"
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_queue.h"
-#include "util_logging.h"
+#include "render/attribute.h"
+#include "render/graph.h"
+#include "render/nodes.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/constant_fold.h"
+
+#include "util/util_algorithm.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_md5.h"
+#include "util/util_queue.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -195,6 +196,7 @@ bool ShaderNode::equals(const ShaderNode& other)
 ShaderGraph::ShaderGraph()
 {
 	finalized = false;
+	simplified = false;
 	num_node_ids = 0;
 	add(new OutputNode());
 }
@@ -207,6 +209,8 @@ ShaderGraph::~ShaderGraph()
 ShaderNode *ShaderGraph::add(ShaderNode *node)
 {
 	assert(!finalized);
+	simplified = false;
+
 	node->id = num_node_ids++;
 	nodes.push_back(node);
 	return node;
@@ -217,26 +221,6 @@ OutputNode *ShaderGraph::output()
 	return (OutputNode*)nodes.front();
 }
 
-ShaderGraph *ShaderGraph::copy()
-{
-	ShaderGraph *newgraph = new ShaderGraph();
-
-	/* copy nodes */
-	ShaderNodeSet nodes_all;
-	foreach(ShaderNode *node, nodes)
-		nodes_all.insert(node);
-
-	ShaderNodeMap nodes_copy;
-	copy_nodes(nodes_all, nodes_copy);
-
-	/* add nodes (in same order, so output is still first) */
-	newgraph->clear_nodes();
-	foreach(ShaderNode *node, nodes)
-		newgraph->add(nodes_copy[node]);
-
-	return newgraph;
-}
-
 void ShaderGraph::connect(ShaderOutput *from, ShaderInput *to)
 {
 	assert(!finalized);
@@ -273,6 +257,7 @@ void ShaderGraph::connect(ShaderOutput *from, ShaderInput *to)
 void ShaderGraph::disconnect(ShaderOutput *from)
 {
 	assert(!finalized);
+	simplified = false;
 
 	foreach(ShaderInput *sock, from->links) {
 		sock->link = NULL;
@@ -285,6 +270,7 @@ void ShaderGraph::disconnect(ShaderInput *to)
 {
 	assert(!finalized);
 	assert(to->link);
+	simplified = false;
 
 	ShaderOutput *from = to->link;
 
@@ -294,6 +280,8 @@ void ShaderGraph::disconnect(ShaderInput *to)
 
 void ShaderGraph::relink(ShaderNode *node, ShaderOutput *from, ShaderOutput *to)
 {
+	simplified = false;
+
 	/* Copy because disconnect modifies this list */
 	vector<ShaderInput*> outputs = from->links;
 
@@ -310,9 +298,19 @@ void ShaderGraph::relink(ShaderNode *node, ShaderOutput *from, ShaderOutput *to)
 	}
 }
 
+void ShaderGraph::simplify(Scene *scene)
+{
+	/* Skip when already simplified; add/disconnect/relink reset the flag. */
+	if(simplified)
+		return;
+	default_inputs(scene->shader_manager->use_osl());
+	clean(scene);
+	refine_bump_nodes();
+	simplified = true;
+}
+
 void ShaderGraph::finalize(Scene *scene,
                            bool do_bump,
-                           bool do_osl,
                            bool do_simplify,
                            bool bump_in_object_space)
 {
@@ -322,9 +320,7 @@ void ShaderGraph::finalize(Scene *scene,
 	 * modified afterwards. */
 
 	if(!finalized) {
-		default_inputs(do_osl);
-		clean(scene);
-		refine_bump_nodes();
+		simplify(scene);
 
 		if(do_bump)
 			bump_from_displacement(bump_in_object_space);
@@ -405,7 +401,8 @@ void ShaderGraph::copy_nodes(ShaderNodeSet& nodes, ShaderNodeMap& nnodemap)
 /* Graph simplification */
 /* ******************** */
 
-/* Step 1: Remove proxy nodes.
+/* Remove proxy nodes.
+ *
  * These only exists temporarily when exporting groups, and we must remove them
  * early so that node->attributes() and default links do not see them.
  */
@@ -475,7 +472,8 @@ void ShaderGraph::remove_proxy_nodes()
 	}
 }
 
-/* Step 2: Constant folding.
+/* Constant folding.
+ *
  * Try to constant fold some nodes, and pipe result directly to
  * the input socket of connected nodes.
  */
@@ -529,14 +527,14 @@ void ShaderGraph::constant_fold()
 	 * that happens to ensure there is still a valid graph for displacement.
 	 */
 	if(has_displacement && !output()->input("Displacement")->link) {
-		ValueNode *value = (ValueNode*)add(new ValueNode());
+		ColorNode *value = (ColorNode*)add(new ColorNode());
 		value->value = output()->displacement;
 
-		connect(value->output("Value"), output()->input("Displacement"));
+		connect(value->output("Color"), output()->input("Displacement"));
 	}
 }
 
-/* Step 3: Simplification. */
+/* Simplification. */
 void ShaderGraph::simplify_settings(Scene *scene)
 {
 	foreach(ShaderNode *node, nodes) {
@@ -544,7 +542,7 @@ void ShaderGraph::simplify_settings(Scene *scene)
 	}
 }
 
-/* Step 4: Deduplicate nodes with same settings. */
+/* Deduplicate nodes with same settings. */
 void ShaderGraph::deduplicate_nodes()
 {
 	/* NOTES:
@@ -620,6 +618,48 @@ void ShaderGraph::deduplicate_nodes()
 	}
 }
 
+/* Cut the link into the output's "Volume" socket when nothing upstream has
+ * volume support. Only that one link is removed: the same closure output may
+ * also feed other sockets. */
+void ShaderGraph::verify_volume_output()
+{
+	/* Nothing to verify when the volume socket is not linked at all. */
+	ShaderInput *volume_in = output()->input("Volume");
+	if(volume_in->link == NULL) {
+		return;
+	}
+	bool has_valid_volume = false;
+	ShaderNodeSet scheduled;
+	queue<ShaderNode*> traverse_queue;
+	/* Seed the breadth-first walk with the node feeding the volume socket. */
+	traverse_queue.push(volume_in->link->parent);
+	scheduled.insert(volume_in->link->parent);
+	/* Walk upstream through input links; `scheduled` prevents revisits. */
+	while(!traverse_queue.empty()) {
+		ShaderNode *node = traverse_queue.front();
+		traverse_queue.pop();
+		/* One volume-capable node is enough to keep the output; stop early. */
+		if(node->has_volume_support()) {
+			has_valid_volume = true;
+			break;
+		}
+		foreach(ShaderInput *input, node->inputs) {
+			if(input->link == NULL) {
+				continue;
+			}
+			if(scheduled.find(input->link->parent) != scheduled.end()) {
+				continue;
+			}
+			traverse_queue.push(input->link->parent);
+			scheduled.insert(input->link->parent);
+		}
+	}
+	if(!has_valid_volume) {
+		VLOG(1) << "Disconnect meaningless volume output.";
+		disconnect(volume_in);
+	}
+}
+
 void ShaderGraph::break_cycles(ShaderNode *node, vector<bool>& visited, vector<bool>& on_stack)
 {
 	visited[node->id] = true;
@@ -644,20 +684,41 @@ void ShaderGraph::break_cycles(ShaderNode *node, vector<bool>& visited, vector<b
 	on_stack[node->id] = false;
 }
 
+void ShaderGraph::compute_displacement_hash()
+{
+	/* Store in displacement_hash an MD5 over every node the "Displacement"
+	 * socket depends on, to detect if displacement needs to be recomputed. */
+	ShaderInput *displacement_in = output()->input("Displacement");
+
+	if(!displacement_in->link) {
+		displacement_hash = ""; /* Nothing linked: store an empty hash. */
+		return;
+	}
+
+	ShaderNodeSet nodes_displace;
+	find_dependencies(nodes_displace, displacement_in);
+
+	MD5Hash md5;
+	foreach(ShaderNode *node, nodes_displace) {
+		node->hash(md5); /* Node contribution; then one id per input below. */
+		foreach(ShaderInput *input, node->inputs) {
+			int link_id = (input->link) ? input->link->parent->id : 0; /* NOTE(review): 0 is also a node id; socket not hashed. */
+			md5.append((uint8_t*)&link_id, sizeof(link_id));
+		}
+	}
+
+	displacement_hash = md5.get_hex();
+}
+
 void ShaderGraph::clean(Scene *scene)
 {
 	/* Graph simplification */
 
-	/* 1: Remove proxy nodes was already done. */
-
-	/* 2: Constant folding. */
+	/* NOTE: Remove proxy nodes was already done. */
 	constant_fold();
-
-	/* 3: Simplification. */
 	simplify_settings(scene);
-
-	/* 4: De-duplication. */
 	deduplicate_nodes();
+	verify_volume_output();
 
 	/* we do two things here: find cycles and break them, and remove unused
 	 * nodes that don't feed into the output. how cycles are broken is
@@ -827,7 +888,7 @@ void ShaderGraph::bump_from_displacement(bool use_object_space)
 
 	if(!displacement_in->link)
 		return;
-	
+
 	/* find dependencies for the given input */
 	ShaderNodeSet nodes_displace;
 	find_dependencies(nodes_displace, displacement_in);
@@ -859,15 +920,34 @@ void ShaderGraph::bump_from_displacement(bool use_object_space)
 	/* add bump node and connect copied graphs to it */
 	BumpNode *bump = (BumpNode*)add(new BumpNode());
 	bump->use_object_space = use_object_space;
+	bump->distance = 1.0f;
 
 	ShaderOutput *out = displacement_in->link;
 	ShaderOutput *out_center = nodes_center[out->parent]->output(out->name());
 	ShaderOutput *out_dx = nodes_dx[out->parent]->output(out->name());
 	ShaderOutput *out_dy = nodes_dy[out->parent]->output(out->name());
 
-	connect(out_center, bump->input("SampleCenter"));
-	connect(out_dx, bump->input("SampleX"));
-	connect(out_dy, bump->input("SampleY"));
+	/* convert displacement vector to height */
+	VectorMathNode *dot_center = (VectorMathNode*)add(new VectorMathNode());
+	VectorMathNode *dot_dx = (VectorMathNode*)add(new VectorMathNode());
+	VectorMathNode *dot_dy = (VectorMathNode*)add(new VectorMathNode());
+
+	dot_center->type = NODE_VECTOR_MATH_DOT_PRODUCT;
+	dot_dx->type = NODE_VECTOR_MATH_DOT_PRODUCT;
+	dot_dy->type = NODE_VECTOR_MATH_DOT_PRODUCT;
+
+	GeometryNode *geom = (GeometryNode*)add(new GeometryNode());
+	connect(geom->output("Normal"), dot_center->input("Vector2"));
+	connect(geom->output("Normal"), dot_dx->input("Vector2"));
+	connect(geom->output("Normal"), dot_dy->input("Vector2"));
+
+	connect(out_center, dot_center->input("Vector1"));
+	connect(out_dx, dot_dx->input("Vector1"));
+	connect(out_dy, dot_dy->input("Vector1"));
+
+	connect(dot_center->output("Value"), bump->input("SampleCenter"));
+	connect(dot_dx->output("Value"), bump->input("SampleX"));
+	connect(dot_dy->output("Value"), bump->input("SampleY"));
 	
 	/* connect the bump out to the set normal in: */
 	connect(bump->output("Normal"), set_normal->input("Direction"));
@@ -980,6 +1060,12 @@ int ShaderGraph::get_num_closures()
 		else if(CLOSURE_IS_BSDF_MULTISCATTER(closure_type)) {
 			num_closures += 2;
 		}
+		else if(CLOSURE_IS_PRINCIPLED(closure_type)) {
+			num_closures += 8;
+		}
+		else if(CLOSURE_IS_VOLUME(closure_type)) {
+			num_closures += VOLUME_STACK_SIZE;
+		}
 		else {
 			++num_closures;
 		}
diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h
index 780fdf49ca4..2c134932b3c 100644
--- a/intern/cycles/render/graph.h
+++ b/intern/cycles/render/graph.h
@@ -17,17 +17,17 @@
 #ifndef __GRAPH_H__
 #define __GRAPH_H__
 
-#include "node.h"
-#include "node_type.h"
+#include "graph/node.h"
+#include "graph/node_type.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "util_list.h"
-#include "util_map.h"
-#include "util_param.h"
-#include "util_set.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_list.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_set.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -42,6 +42,7 @@ class SVMCompiler;
 class OSLCompiler;
 class OutputNode;
 class ConstantFolder;
+class MD5Hash;
 
 /* Bump
  *
@@ -151,11 +152,14 @@ public:
 	virtual bool has_surface_emission() { return false; }
 	virtual bool has_surface_transparent() { return false; }
 	virtual bool has_surface_bssrdf() { return false; }
+	virtual bool has_bump() { return false; }
 	virtual bool has_bssrdf_bump() { return false; }
 	virtual bool has_spatial_varying() { return false; }
 	virtual bool has_object_dependency() { return false; }
+	virtual bool has_attribute_dependency() { return false; }
 	virtual bool has_integrator_dependency() { return false; }
-
+	virtual bool has_volume_support() { return false; }
+	virtual bool has_raytrace() { return false; }
 	vector<ShaderInput*> inputs;
 	vector<ShaderOutput*> outputs;
 
@@ -201,14 +205,14 @@ public:
 /* Node definition utility macros */
 
 #define SHADER_NODE_CLASS(type) \
-	NODE_DECLARE; \
+	NODE_DECLARE \
 	type(); \
 	virtual ShaderNode *clone() const { return new type(*this); } \
 	virtual void compile(SVMCompiler& compiler); \
 	virtual void compile(OSLCompiler& compiler); \
 
 #define SHADER_NODE_NO_CLONE_CLASS(type) \
-	NODE_DECLARE; \
+	NODE_DECLARE \
 	type(); \
 	virtual void compile(SVMCompiler& compiler); \
 	virtual void compile(OSLCompiler& compiler); \
@@ -240,12 +244,12 @@ public:
 	list<ShaderNode*> nodes;
 	size_t num_node_ids;
 	bool finalized;
+	bool simplified;
+	string displacement_hash;
 
 	ShaderGraph();
 	~ShaderGraph();
 
-	ShaderGraph *copy();
-
 	ShaderNode *add(ShaderNode *node);
 	OutputNode *output();
 
@@ -255,9 +259,10 @@ public:
 	void relink(ShaderNode *node, ShaderOutput *from, ShaderOutput *to);
 
 	void remove_proxy_nodes();
+	void compute_displacement_hash();
+	void simplify(Scene *scene);
 	void finalize(Scene *scene,
 	              bool do_bump = false,
-	              bool do_osl = false,
 	              bool do_simplify = false,
 	              bool bump_in_object_space = false);
 
@@ -283,6 +288,7 @@ protected:
 	void constant_fold();
 	void simplify_settings(Scene *scene);
 	void deduplicate_nodes();
+	void verify_volume_output();
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp
index fd8a1262208..9c5e32e8219 100644
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -14,15 +14,15 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "image.h"
-#include "scene.h"
+#include "device/device.h"
+#include "render/image.h"
+#include "render/scene.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_path.h"
-#include "util_progress.h"
-#include "util_texture.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_path.h"
+#include "util/util_progress.h"
+#include "util/util_texture.h"
 
 #ifdef WITH_OSL
 #include <OSL/oslexec.h>
@@ -30,73 +30,29 @@
 
 CCL_NAMESPACE_BEGIN
 
+/* Helpers to silence warnings in templated function; never flag a pixel. */
+static bool isfinite(uchar /*value*/)
+{
+	return true;
+}
+static bool isfinite(half /*value*/)
+{
+	return true;
+}
+
 ImageManager::ImageManager(const DeviceInfo& info)
 {
 	need_update = true;
-	pack_images = false;
 	osl_texture_system = NULL;
 	animation_frame = 0;
 
-	/* In case of multiple devices used we need to know type of an actual
-	 * compute device.
-	 *
-	 * NOTE: We assume that all the devices are same type, otherwise we'll
-	 * be screwed on so many levels..
-	 */
-	DeviceType device_type = info.type;
-	if(device_type == DEVICE_MULTI) {
-		device_type = info.multi_devices[0].type;
-	}
-
 	/* Set image limits */
-#define SET_TEX_IMAGES_LIMITS(ARCH) \
-	{ \
-		tex_num_images[IMAGE_DATA_TYPE_FLOAT4] = TEX_NUM_FLOAT4_ ## ARCH; \
-		tex_num_images[IMAGE_DATA_TYPE_BYTE4] = TEX_NUM_BYTE4_ ## ARCH; \
-		tex_num_images[IMAGE_DATA_TYPE_HALF4] = TEX_NUM_HALF4_ ## ARCH; \
-		tex_num_images[IMAGE_DATA_TYPE_FLOAT] = TEX_NUM_FLOAT_ ## ARCH; \
-		tex_num_images[IMAGE_DATA_TYPE_BYTE] = TEX_NUM_BYTE_ ## ARCH; \
-		tex_num_images[IMAGE_DATA_TYPE_HALF] = TEX_NUM_HALF_ ## ARCH; \
-		tex_start_images[IMAGE_DATA_TYPE_FLOAT4] = TEX_START_FLOAT4_ ## ARCH; \
-		tex_start_images[IMAGE_DATA_TYPE_BYTE4] = TEX_START_BYTE4_ ## ARCH; \
-		tex_start_images[IMAGE_DATA_TYPE_HALF4] = TEX_START_HALF4_ ## ARCH; \
-		tex_start_images[IMAGE_DATA_TYPE_FLOAT] = TEX_START_FLOAT_ ## ARCH; \
-		tex_start_images[IMAGE_DATA_TYPE_BYTE] = TEX_START_BYTE_ ## ARCH; \
-		tex_start_images[IMAGE_DATA_TYPE_HALF] = TEX_START_HALF_ ## ARCH; \
-	}
+	max_num_images = TEX_NUM_MAX;
+	has_half_images = info.has_half_images;
 
-	if(device_type == DEVICE_CPU) {
-		SET_TEX_IMAGES_LIMITS(CPU);
-	}
-	else if(device_type == DEVICE_CUDA) {
-		if(info.has_bindless_textures) {
-			SET_TEX_IMAGES_LIMITS(CUDA_KEPLER);
-		}
-		else {
-			SET_TEX_IMAGES_LIMITS(CUDA);
-		}
-	}
-	else if(device_type == DEVICE_OPENCL) {
-		SET_TEX_IMAGES_LIMITS(OPENCL);
-	}
-	else {
-		/* Should not happen. */
-		tex_num_images[IMAGE_DATA_TYPE_FLOAT4] = 0;
-		tex_num_images[IMAGE_DATA_TYPE_BYTE4] = 0;
-		tex_num_images[IMAGE_DATA_TYPE_HALF4] = 0;
-		tex_num_images[IMAGE_DATA_TYPE_FLOAT] = 0;
-		tex_num_images[IMAGE_DATA_TYPE_BYTE] = 0;
-		tex_num_images[IMAGE_DATA_TYPE_HALF] = 0;
-		tex_start_images[IMAGE_DATA_TYPE_FLOAT4] = 0;
-		tex_start_images[IMAGE_DATA_TYPE_BYTE4] = 0;
-		tex_start_images[IMAGE_DATA_TYPE_HALF4] = 0;
-		tex_start_images[IMAGE_DATA_TYPE_FLOAT] = 0;
-		tex_start_images[IMAGE_DATA_TYPE_BYTE] = 0;
-		tex_start_images[IMAGE_DATA_TYPE_HALF] = 0;
-		assert(0);
+	for(size_t type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
+		tex_num_images[type] = 0;
 	}
-
-#undef SET_TEX_IMAGES_LIMITS
 }
 
 ImageManager::~ImageManager()
@@ -107,11 +63,6 @@ ImageManager::~ImageManager()
 	}
 }
 
-void ImageManager::set_pack_images(bool pack_images_)
-{
-	pack_images = pack_images_;
-}
-
 void ImageManager::set_osl_texture_system(void *texture_system)
 {
 	osl_texture_system = texture_system;
@@ -133,109 +84,143 @@ bool ImageManager::set_animation_frame_update(int frame)
 	return false;
 }
 
-ImageManager::ImageDataType ImageManager::get_image_metadata(const string& filename,
-                                                             void *builtin_data,
-                                                             bool& is_linear)
+device_memory *ImageManager::image_memory(int flat_slot)
+{
+	/* Split the flattened slot into its data type and the index within
+	 * that type's image list, then hand back that image's memory pointer. */
+	ImageDataType type;
+	const int slot = flattened_slot_to_type_index(flat_slot, &type);
+
+	return images[type][slot]->mem;
+}
+
+bool ImageManager::get_image_metadata(const string& filename,
+                                      void *builtin_data,
+                                      ImageMetaData& metadata)
 {
-	bool is_float = false, is_half = false;
-	is_linear = false;
-	int channels = 4;
+	memset(&metadata, 0, sizeof(metadata));
 
 	if(builtin_data) {
 		if(builtin_image_info_cb) {
-			int width, height, depth;
-			builtin_image_info_cb(filename, builtin_data, is_float, width, height, depth, channels);
+			builtin_image_info_cb(filename, builtin_data, metadata);
+		}
+		else {
+			return false;
 		}
 
-		if(is_float) {
-			is_linear = true;
-			return (channels > 1) ? IMAGE_DATA_TYPE_FLOAT4 : IMAGE_DATA_TYPE_FLOAT;
+		if(metadata.is_float) {
+			metadata.is_linear = true;
+			metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_FLOAT4 : IMAGE_DATA_TYPE_FLOAT;
 		}
 		else {
-			return (channels > 1) ? IMAGE_DATA_TYPE_BYTE4 : IMAGE_DATA_TYPE_BYTE;
+			metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_BYTE4 : IMAGE_DATA_TYPE_BYTE;
 		}
+
+		return true;
+	}
+
+	/* Perform preliminary checks, with meaningful logging. */
+	if(!path_exists(filename)) {
+		VLOG(1) << "File '" << filename << "' does not exist.";
+		return false;
+	}
+	if(path_is_directory(filename)) {
+		VLOG(1) << "File '" << filename << "' is a directory, can't use as image.";
+		return false;
 	}
 
 	ImageInput *in = ImageInput::create(filename);
 
-	if(in) {
-		ImageSpec spec;
-
-		if(in->open(filename, spec)) {
-			/* check the main format, and channel formats;
-			 * if any take up more than one byte, we'll need a float texture slot */
-			if(spec.format.basesize() > 1) {
-				is_float = true;
-				is_linear = true;
-			}
+	if(!in) {
+		return false;
+	}
 
-			for(size_t channel = 0; channel < spec.channelformats.size(); channel++) {
-				if(spec.channelformats[channel].basesize() > 1) {
-					is_float = true;
-					is_linear = true;
-				}
-			}
+	ImageSpec spec;
+	if(!in->open(filename, spec)) {
+		delete in;
+		return false;
+	}
 
-			/* check if it's half float */
-			if(spec.format == TypeDesc::HALF)
-				is_half = true;
+	metadata.width = spec.width;
+	metadata.height = spec.height;
+	metadata.depth = spec.depth;
 
-			channels = spec.nchannels;
+	/* check the main format, and channel formats;
+	 * if any take up more than one byte, we'll need a float texture slot */
+	if(spec.format.basesize() > 1) {
+		metadata.is_float = true;
+		metadata.is_linear = true;
+	}
 
-			/* basic color space detection, not great but better than nothing
-			 * before we do OpenColorIO integration */
-			if(is_float) {
-				string colorspace = spec.get_string_attribute("oiio:ColorSpace");
+	for(size_t channel = 0; channel < spec.channelformats.size(); channel++) {
+		if(spec.channelformats[channel].basesize() > 1) {
+			metadata.is_float = true;
+			metadata.is_linear = true;
+		}
+	}
 
-				is_linear = !(colorspace == "sRGB" ||
-				              colorspace == "GammaCorrected" ||
-				              (colorspace == "" &&
-				                  (strcmp(in->format_name(), "png") == 0 ||
-				                   strcmp(in->format_name(), "tiff") == 0 ||
-				                   strcmp(in->format_name(), "dpx") == 0 ||
-				                   strcmp(in->format_name(), "jpeg2000") == 0)));
-			}
-			else {
-				is_linear = false;
-			}
+	/* check if it's half float */
+	if(spec.format == TypeDesc::HALF)
+		metadata.is_half = true;
 
-			in->close();
-		}
+	/* basic color space detection, not great but better than nothing
+	 * before we do OpenColorIO integration */
+	if(metadata.is_float) {
+		string colorspace = spec.get_string_attribute("oiio:ColorSpace");
 
-		delete in;
+		metadata.is_linear = !(colorspace == "sRGB" ||
+							   colorspace == "GammaCorrected" ||
+							   (colorspace == "" &&
+								   (strcmp(in->format_name(), "png") == 0 ||
+									strcmp(in->format_name(), "tiff") == 0 ||
+									strcmp(in->format_name(), "dpx") == 0 ||
+									strcmp(in->format_name(), "jpeg2000") == 0)));
+	}
+	else {
+		metadata.is_linear = false;
 	}
 
-	if(is_half) {
-		return (channels > 1) ? IMAGE_DATA_TYPE_HALF4 : IMAGE_DATA_TYPE_HALF;
+	/* set type and channels */
+	metadata.channels = spec.nchannels;
+
+	if(metadata.is_half) {
+		metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_HALF4 : IMAGE_DATA_TYPE_HALF;
 	}
-	else if(is_float) {
-		return (channels > 1) ? IMAGE_DATA_TYPE_FLOAT4 : IMAGE_DATA_TYPE_FLOAT;
+	else if(metadata.is_float) {
+		metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_FLOAT4 : IMAGE_DATA_TYPE_FLOAT;
 	}
 	else {
-		return (channels > 1) ? IMAGE_DATA_TYPE_BYTE4 : IMAGE_DATA_TYPE_BYTE;
+		metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_BYTE4 : IMAGE_DATA_TYPE_BYTE;
+	}
+
+	in->close();
+	delete in;
+
+	return true;
+}
+
+int ImageManager::max_flattened_slot(ImageDataType type)
+{
+	if(tex_num_images[type] == 0) {
+		/* No textures for the type, no slots needs allocation. */
+		return 0;
 	}
+	return type_index_to_flattened_slot(tex_num_images[type], type);
 }
 
-/* We use a consecutive slot counting scheme on the devices, in order
- * float4, byte4, half4, float, byte, half.
+/* The lower three bits of a device texture slot number indicate its type.
  * These functions convert the slot ids from ImageManager "images" ones
- * to device ones and vice versa. */
+ * to device ones and vice versa.
+ */
 int ImageManager::type_index_to_flattened_slot(int slot, ImageDataType type)
 {
-	return slot + tex_start_images[type];
+	return (slot << IMAGE_DATA_TYPE_SHIFT) | (type);
 }
 
 int ImageManager::flattened_slot_to_type_index(int flat_slot, ImageDataType *type)
 {
-	for(int i = IMAGE_DATA_NUM_TYPES - 1; i >= 0; i--) {
-		if(flat_slot >= tex_start_images[i]) {
-			*type = (ImageDataType)i;
-			return flat_slot - tex_start_images[i];
-		}
-	}
-
-	/* Should not happen. */
-	return flat_slot;
+	*type = (ImageDataType)(flat_slot & IMAGE_DATA_TYPE_MASK);
+	return flat_slot >> IMAGE_DATA_TYPE_SHIFT;
 }
 
 string ImageManager::name_from_type(int type)
@@ -272,32 +257,27 @@ int ImageManager::add_image(const string& filename,
                             void *builtin_data,
                             bool animated,
                             float frame,
-                            bool& is_float,
-                            bool& is_linear,
                             InterpolationType interpolation,
                             ExtensionType extension,
-                            bool use_alpha)
+                            bool use_alpha,
+                            ImageMetaData& metadata)
 {
 	Image *img;
 	size_t slot;
 
-	ImageDataType type = get_image_metadata(filename, builtin_data, is_linear);
+	get_image_metadata(filename, builtin_data, metadata);
+	ImageDataType type = metadata.type;
 
 	thread_scoped_lock device_lock(device_mutex);
 
-	/* Do we have a float? */
-	if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4)
-		is_float = true;
-
-	/* No single channel and half textures on CUDA (Fermi) and no half on OpenCL, use available slots */
-	if((type == IMAGE_DATA_TYPE_FLOAT ||
-	    type == IMAGE_DATA_TYPE_HALF4 ||
-	    type == IMAGE_DATA_TYPE_HALF) &&
-	    tex_num_images[type] == 0) {
-		type = IMAGE_DATA_TYPE_FLOAT4;
-	}
-	if(type == IMAGE_DATA_TYPE_BYTE && tex_num_images[type] == 0) {
-		type = IMAGE_DATA_TYPE_BYTE4;
+	/* No half textures on OpenCL, use full float instead. */
+	if(!has_half_images) {
+		if(type == IMAGE_DATA_TYPE_HALF4) {
+			type = IMAGE_DATA_TYPE_FLOAT4;
+		}
+		else if(type == IMAGE_DATA_TYPE_HALF) {
+			type = IMAGE_DATA_TYPE_FLOAT;
+		}
 	}
 
 	/* Fnd existing image. */
@@ -329,14 +309,19 @@ int ImageManager::add_image(const string& filename,
 			break;
 	}
 
-	if(slot == images[type].size()) {
-		/* Max images limit reached. */
-		if(images[type].size() == tex_num_images[type]) {
-			printf("ImageManager::add_image: Reached %s image limit (%d), skipping '%s'\n",
-			       name_from_type(type).c_str(), tex_num_images[type], filename.c_str());
-			return -1;
-		}
+	/* Refuse the new image once the total over all types has reached the limit.
+	 * Very unlikely, since max_num_images is insanely big. But better safe than sorry. */
+	int tex_count = 0;
+	for(int i = 0; i < IMAGE_DATA_NUM_TYPES; i++) {
+		tex_count += tex_num_images[i];
+	}
+	if(tex_count >= max_num_images) {
+		printf("ImageManager::add_image: Reached image limit (%d), skipping '%s'\n",
+			max_num_images, filename.c_str());
+		return -1;
+	}
 
+	if(slot == images[type].size()) {
 		images[type].resize(images[type].size() + 1);
 	}
 
@@ -344,6 +329,7 @@ int ImageManager::add_image(const string& filename,
 	img = new Image();
 	img->filename = filename;
 	img->builtin_data = builtin_data;
+	img->builtin_free_cache = metadata.builtin_free_cache;
 	img->need_load = true;
 	img->animated = animated;
 	img->frame = frame;
@@ -351,9 +337,12 @@ int ImageManager::add_image(const string& filename,
 	img->extension = extension;
 	img->users = 1;
 	img->use_alpha = use_alpha;
+	img->mem = NULL;
 
 	images[type][slot] = img;
 
+	++tex_num_images[type];
+
 	need_update = true;
 
 	return type_index_to_flattened_slot(slot, type);
@@ -427,12 +416,22 @@ void ImageManager::tag_reload_image(const string& filename,
 	}
 }
 
-bool ImageManager::file_load_image_generic(Image *img, ImageInput **in, int &width, int &height, int &depth, int &components)
+bool ImageManager::file_load_image_generic(Image *img,
+                                           ImageInput **in,
+                                           int &width,
+                                           int &height,
+                                           int &depth,
+                                           int &components)
 {
 	if(img->filename == "")
 		return false;
 
 	if(!img->builtin_data) {
+		/* NOTE: Error logging is done in meta data acquisition. */
+		if(!path_exists(img->filename) || path_is_directory(img->filename)) {
+			return false;
+		}
+
 		/* load image from file through OIIO */
 		*in = ImageInput::create(img->filename);
 
@@ -461,8 +460,13 @@ bool ImageManager::file_load_image_generic(Image *img, ImageInput **in, int &wid
 		if(!builtin_image_info_cb || !builtin_image_pixels_cb)
 			return false;
 
-		bool is_float;
-		builtin_image_info_cb(img->filename, img->builtin_data, is_float, width, height, depth, components);
+		ImageMetaData metadata;
+		builtin_image_info_cb(img->filename, img->builtin_data, metadata);
+
+		width = metadata.width;
+		height = metadata.height;
+		depth = metadata.depth;
+		components = metadata.channels;
 	}
 
 	/* we only handle certain number of components */
@@ -497,12 +501,21 @@ bool ImageManager::file_load_image(Image *img,
 	vector<StorageType> pixels_storage;
 	StorageType *pixels;
 	const size_t max_size = max(max(width, height), depth);
+	if(max_size == 0) {
+		/* Don't bother with invalid images. */
+		return false;
+	}
 	if(texture_limit > 0 && max_size > texture_limit) {
 		pixels_storage.resize(((size_t)width)*height*depth*4);
 		pixels = &pixels_storage[0];
 	}
 	else {
-		pixels = (StorageType*)tex_img.resize(width, height, depth);
+		thread_scoped_lock device_lock(device_mutex);
+		pixels = (StorageType*)tex_img.alloc(width, height, depth);
+	}
+	if(pixels == NULL) {
+		/* Could be that we've run out of memory. */
+		return false;
 	}
 	bool cmyk = false;
 	const size_t num_pixels = ((size_t)width) * height * depth;
@@ -543,13 +556,15 @@ bool ImageManager::file_load_image(Image *img,
 			builtin_image_float_pixels_cb(img->filename,
 			                              img->builtin_data,
 			                              (float*)&pixels[0],
-			                              num_pixels * components);
+			                              num_pixels * components,
+			                              img->builtin_free_cache);
 		}
 		else if(FileFormat == TypeDesc::UINT8) {
 			builtin_image_pixels_cb(img->filename,
 			                        img->builtin_data,
 			                        (uchar*)&pixels[0],
-			                        num_pixels * components);
+			                        num_pixels * components,
+			                        img->builtin_free_cache);
 		}
 		else {
 			/* TODO(dingto): Support half for ImBuf. */
@@ -604,6 +619,37 @@ bool ImageManager::file_load_image(Image *img,
 			}
 		}
 	}
+	/* Make sure we don't have buggy values. */
+	if(FileFormat == TypeDesc::FLOAT) {
+		/* For RGBA buffers zero all four channels of a pixel if any of them is
+		 * not finite, which avoids artifacts caused by a fully changed hue.
+		 * Pixel i occupies pixels[i*4 .. i*4+3], so step one pixel at a time.
+		 */
+		if(is_rgba) {
+			for(size_t i = 0; i < num_pixels; ++i) {
+				StorageType *pixel = &pixels[i*4];
+				if(!isfinite(pixel[0]) ||
+				   !isfinite(pixel[1]) ||
+				   !isfinite(pixel[2]) ||
+				   !isfinite(pixel[3]))
+				{
+					pixel[0] = 0;
+					pixel[1] = 0;
+					pixel[2] = 0;
+					pixel[3] = 0;
+				}
+			}
+		}
+		else {
+			for(size_t i = 0; i < num_pixels; ++i) {
+				StorageType *pixel = &pixels[i];
+				if(!isfinite(pixel[0])) {
+					pixel[0] = 0;
+				}
+			}
+		}
+	}
+	/* Scale image down if needed. */
 	if(pixels_storage.size() > 0) {
 		float scale_factor = 1.0f;
 		while(max_size * scale_factor > texture_limit) {
@@ -619,9 +665,16 @@ bool ImageManager::file_load_image(Image *img,
 		                         scale_factor,
 		                         &scaled_pixels,
 		                         &scaled_width, &scaled_height, &scaled_depth);
-		StorageType *texture_pixels = (StorageType*)tex_img.resize(scaled_width,
-		                                                           scaled_height,
-		                                                           scaled_depth);
+
+		StorageType *texture_pixels;
+
+		{
+			thread_scoped_lock device_lock(device_mutex);
+			texture_pixels = (StorageType*)tex_img.alloc(scaled_width,
+			                                             scaled_height,
+			                                             scaled_depth);
+		}
+
 		memcpy(texture_pixels,
 		       &scaled_pixels[0],
 		       scaled_pixels.size() * sizeof(StorageType));
@@ -630,7 +683,6 @@ bool ImageManager::file_load_image(Image *img,
 }
 
 void ImageManager::device_load_image(Device *device,
-                                     DeviceScene *dscene,
                                      Scene *scene,
                                      ImageDataType type,
                                      int slot,
@@ -651,30 +703,28 @@ void ImageManager::device_load_image(Device *device,
 
 	/* Slot assignment */
 	int flat_slot = type_index_to_flattened_slot(slot, type);
+	img->mem_name = string_printf("__tex_image_%s_%03d", name_from_type(type).c_str(), flat_slot);
 
-	string name;
-	if(flat_slot >= 100)
-		name = string_printf("__tex_image_%s_%d", name_from_type(type).c_str(), flat_slot);
-	else if(flat_slot >= 10)
-		name = string_printf("__tex_image_%s_0%d", name_from_type(type).c_str(), flat_slot);
-	else
-		name = string_printf("__tex_image_%s_00%d", name_from_type(type).c_str(), flat_slot);
+	/* Free previous texture in slot. */
+	if(img->mem) {
+		thread_scoped_lock device_lock(device_mutex);
+		delete img->mem;
+		img->mem = NULL;
+	}
 
+	/* Create new texture. */
 	if(type == IMAGE_DATA_TYPE_FLOAT4) {
-		device_vector<float4>& tex_img = dscene->tex_float4_image[slot];
-
-		if(tex_img.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(tex_img);
-		}
+		device_vector<float4> *tex_img
+			= new device_vector<float4>(device, img->mem_name.c_str(), MEM_TEXTURE);
 
 		if(!file_load_image<TypeDesc::FLOAT, float>(img,
 		                                            type,
 		                                            texture_limit,
-		                                            tex_img))
+		                                            *tex_img))
 		{
 			/* on failure to load, we set a 1x1 pixels pink image */
-			float *pixels = (float*)tex_img.resize(1, 1);
+			thread_scoped_lock device_lock(device_mutex);
+			float *pixels = (float*)tex_img->alloc(1, 1);
 
 			pixels[0] = TEX_IMAGE_MISSING_R;
 			pixels[1] = TEX_IMAGE_MISSING_G;
@@ -682,56 +732,48 @@ void ImageManager::device_load_image(Device *device,
 			pixels[3] = TEX_IMAGE_MISSING_A;
 		}
 
-		if(!pack_images) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_alloc(name.c_str(),
-			                  tex_img,
-			                  img->interpolation,
-			                  img->extension);
-		}
+		img->mem = tex_img;
+		img->mem->interpolation = img->interpolation;
+		img->mem->extension = img->extension;
+
+		thread_scoped_lock device_lock(device_mutex);
+		tex_img->copy_to_device();
 	}
 	else if(type == IMAGE_DATA_TYPE_FLOAT) {
-		device_vector<float>& tex_img = dscene->tex_float_image[slot];
-
-		if(tex_img.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(tex_img);
-		}
+		device_vector<float> *tex_img
+			= new device_vector<float>(device, img->mem_name.c_str(), MEM_TEXTURE);
 
 		if(!file_load_image<TypeDesc::FLOAT, float>(img,
 		                                            type,
 		                                            texture_limit,
-		                                            tex_img))
+		                                            *tex_img))
 		{
 			/* on failure to load, we set a 1x1 pixels pink image */
-			float *pixels = (float*)tex_img.resize(1, 1);
+			thread_scoped_lock device_lock(device_mutex);
+			float *pixels = (float*)tex_img->alloc(1, 1);
 
 			pixels[0] = TEX_IMAGE_MISSING_R;
 		}
 
-		if(!pack_images) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_alloc(name.c_str(),
-			                  tex_img,
-			                  img->interpolation,
-			                  img->extension);
-		}
+		img->mem = tex_img;
+		img->mem->interpolation = img->interpolation;
+		img->mem->extension = img->extension;
+
+		thread_scoped_lock device_lock(device_mutex);
+		tex_img->copy_to_device();
 	}
 	else if(type == IMAGE_DATA_TYPE_BYTE4) {
-		device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot];
-
-		if(tex_img.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(tex_img);
-		}
+		device_vector<uchar4> *tex_img
+			= new device_vector<uchar4>(device, img->mem_name.c_str(), MEM_TEXTURE);
 
 		if(!file_load_image<TypeDesc::UINT8, uchar>(img,
 		                                            type,
 		                                            texture_limit,
-		                                            tex_img))
+		                                            *tex_img))
 		{
 			/* on failure to load, we set a 1x1 pixels pink image */
-			uchar *pixels = (uchar*)tex_img.resize(1, 1);
+			thread_scoped_lock device_lock(device_mutex);
+			uchar *pixels = (uchar*)tex_img->alloc(1, 1);
 
 			pixels[0] = (TEX_IMAGE_MISSING_R * 255);
 			pixels[1] = (TEX_IMAGE_MISSING_G * 255);
@@ -739,54 +781,46 @@ void ImageManager::device_load_image(Device *device,
 			pixels[3] = (TEX_IMAGE_MISSING_A * 255);
 		}
 
-		if(!pack_images) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_alloc(name.c_str(),
-			                  tex_img,
-			                  img->interpolation,
-			                  img->extension);
-		}
-	}
-	else if(type == IMAGE_DATA_TYPE_BYTE){
-		device_vector<uchar>& tex_img = dscene->tex_byte_image[slot];
+		img->mem = tex_img;
+		img->mem->interpolation = img->interpolation;
+		img->mem->extension = img->extension;
 
-		if(tex_img.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(tex_img);
-		}
+		thread_scoped_lock device_lock(device_mutex);
+		tex_img->copy_to_device();
+	}
+	else if(type == IMAGE_DATA_TYPE_BYTE) {
+		device_vector<uchar> *tex_img
+			= new device_vector<uchar>(device, img->mem_name.c_str(), MEM_TEXTURE);
 
 		if(!file_load_image<TypeDesc::UINT8, uchar>(img,
 		                                            type,
 		                                            texture_limit,
-		                                            tex_img)) {
+		                                            *tex_img)) {
 			/* on failure to load, we set a 1x1 pixels pink image */
-			uchar *pixels = (uchar*)tex_img.resize(1, 1);
+			thread_scoped_lock device_lock(device_mutex);
+			uchar *pixels = (uchar*)tex_img->alloc(1, 1);
 
 			pixels[0] = (TEX_IMAGE_MISSING_R * 255);
 		}
 
-		if(!pack_images) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_alloc(name.c_str(),
-			                  tex_img,
-			                  img->interpolation,
-			                  img->extension);
-		}
-	}
-	else if(type == IMAGE_DATA_TYPE_HALF4){
-		device_vector<half4>& tex_img = dscene->tex_half4_image[slot];
+		img->mem = tex_img;
+		img->mem->interpolation = img->interpolation;
+		img->mem->extension = img->extension;
 
-		if(tex_img.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(tex_img);
-		}
+		thread_scoped_lock device_lock(device_mutex);
+		tex_img->copy_to_device();
+	}
+	else if(type == IMAGE_DATA_TYPE_HALF4) {
+		device_vector<half4> *tex_img
+			= new device_vector<half4>(device, img->mem_name.c_str(), MEM_TEXTURE);
 
 		if(!file_load_image<TypeDesc::HALF, half>(img,
 		                                          type,
 		                                          texture_limit,
-		                                          tex_img)) {
+		                                          *tex_img)) {
 			/* on failure to load, we set a 1x1 pixels pink image */
-			half *pixels = (half*)tex_img.resize(1, 1);
+			thread_scoped_lock device_lock(device_mutex);
+			half *pixels = (half*)tex_img->alloc(1, 1);
 
 			pixels[0] = TEX_IMAGE_MISSING_R;
 			pixels[1] = TEX_IMAGE_MISSING_G;
@@ -794,45 +828,40 @@ void ImageManager::device_load_image(Device *device,
 			pixels[3] = TEX_IMAGE_MISSING_A;
 		}
 
-		if(!pack_images) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_alloc(name.c_str(),
-			                  tex_img,
-			                  img->interpolation,
-			                  img->extension);
-		}
-	}
-	else if(type == IMAGE_DATA_TYPE_HALF){
-		device_vector<half>& tex_img = dscene->tex_half_image[slot];
+		img->mem = tex_img;
+		img->mem->interpolation = img->interpolation;
+		img->mem->extension = img->extension;
 
-		if(tex_img.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(tex_img);
-		}
+		thread_scoped_lock device_lock(device_mutex);
+		tex_img->copy_to_device();
+	}
+	else if(type == IMAGE_DATA_TYPE_HALF) {
+		device_vector<half> *tex_img
+			= new device_vector<half>(device, img->mem_name.c_str(), MEM_TEXTURE);
 
 		if(!file_load_image<TypeDesc::HALF, half>(img,
 		                                          type,
 		                                          texture_limit,
-		                                          tex_img)) {
+		                                          *tex_img)) {
 			/* on failure to load, we set a 1x1 pixels pink image */
-			half *pixels = (half*)tex_img.resize(1, 1);
+			thread_scoped_lock device_lock(device_mutex);
+			half *pixels = (half*)tex_img->alloc(1, 1);
 
 			pixels[0] = TEX_IMAGE_MISSING_R;
 		}
 
-		if(!pack_images) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_alloc(name.c_str(),
-			                  tex_img,
-			                  img->interpolation,
-			                  img->extension);
-		}
+		img->mem = tex_img;
+		img->mem->interpolation = img->interpolation;
+		img->mem->extension = img->extension;
+
+		thread_scoped_lock device_lock(device_mutex);
+		tex_img->copy_to_device();
 	}
 
 	img->need_load = false;
 }
 
-void ImageManager::device_free_image(Device *device, DeviceScene *dscene, ImageDataType type, int slot)
+void ImageManager::device_free_image(Device *, ImageDataType type, int slot)
 {
 	Image *img = images[type][slot];
 
@@ -843,96 +872,40 @@ void ImageManager::device_free_image(Device *device, DeviceScene *dscene, ImageD
 			((OSL::TextureSystem*)osl_texture_system)->invalidate(filename);
 #endif
 		}
-		else if(type == IMAGE_DATA_TYPE_FLOAT4) {
-			device_vector<float4>& tex_img = dscene->tex_float4_image[slot];
-
-			if(tex_img.device_pointer) {
-				thread_scoped_lock device_lock(device_mutex);
-				device->tex_free(tex_img);
-			}
-
-			tex_img.clear();
-		}
-		else if(type == IMAGE_DATA_TYPE_FLOAT) {
-			device_vector<float>& tex_img = dscene->tex_float_image[slot];
-
-			if(tex_img.device_pointer) {
-				thread_scoped_lock device_lock(device_mutex);
-				device->tex_free(tex_img);
-			}
-
-			tex_img.clear();
-		}
-		else if(type == IMAGE_DATA_TYPE_BYTE4) {
-			device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot];
-
-			if(tex_img.device_pointer) {
-				thread_scoped_lock device_lock(device_mutex);
-				device->tex_free(tex_img);
-			}
-
-			tex_img.clear();
-		}
-		else if(type == IMAGE_DATA_TYPE_BYTE){
-			device_vector<uchar>& tex_img = dscene->tex_byte_image[slot];
 
-			if(tex_img.device_pointer) {
-				thread_scoped_lock device_lock(device_mutex);
-				device->tex_free(tex_img);
-			}
-
-			tex_img.clear();
-		}
-		else if(type == IMAGE_DATA_TYPE_HALF4){
-			device_vector<half4>& tex_img = dscene->tex_half4_image[slot];
-
-			if(tex_img.device_pointer) {
-				thread_scoped_lock device_lock(device_mutex);
-				device->tex_free(tex_img);
-			}
-
-			tex_img.clear();
-		}
-		else if(type == IMAGE_DATA_TYPE_HALF){
-			device_vector<half>& tex_img = dscene->tex_half_image[slot];
-
-			if(tex_img.device_pointer) {
-				thread_scoped_lock device_lock(device_mutex);
-				device->tex_free(tex_img);
-			}
-
-			tex_img.clear();
+		if(img->mem) {
+			thread_scoped_lock device_lock(device_mutex);
+			delete img->mem;
 		}
 
-		delete images[type][slot];
+		delete img;
 		images[type][slot] = NULL;
+		--tex_num_images[type];
 	}
 }
 
 void ImageManager::device_update(Device *device,
-                                 DeviceScene *dscene,
                                  Scene *scene,
                                  Progress& progress)
 {
-	if(!need_update)
+	if(!need_update) {
 		return;
+	}
 
 	TaskPool pool;
-
 	for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
 		for(size_t slot = 0; slot < images[type].size(); slot++) {
 			if(!images[type][slot])
 				continue;
 
 			if(images[type][slot]->users == 0) {
-				device_free_image(device, dscene, (ImageDataType)type, slot);
+				device_free_image(device, (ImageDataType)type, slot);
 			}
 			else if(images[type][slot]->need_load) {
 				if(!osl_texture_system || images[type][slot]->builtin_data)
 					pool.push(function_bind(&ImageManager::device_load_image,
 					                        this,
 					                        device,
-					                        dscene,
 					                        scene,
 					                        (ImageDataType)type,
 					                        slot,
@@ -943,14 +916,10 @@ void ImageManager::device_update(Device *device,
 
 	pool.wait_work();
 
-	if(pack_images)
-		device_pack_images(device, dscene, progress);
-
 	need_update = false;
 }
 
 void ImageManager::device_update_slot(Device *device,
-                                      DeviceScene *dscene,
                                       Scene *scene,
                                       int flat_slot,
                                       Progress *progress)
@@ -962,12 +931,11 @@ void ImageManager::device_update_slot(Device *device,
 	assert(image != NULL);
 
 	if(image->users == 0) {
-		device_free_image(device, dscene, type, slot);
+		device_free_image(device, type, slot);
 	}
 	else if(image->need_load) {
 		if(!osl_texture_system || image->builtin_data)
 			device_load_image(device,
-			                  dscene,
 			                  scene,
 			                  type,
 			                  slot,
@@ -975,229 +943,24 @@ void ImageManager::device_update_slot(Device *device,
 	}
 }
 
-uint8_t ImageManager::pack_image_options(ImageDataType type, size_t slot)
-{
-	uint8_t options = 0;
-
-	/* Image Options are packed into one uint:
-	 * bit 0 -> Interpolation
-	 * bit 1 + 2  + 3-> Extension */
-	if(images[type][slot]->interpolation == INTERPOLATION_CLOSEST)
-		options |= (1 << 0);
-
-	if(images[type][slot]->extension == EXTENSION_REPEAT)
-		options |= (1 << 1);
-	else if(images[type][slot]->extension == EXTENSION_EXTEND)
-		options |= (1 << 2);
-	else /* EXTENSION_CLIP */
-		options |= (1 << 3);
-
-	return options;
-}
-
-void ImageManager::device_pack_images(Device *device,
-                                      DeviceScene *dscene,
-                                      Progress& /*progess*/)
-{
-	/* For OpenCL, we pack all image textures into a single large texture, and
-	 * do our own interpolation in the kernel. */
-	size_t size = 0, offset = 0;
-	ImageDataType type;
-
-	int info_size = tex_num_images[IMAGE_DATA_TYPE_FLOAT4] + tex_num_images[IMAGE_DATA_TYPE_BYTE4]
-	                + tex_num_images[IMAGE_DATA_TYPE_FLOAT] + tex_num_images[IMAGE_DATA_TYPE_BYTE];
-	uint4 *info = dscene->tex_image_packed_info.resize(info_size*2);
-
-	/* Byte4 Textures*/
-	type = IMAGE_DATA_TYPE_BYTE4;
-
-	for(size_t slot = 0; slot < images[type].size(); slot++) {
-		if(!images[type][slot])
-			continue;
-
-		device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot];
-		size += tex_img.size();
-	}
-
-	uchar4 *pixels_byte4 = dscene->tex_image_byte4_packed.resize(size);
-
-	for(size_t slot = 0; slot < images[type].size(); slot++) {
-		if(!images[type][slot])
-			continue;
-
-		device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot];
-
-		uint8_t options = pack_image_options(type, slot);
-
-		int index = type_index_to_flattened_slot(slot, type) * 2;
-		info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options);
-		info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0);
-
-		memcpy(pixels_byte4+offset, (void*)tex_img.data_pointer, tex_img.memory_size());
-		offset += tex_img.size();
-	}
-
-	/* Float4 Textures*/
-	type = IMAGE_DATA_TYPE_FLOAT4;
-	size = 0, offset = 0;
-
-	for(size_t slot = 0; slot < images[type].size(); slot++) {
-		if(!images[type][slot])
-			continue;
-
-		device_vector<float4>& tex_img = dscene->tex_float4_image[slot];
-		size += tex_img.size();
-	}
-
-	float4 *pixels_float4 = dscene->tex_image_float4_packed.resize(size);
-
-	for(size_t slot = 0; slot < images[type].size(); slot++) {
-		if(!images[type][slot])
-			continue;
-
-		device_vector<float4>& tex_img = dscene->tex_float4_image[slot];
-
-		/* todo: support 3D textures, only CPU for now */
-
-		uint8_t options = pack_image_options(type, slot);
-
-		int index = type_index_to_flattened_slot(slot, type) * 2;
-		info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options);
-		info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0);
-
-		memcpy(pixels_float4+offset, (void*)tex_img.data_pointer, tex_img.memory_size());
-		offset += tex_img.size();
-	}
-
-	/* Byte Textures*/
-	type = IMAGE_DATA_TYPE_BYTE;
-	size = 0, offset = 0;
-
-	for(size_t slot = 0; slot < images[type].size(); slot++) {
-		if(!images[type][slot])
-			continue;
-
-		device_vector<uchar>& tex_img = dscene->tex_byte_image[slot];
-		size += tex_img.size();
-	}
-
-	uchar *pixels_byte = dscene->tex_image_byte_packed.resize(size);
-
-	for(size_t slot = 0; slot < images[type].size(); slot++) {
-		if(!images[type][slot])
-			continue;
-
-		device_vector<uchar>& tex_img = dscene->tex_byte_image[slot];
-
-		uint8_t options = pack_image_options(type, slot);
-
-		int index = type_index_to_flattened_slot(slot, type) * 2;
-		info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options);
-		info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0);
-
-		memcpy(pixels_byte+offset, (void*)tex_img.data_pointer, tex_img.memory_size());
-		offset += tex_img.size();
-	}
-
-	/* Float Textures*/
-	type = IMAGE_DATA_TYPE_FLOAT;
-	size = 0, offset = 0;
-
-	for(size_t slot = 0; slot < images[type].size(); slot++) {
-		if(!images[type][slot])
-			continue;
-
-		device_vector<float>& tex_img = dscene->tex_float_image[slot];
-		size += tex_img.size();
-	}
-
-	float *pixels_float = dscene->tex_image_float_packed.resize(size);
-
-	for(size_t slot = 0; slot < images[type].size(); slot++) {
-		if(!images[type][slot])
-			continue;
-
-		device_vector<float>& tex_img = dscene->tex_float_image[slot];
-
-		/* todo: support 3D textures, only CPU for now */
-
-		uint8_t options = pack_image_options(type, slot);
-
-		int index = type_index_to_flattened_slot(slot, type) * 2;
-		info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options);
-		info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0);
-
-		memcpy(pixels_float+offset, (void*)tex_img.data_pointer, tex_img.memory_size());
-		offset += tex_img.size();
-	}
-
-	if(dscene->tex_image_byte4_packed.size()) {
-		if(dscene->tex_image_byte4_packed.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(dscene->tex_image_byte4_packed);
-		}
-		device->tex_alloc("__tex_image_byte4_packed", dscene->tex_image_byte4_packed);
-	}
-	if(dscene->tex_image_float4_packed.size()) {
-		if(dscene->tex_image_float4_packed.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(dscene->tex_image_float4_packed);
-		}
-		device->tex_alloc("__tex_image_float4_packed", dscene->tex_image_float4_packed);
-	}
-	if(dscene->tex_image_byte_packed.size()) {
-		if(dscene->tex_image_byte_packed.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(dscene->tex_image_byte_packed);
-		}
-		device->tex_alloc("__tex_image_byte_packed", dscene->tex_image_byte_packed);
-	}
-	if(dscene->tex_image_float_packed.size()) {
-		if(dscene->tex_image_float_packed.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(dscene->tex_image_float_packed);
-		}
-		device->tex_alloc("__tex_image_float_packed", dscene->tex_image_float_packed);
-	}
-	if(dscene->tex_image_packed_info.size()) {
-		if(dscene->tex_image_packed_info.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(dscene->tex_image_packed_info);
-		}
-		device->tex_alloc("__tex_image_packed_info", dscene->tex_image_packed_info);
-	}
-}
-
-void ImageManager::device_free_builtin(Device *device, DeviceScene *dscene)
+void ImageManager::device_free_builtin(Device *device)
 {
 	for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
 		for(size_t slot = 0; slot < images[type].size(); slot++) {
 			if(images[type][slot] && images[type][slot]->builtin_data)
-				device_free_image(device, dscene, (ImageDataType)type, slot);
+				device_free_image(device, (ImageDataType)type, slot);
 		}
 	}
 }
 
-void ImageManager::device_free(Device *device, DeviceScene *dscene)
+void ImageManager::device_free(Device *device)
 {
 	for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
 		for(size_t slot = 0; slot < images[type].size(); slot++) {
-			device_free_image(device, dscene, (ImageDataType)type, slot);
+			device_free_image(device, (ImageDataType)type, slot);
 		}
 		images[type].clear();
 	}
-
-	device->tex_free(dscene->tex_image_byte4_packed);
-	device->tex_free(dscene->tex_image_float4_packed);
-	device->tex_free(dscene->tex_image_byte_packed);
-	device->tex_free(dscene->tex_image_float_packed);
-	device->tex_free(dscene->tex_image_packed_info);
-
-	dscene->tex_image_byte4_packed.clear();
-	dscene->tex_image_float4_packed.clear();
-	dscene->tex_image_byte_packed.clear();
-	dscene->tex_image_float_packed.clear();
-	dscene->tex_image_packed_info.clear();
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h
index 494c74f0cdd..5391490d993 100644
--- a/intern/cycles/render/image.h
+++ b/intern/cycles/render/image.h
@@ -17,46 +17,46 @@
 #ifndef __IMAGE_H__
 #define __IMAGE_H__
 
-#include "device.h"
-#include "device_memory.h"
+#include "device/device.h"
+#include "device/device_memory.h"
 
-#include "util_image.h"
-#include "util_string.h"
-#include "util_thread.h"
-#include "util_vector.h"
+#include "util/util_image.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
 class Device;
-class DeviceScene;
 class Progress;
 class Scene;
 
+class ImageMetaData {
+public:
+	/* Must be set by image file or builtin callback (builtin_image_info_cb receives this struct). */
+	bool is_float, is_half;
+	int channels;
+	size_t width, height, depth;
+	bool builtin_free_cache;  /* NOTE(review): presumably ends up as the free_cache argument of the builtin pixel callbacks - confirm. */
+
+	/* Automatically set. */
+	ImageDataType type;
+	bool is_linear;
+};
+
 class ImageManager {
 public:
 	explicit ImageManager(const DeviceInfo& info);
 	~ImageManager();
 
-	enum ImageDataType {
-		IMAGE_DATA_TYPE_FLOAT4 = 0,
-		IMAGE_DATA_TYPE_BYTE4 = 1,
-		IMAGE_DATA_TYPE_HALF4 = 2,
-		IMAGE_DATA_TYPE_FLOAT = 3,
-		IMAGE_DATA_TYPE_BYTE = 4,
-		IMAGE_DATA_TYPE_HALF = 5,
-
-		IMAGE_DATA_NUM_TYPES
-	};
-
 	int add_image(const string& filename,
 	              void *builtin_data,
 	              bool animated,
 	              float frame,
-	              bool& is_float,
-	              bool& is_linear,
 	              InterpolationType interpolation,
 	              ExtensionType extension,
-	              bool use_alpha);
+	              bool use_alpha,
+	              ImageMetaData& metadata);
 	void remove_image(int flat_slot);
 	void remove_image(const string& filename,
 	                  void *builtin_data,
@@ -68,24 +68,25 @@ public:
 	                      InterpolationType interpolation,
 	                      ExtensionType extension,
 	                      bool use_alpha);
-	ImageDataType get_image_metadata(const string& filename, void *builtin_data, bool& is_linear);
+	bool get_image_metadata(const string& filename,
+	                        void *builtin_data,
+	                        ImageMetaData& metadata);
 
 	void device_update(Device *device,
-	                   DeviceScene *dscene,
 	                   Scene *scene,
 	                   Progress& progress);
 	void device_update_slot(Device *device,
-	                        DeviceScene *dscene,
 	                        Scene *scene,
 	                        int flat_slot,
 	                        Progress *progress);
-	void device_free(Device *device, DeviceScene *dscene);
-	void device_free_builtin(Device *device, DeviceScene *dscene);
+	void device_free(Device *device);
+	void device_free_builtin(Device *device);
 
 	void set_osl_texture_system(void *texture_system);
-	void set_pack_images(bool pack_images_);
 	bool set_animation_frame_update(int frame);
 
+	device_memory *image_memory(int flat_slot);
+
 	bool need_update;
 
 	/* NOTE: Here pixels_size is a size of storage, which equals to
@@ -94,23 +95,22 @@ public:
 	 */
 	function<void(const string &filename,
 	              void *data,
-	              bool &is_float,
-	              int &width,
-	              int &height,
-	              int &depth,
-	              int &channels)> builtin_image_info_cb;
+	              ImageMetaData& metadata)> builtin_image_info_cb;
 	function<bool(const string &filename,
 	              void *data,
 	              unsigned char *pixels,
-	              const size_t pixels_size)> builtin_image_pixels_cb;
+	              const size_t pixels_size,
+	              const bool free_cache)> builtin_image_pixels_cb;
 	function<bool(const string &filename,
 	              void *data,
 	              float *pixels,
-	              const size_t pixels_size)> builtin_image_float_pixels_cb;
+	              const size_t pixels_size,
+	              const bool free_cache)> builtin_image_float_pixels_cb;
 
 	struct Image {
 		string filename;
 		void *builtin_data;
+		bool builtin_free_cache;
 
 		bool use_alpha;
 		bool need_load;
@@ -119,21 +119,29 @@ public:
 		InterpolationType interpolation;
 		ExtensionType extension;
 
+		string mem_name;
+		device_memory *mem;
+
 		int users;
 	};
 
 private:
 	int tex_num_images[IMAGE_DATA_NUM_TYPES];
-	int tex_start_images[IMAGE_DATA_NUM_TYPES];
+	int max_num_images;
+	bool has_half_images;
 
 	thread_mutex device_mutex;
 	int animation_frame;
 
 	vector<Image*> images[IMAGE_DATA_NUM_TYPES];
 	void *osl_texture_system;
-	bool pack_images;
 
-	bool file_load_image_generic(Image *img, ImageInput **in, int &width, int &height, int &depth, int &components);
+	bool file_load_image_generic(Image *img,
+	                             ImageInput **in,
+	                             int &width,
+	                             int &height,
+	                             int &depth,
+	                             int &components);
 
 	template<TypeDesc::BASETYPE FileFormat,
 	         typename StorageType,
@@ -143,26 +151,19 @@ private:
 	                     int texture_limit,
 	                     device_vector<DeviceType>& tex_img);
 
+	int max_flattened_slot(ImageDataType type);
 	int type_index_to_flattened_slot(int slot, ImageDataType type);
 	int flattened_slot_to_type_index(int flat_slot, ImageDataType *type);
 	string name_from_type(int type);
 
-	uint8_t pack_image_options(ImageDataType type, size_t slot);
-
 	void device_load_image(Device *device,
-	                       DeviceScene *dscene,
 	                       Scene *scene,
 	                       ImageDataType type,
 	                       int slot,
 	                       Progress *progess);
 	void device_free_image(Device *device,
-	                       DeviceScene *dscene,
 	                       ImageDataType type,
 	                       int slot);
-
-	void device_pack_images(Device *device,
-	                        DeviceScene *dscene,
-	                        Progress& progess);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index 1ab0f9874f2..9c276bcab31 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -14,16 +14,17 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "integrator.h"
-#include "film.h"
-#include "light.h"
-#include "scene.h"
-#include "shader.h"
-#include "sobol.h"
-
-#include "util_foreach.h"
-#include "util_hash.h"
+#include "device/device.h"
+#include "render/background.h"
+#include "render/integrator.h"
+#include "render/film.h"
+#include "render/light.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/sobol.h"
+
+#include "util/util_foreach.h"
+#include "util/util_hash.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -31,7 +32,6 @@ NODE_DEFINE(Integrator)
 {
 	NodeType *type = NodeType::add("integrator", create);
 
-	SOCKET_INT(min_bounce, "Min Bounce", 2);
 	SOCKET_INT(max_bounce, "Max Bounce", 7);
 
 	SOCKET_INT(max_diffuse_bounce, "Max Diffuse Bounce", 7);
@@ -39,9 +39,7 @@ NODE_DEFINE(Integrator)
 	SOCKET_INT(max_transmission_bounce, "Max Transmission Bounce", 7);
 	SOCKET_INT(max_volume_bounce, "Max Volume Bounce", 7);
 
-	SOCKET_INT(transparent_min_bounce, "Transparent Min Bounce", 2);
 	SOCKET_INT(transparent_max_bounce, "Transparent Max Bounce", 7);
-	SOCKET_BOOLEAN(transparent_shadows, "Transparent Shadows", false);
 
 	SOCKET_INT(ao_bounces, "AO Bounces", 0);
 
@@ -104,7 +102,6 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 
 	/* integrator parameters */
 	kintegrator->max_bounce = max_bounce + 1;
-	kintegrator->min_bounce = min_bounce + 1;
 
 	kintegrator->max_diffuse_bounce = max_diffuse_bounce + 1;
 	kintegrator->max_glossy_bounce = max_glossy_bounce + 1;
@@ -112,7 +109,6 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	kintegrator->max_volume_bounce = max_volume_bounce + 1;
 
 	kintegrator->transparent_max_bounce = transparent_max_bounce + 1;
-	kintegrator->transparent_min_bounce = transparent_min_bounce + 1;
 
 	if(ao_bounces == 0) {
 		kintegrator->ao_bounces = INT_MAX;
@@ -125,19 +121,14 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	 * We only need to enable transparent shadows, if we actually have 
 	 * transparent shaders in the scene. Otherwise we can disable it
 	 * to improve performance a bit. */
-	if(transparent_shadows) {
-		kintegrator->transparent_shadows = false;
-		foreach(Shader *shader, scene->shaders) {
-			/* keep this in sync with SD_HAS_TRANSPARENT_SHADOW in shader.cpp */
-			if((shader->has_surface_transparent && shader->use_transparent_shadow) || shader->has_volume) {
-				kintegrator->transparent_shadows = true;
-				break;
-			}
+	kintegrator->transparent_shadows = false;
+	foreach(Shader *shader, scene->shaders) {
+		/* keep this in sync with SD_HAS_TRANSPARENT_SHADOW in shader.cpp */
+		if((shader->has_surface_transparent && shader->use_transparent_shadow) || shader->has_volume) {
+			kintegrator->transparent_shadows = true;
+			break;
 		}
 	}
-	else {
-		kintegrator->transparent_shadows = false;
-	}
 
 	kintegrator->volume_max_steps = volume_max_steps;
 	kintegrator->volume_step_size = volume_step_size;
@@ -155,6 +146,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	kintegrator->sample_clamp_indirect = (sample_clamp_indirect == 0.0f)? FLT_MAX: sample_clamp_indirect*3.0f;
 
 	kintegrator->branched = (method == BRANCHED_PATH);
+	kintegrator->volume_decoupled = device->info.has_volume_decoupled;
 	kintegrator->diffuse_samples = diffuse_samples;
 	kintegrator->glossy_samples = glossy_samples;
 	kintegrator->transmission_samples = transmission_samples;
@@ -195,16 +187,21 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 		max_samples = max(max_samples, volume_samples);
 	}
 
-	max_samples *= (max_bounce + transparent_max_bounce + 3 + BSSRDF_MAX_HITS);
+	uint total_bounces = max_bounce +
+	                     transparent_max_bounce + 3 +
+	                     VOLUME_BOUNDS_MAX +
+	                     max(BSSRDF_MAX_HITS, BSSRDF_MAX_BOUNCES);
+
+	max_samples *= total_bounces;
 
 	int dimensions = PRNG_BASE_NUM + max_samples*PRNG_BOUNCE_NUM;
 	dimensions = min(dimensions, SOBOL_MAX_DIMENSIONS);
 
-	uint *directions = dscene->sobol_directions.resize(SOBOL_BITS*dimensions);
+	uint *directions = dscene->sobol_directions.alloc(SOBOL_BITS*dimensions);
 
 	sobol_generate_direction_vectors((uint(*)[SOBOL_BITS])directions, dimensions);
 
-	device->tex_alloc("__sobol_directions", dscene->sobol_directions);
+	dscene->sobol_directions.copy_to_device();
 
 	/* Clamping. */
 	bool use_sample_clamp = (sample_clamp_direct != 0.0f ||
@@ -217,10 +214,9 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	need_update = false;
 }
 
-void Integrator::device_free(Device *device, DeviceScene *dscene)
+void Integrator::device_free(Device *, DeviceScene *dscene)  /* Device parameter is kept in the signature but no longer used. */
 {
-	device->tex_free(dscene->sobol_directions);
-	dscene->sobol_directions.clear();
+	dscene->sobol_directions.free();  /* One call on the vector replaces the former tex_free() + clear() pair. */
 }
 
 bool Integrator::modified(const Integrator& integrator)
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 27fff4831e5..3cb430d72b4 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -17,9 +17,9 @@
 #ifndef __INTEGRATOR_H__
 #define __INTEGRATOR_H__
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "node.h"
+#include "graph/node.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -29,9 +29,8 @@ class Scene;
 
 class Integrator : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
-	int min_bounce;
 	int max_bounce;
 
 	int max_diffuse_bounce;
@@ -39,9 +38,7 @@ public:
 	int max_transmission_bounce;
 	int max_volume_bounce;
 
-	int transparent_min_bounce;
 	int transparent_max_bounce;
-	bool transparent_shadows;
 
 	int ao_bounces;
 
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 6a4557506c3..8dec7e4ea64 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -14,19 +14,19 @@
  * limitations under the License.
  */
 
-#include "background.h"
-#include "device.h"
-#include "integrator.h"
-#include "film.h"
-#include "light.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "shader.h"
-
-#include "util_foreach.h"
-#include "util_progress.h"
-#include "util_logging.h"
+#include "render/background.h"
+#include "device/device.h"
+#include "render/integrator.h"
+#include "render/film.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/shader.h"
+
+#include "util/util_foreach.h"
+#include "util/util_progress.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -36,10 +36,10 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res
 	int width = res;
 	int height = res;
 
-	device_vector<uint4> d_input;
-	device_vector<float4> d_output;
+	device_vector<uint4> d_input(device, "background_input", MEM_READ_ONLY);
+	device_vector<float4> d_output(device, "background_output", MEM_READ_WRITE);
 
-	uint4 *d_input_data = d_input.resize(width*height);
+	uint4 *d_input_data = d_input.alloc(width*height);
 
 	for(int y = 0; y < height; y++) {
 		for(int x = 0; x < width; x++) {
@@ -52,15 +52,12 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res
 	}
 
 	/* compute on device */
-	d_output.resize(width*height);
-	memset((void*)d_output.data_pointer, 0, d_output.memory_size());
+	d_output.alloc(width*height);
+	d_output.zero_to_device();
+	d_input.copy_to_device();
 
 	device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
 
-	device->mem_alloc(d_input, MEM_READ_ONLY);
-	device->mem_copy_to(d_input);
-	device->mem_alloc(d_output, MEM_WRITE_ONLY);
-
 	DeviceTask main_task(DeviceTask::SHADER);
 	main_task.shader_input = d_input.device_pointer;
 	main_task.shader_output = d_output.device_pointer;
@@ -77,15 +74,12 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res
 	foreach(DeviceTask& task, split_tasks) {
 		device->task_add(task);
 		device->task_wait();
-		device->mem_copy_from(d_output, task.shader_x, 1, task.shader_w, sizeof(float4));
+		d_output.copy_from_device(task.shader_x, 1, task.shader_w);
 	}
 
-	device->mem_free(d_input);
-	device->mem_free(d_output);
-
-	d_input.clear();
+	d_input.free();
 
-	float4 *d_output_data = reinterpret_cast<float4*>(d_output.data_pointer);
+	float4 *d_output_data = d_output.data();
 
 	pixels.resize(width*height);
 
@@ -96,6 +90,8 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res
 			pixels[y*width + x].z = d_output_data[y*width + x].z;
 		}
 	}
+
+	d_output.free();
 }
 
 /* Light */
@@ -138,6 +134,7 @@ NODE_DEFINE(Light)
 
 	SOCKET_INT(samples, "Samples", 1);
 	SOCKET_INT(max_bounces, "Max Bounces", 1024);
+	SOCKET_UINT(random_id, "Random ID", 0);
 
 	SOCKET_BOOLEAN(is_portal, "Is Portal", false);
 	SOCKET_BOOLEAN(is_enabled, "Is Enabled", true);
@@ -224,12 +221,12 @@ void LightManager::disable_ineffective_light(Device *device, Scene *scene)
 
 bool LightManager::object_usable_as_light(Object *object) {
 	Mesh *mesh = object->mesh;
-	/* Skip if we are not visible for BSDFs. */
-	if(!(object->visibility & (PATH_RAY_DIFFUSE|PATH_RAY_GLOSSY|PATH_RAY_TRANSMIT))) {
+	/* Skip objects with NaNs */
+	if(!object->bounds.valid()) {
 		return false;
 	}
-	/* Skip motion blurred deforming meshes, not supported yet. */
-	if(mesh->has_motion_blur()) {
+	/* Skip if we are not visible for BSDFs. */
+	if(!(object->visibility & (PATH_RAY_DIFFUSE|PATH_RAY_GLOSSY|PATH_RAY_TRANSMIT))) {
 		return false;
 	}
 	/* Skip if we have no emission shaders. */
@@ -245,7 +242,7 @@ bool LightManager::object_usable_as_light(Object *object) {
 	return false;
 }
 
-void LightManager::device_update_distribution(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
+void LightManager::device_update_distribution(Device *, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
 	progress.set_status("Updating Lights", "Computing distribution");
 
@@ -291,7 +288,7 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 	VLOG(1) << "Total " << num_distribution << " of light distribution primitives.";
 
 	/* emission area */
-	float4 *distribution = dscene->light_distribution.resize(num_distribution + 1);
+	KernelLightDistribution *distribution = dscene->light_distribution.alloc(num_distribution + 1);
 	float totarea = 0.0f;
 
 	/* triangles */
@@ -337,13 +334,16 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 			                         : scene->default_surface;
 
 			if(shader->use_mis && shader->has_surface_emission) {
-				distribution[offset].x = totarea;
-				distribution[offset].y = __int_as_float(i + mesh->tri_offset);
-				distribution[offset].z = __int_as_float(shader_flag);
-				distribution[offset].w = __int_as_float(object_id);
+				distribution[offset].totarea = totarea;
+				distribution[offset].prim = i + mesh->tri_offset;
+				distribution[offset].mesh_light.shader_flag = shader_flag;
+				distribution[offset].mesh_light.object_id = object_id;
 				offset++;
 
 				Mesh::Triangle t = mesh->get_triangle(i);
+				if(!t.valid(&mesh->verts[0])) {
+					continue;
+				}
 				float3 p1 = mesh->verts[t.v[0]];
 				float3 p2 = mesh->verts[t.v[1]];
 				float3 p3 = mesh->verts[t.v[2]];
@@ -372,10 +372,10 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 		if(!light->is_enabled)
 			continue;
 
-		distribution[offset].x = totarea;
-		distribution[offset].y = __int_as_float(~light_index);
-		distribution[offset].z = 1.0f;
-		distribution[offset].w = light->size;
+		distribution[offset].totarea = totarea;
+		distribution[offset].prim = ~light_index;
+		distribution[offset].lamp.pad = 1.0f;
+		distribution[offset].lamp.size = light->size;
 		totarea += lightarea;
 
 		if(light->size > 0.0f && light->use_mis)
@@ -390,15 +390,15 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 	}
 
 	/* normalize cumulative distribution functions */
-	distribution[num_distribution].x = totarea;
-	distribution[num_distribution].y = 0.0f;
-	distribution[num_distribution].z = 0.0f;
-	distribution[num_distribution].w = 0.0f;
+	distribution[num_distribution].totarea = totarea;  /* Final CDF entry: holds the accumulated total area. */
+	distribution[num_distribution].prim = 0;  /* prim is assigned integer indices everywhere else; use an integer zero here too. */
+	distribution[num_distribution].lamp.pad = 0.0f;
+	distribution[num_distribution].lamp.size = 0.0f;
 
 	if(totarea > 0.0f) {
 		for(size_t i = 0; i < num_distribution; i++)
-			distribution[i].x /= totarea;
-		distribution[num_distribution].x = 1.0f;
+			distribution[i].totarea /= totarea;
+		distribution[num_distribution].totarea = 1.0f;
 	}
 
 	if(progress.get_cancel()) return;
@@ -415,7 +415,6 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 		/* precompute pdfs */
 		kintegrator->pdf_triangles = 0.0f;
 		kintegrator->pdf_lights = 0.0f;
-		kintegrator->inv_pdf_lights = 0.0f;
 
 		/* sample one, with 0.5 probability of light or triangle */
 		kintegrator->num_all_lights = num_lights;
@@ -430,8 +429,6 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 			kintegrator->pdf_lights = 1.0f/num_lights;
 			if(trianglearea > 0.0f)
 				kintegrator->pdf_lights *= 0.5f;
-
-			kintegrator->inv_pdf_lights = 1.0f/kintegrator->pdf_lights;
 		}
 
 		kintegrator->use_lamp_mis = use_lamp_mis;
@@ -447,7 +444,7 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 			kfilm->pass_shadow_scale *= (float)(num_lights - num_background_lights)/(float)num_lights;
 
 		/* CDF */
-		device->tex_alloc("__light_distribution", dscene->light_distribution);
+		dscene->light_distribution.copy_to_device();
 
 		/* Portals */
 		if(num_portals > 0) {
@@ -462,13 +459,12 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 		}
 	}
 	else {
-		dscene->light_distribution.clear();
+		dscene->light_distribution.free();
 
 		kintegrator->num_distribution = 0;
 		kintegrator->num_all_lights = 0;
 		kintegrator->pdf_triangles = 0.0f;
 		kintegrator->pdf_lights = 0.0f;
-		kintegrator->inv_pdf_lights = 0.0f;
 		kintegrator->use_lamp_mis = false;
 		kintegrator->num_portals = 0;
 		kintegrator->portal_offset = 0;
@@ -486,18 +482,10 @@ static void background_cdf(int start,
                            float2 *cond_cdf)
 {
 	/* Conditional CDFs (rows, U direction). */
-	/* NOTE: It is possible to have some NaN pixels on background
-	 * which will ruin CDF causing wrong shading. We replace such
-	 * pixels with black.
-	 */
 	for(int i = start; i < end; i++) {
 		float sin_theta = sinf(M_PI_F * (i + 0.5f) / res);
 		float3 env_color = (*pixels)[i * res];
 		float ave_luminance = average(env_color);
-		/* TODO(sergey): Consider adding average_safe(). */
-		if(!isfinite(ave_luminance)) {
-			ave_luminance = 0.0f;
-		}
 
 		cond_cdf[i * cdf_count].x = ave_luminance * sin_theta;
 		cond_cdf[i * cdf_count].y = 0.0f;
@@ -505,9 +493,6 @@ static void background_cdf(int start,
 		for(int j = 1; j < res; j++) {
 			env_color = (*pixels)[i * res + j];
 			ave_luminance = average(env_color);
-			if(!isfinite(ave_luminance)) {
-				ave_luminance = 0.0f;
-			}
 
 			cond_cdf[i * cdf_count + j].x = ave_luminance * sin_theta;
 			cond_cdf[i * cdf_count + j].y = cond_cdf[i * cdf_count + j - 1].y + cond_cdf[i * cdf_count + j - 1].x / res;
@@ -568,8 +553,8 @@ void LightManager::device_update_background(Device *device,
 
 	/* build row distributions and column distribution for the infinite area environment light */
 	int cdf_count = res + 1;
-	float2 *marg_cdf = dscene->light_background_marginal_cdf.resize(cdf_count);
-	float2 *cond_cdf = dscene->light_background_conditional_cdf.resize(cdf_count * cdf_count);
+	float2 *marg_cdf = dscene->light_background_marginal_cdf.alloc(cdf_count);
+	float2 *cond_cdf = dscene->light_background_conditional_cdf.alloc(cdf_count * cdf_count);
 
 	double time_start = time_dt();
 	if(res < 512) {
@@ -618,11 +603,11 @@ void LightManager::device_update_background(Device *device,
 	VLOG(2) << "Background MIS build time " << time_dt() - time_start << "\n";
 
 	/* update device */
-	device->tex_alloc("__light_background_marginal_cdf", dscene->light_background_marginal_cdf);
-	device->tex_alloc("__light_background_conditional_cdf", dscene->light_background_conditional_cdf);
+	dscene->light_background_marginal_cdf.copy_to_device();
+	dscene->light_background_conditional_cdf.copy_to_device();
 }
 
-void LightManager::device_update_points(Device *device,
+void LightManager::device_update_points(Device *,
                                         DeviceScene *dscene,
                                         Scene *scene)
 {
@@ -635,7 +620,7 @@ void LightManager::device_update_points(Device *device,
 		}
 	}
 
-	float4 *light_data = dscene->light_data.resize(num_lights*LIGHT_SIZE);
+	KernelLight *klights = dscene->lights.alloc(num_lights);
 
 	if(num_lights == 0) {
 		VLOG(1) << "No effective light, ignoring points update.";
@@ -652,8 +637,9 @@ void LightManager::device_update_points(Device *device,
 		float3 co = light->co;
 		Shader *shader = (light->shader) ? light->shader : scene->default_light;
 		int shader_id = scene->shader_manager->get_shader_id(shader);
-		float samples = __int_as_float(light->samples);
-		float max_bounces = __int_as_float(light->max_bounces);
+		int samples = light->samples;
+		int max_bounces = light->max_bounces;
+		float random = (float)light->random_id * (1.0f/(float)0xFFFFFFFF);
 
 		if(!light->cast_shadow)
 			shader_id &= ~SHADER_CAST_SHADOW;
@@ -675,6 +661,9 @@ void LightManager::device_update_points(Device *device,
 			use_light_visibility = true;
 		}
 
+		klights[light_index].type = light->type;
+		klights[light_index].samples = samples;
+
 		if(light->type == LIGHT_POINT) {
 			shader_id &= ~SHADER_AREA_LIGHT;
 
@@ -684,10 +673,12 @@ void LightManager::device_update_points(Device *device,
 			if(light->use_mis && radius > 0.0f)
 				shader_id |= SHADER_USE_MIS;
 
-			light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z);
-			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, invarea, 0.0f);
-			light_data[light_index*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
+			klights[light_index].co[0] = co.x;
+			klights[light_index].co[1] = co.y;
+			klights[light_index].co[2] = co.z;
+
+			klights[light_index].spot.radius = radius;
+			klights[light_index].spot.invarea = invarea;
 		}
 		else if(light->type == LIGHT_DISTANT) {
 			shader_id &= ~SHADER_AREA_LIGHT;
@@ -704,10 +695,13 @@ void LightManager::device_update_points(Device *device,
 			if(light->use_mis && area > 0.0f)
 				shader_id |= SHADER_USE_MIS;
 
-			light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), dir.x, dir.y, dir.z);
-			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, cosangle, invarea);
-			light_data[light_index*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
+			klights[light_index].co[0] = dir.x;
+			klights[light_index].co[1] = dir.y;
+			klights[light_index].co[2] = dir.z;
+
+			klights[light_index].distant.invarea = invarea;
+			klights[light_index].distant.radius = radius;
+			klights[light_index].distant.cosangle = cosangle;
 		}
 		else if(light->type == LIGHT_BACKGROUND) {
 			uint visibility = scene->background->visibility;
@@ -731,11 +725,6 @@ void LightManager::device_update_points(Device *device,
 				shader_id |= SHADER_EXCLUDE_SCATTER;
 				use_light_visibility = true;
 			}
-
-			light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), 0.0f, 0.0f, 0.0f);
-			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), 0.0f, 0.0f, 0.0f);
-			light_data[light_index*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
 		}
 		else if(light->type == LIGHT_AREA) {
 			float3 axisu = light->axisu*(light->sizeu*light->size);
@@ -749,10 +738,20 @@ void LightManager::device_update_points(Device *device,
 			if(light->use_mis && area > 0.0f)
 				shader_id |= SHADER_USE_MIS;
 
-			light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z);
-			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), axisu.x, axisu.y, axisu.z);
-			light_data[light_index*LIGHT_SIZE + 2] = make_float4(invarea, axisv.x, axisv.y, axisv.z);
-			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, dir.x, dir.y, dir.z);
+			klights[light_index].co[0] = co.x;
+			klights[light_index].co[1] = co.y;
+			klights[light_index].co[2] = co.z;
+
+			klights[light_index].area.axisu[0] = axisu.x;
+			klights[light_index].area.axisu[1] = axisu.y;
+			klights[light_index].area.axisu[2] = axisu.z;
+			klights[light_index].area.axisv[0] = axisv.x;
+			klights[light_index].area.axisv[1] = axisv.y;
+			klights[light_index].area.axisv[2] = axisv.z;
+			klights[light_index].area.invarea = invarea;
+			klights[light_index].area.dir[0] = dir.x;
+			klights[light_index].area.dir[1] = dir.y;
+			klights[light_index].area.dir[2] = dir.z;
 		}
 		else if(light->type == LIGHT_SPOT) {
 			shader_id &= ~SHADER_AREA_LIGHT;
@@ -768,18 +767,26 @@ void LightManager::device_update_points(Device *device,
 			if(light->use_mis && radius > 0.0f)
 				shader_id |= SHADER_USE_MIS;
 
-			light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z);
-			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, invarea, spot_angle);
-			light_data[light_index*LIGHT_SIZE + 2] = make_float4(spot_smooth, dir.x, dir.y, dir.z);
-			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
+			klights[light_index].co[0] = co.x;
+			klights[light_index].co[1] = co.y;
+			klights[light_index].co[2] = co.z;
+
+			klights[light_index].spot.radius = radius;
+			klights[light_index].spot.invarea = invarea;
+			klights[light_index].spot.spot_angle = spot_angle;
+			klights[light_index].spot.spot_smooth = spot_smooth;
+			klights[light_index].spot.dir[0] = dir.x;
+			klights[light_index].spot.dir[1] = dir.y;
+			klights[light_index].spot.dir[2] = dir.z;
 		}
 
-		light_data[light_index*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
+		klights[light_index].shader_id = shader_id;
 
-		Transform tfm = light->tfm;
-		Transform itfm = transform_inverse(tfm);
-		memcpy(&light_data[light_index*LIGHT_SIZE + 5], &tfm, sizeof(float4)*3);
-		memcpy(&light_data[light_index*LIGHT_SIZE + 8], &itfm, sizeof(float4)*3);
+		klights[light_index].max_bounces = max_bounces;
+		klights[light_index].random = random;
+
+		klights[light_index].tfm = light->tfm;
+		klights[light_index].itfm = transform_inverse(light->tfm);
 
 		light_index++;
 	}
@@ -796,21 +803,27 @@ void LightManager::device_update_points(Device *device,
 		float3 axisu = light->axisu*(light->sizeu*light->size);
 		float3 axisv = light->axisv*(light->sizev*light->size);
 		float area = len(axisu)*len(axisv);
-		float invarea = (area > 0.0f) ? 1.0f / area : 1.0f;
+		float invarea = (area > 0.0f)? 1.0f/area: 1.0f;
 		float3 dir = light->dir;
 
 		dir = safe_normalize(dir);
 
-		light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z);
-		light_data[light_index*LIGHT_SIZE + 1] = make_float4(area, axisu.x, axisu.y, axisu.z);
-		light_data[light_index*LIGHT_SIZE + 2] = make_float4(invarea, axisv.x, axisv.y, axisv.z);
-		light_data[light_index*LIGHT_SIZE + 3] = make_float4(-1, dir.x, dir.y, dir.z);
-		light_data[light_index*LIGHT_SIZE + 4] = make_float4(-1, 0.0f, 0.0f, 0.0f);
-
-		Transform tfm = light->tfm;
-		Transform itfm = transform_inverse(tfm);
-		memcpy(&light_data[light_index*LIGHT_SIZE + 5], &tfm, sizeof(float4)*3);
-		memcpy(&light_data[light_index*LIGHT_SIZE + 8], &itfm, sizeof(float4)*3);
+		klights[light_index].co[0] = co.x;
+		klights[light_index].co[1] = co.y;
+		klights[light_index].co[2] = co.z;
+
+		klights[light_index].area.axisu[0] = axisu.x;
+		klights[light_index].area.axisu[1] = axisu.y;
+		klights[light_index].area.axisu[2] = axisu.z;
+		klights[light_index].area.axisv[0] = axisv.x;
+		klights[light_index].area.axisv[1] = axisv.y;
+		klights[light_index].area.axisv[2] = axisv.z;
+		klights[light_index].area.invarea = invarea;
+		klights[light_index].area.dir[0] = dir.x;
+		klights[light_index].area.dir[1] = dir.y;
+		klights[light_index].area.dir[2] = dir.z;
+		klights[light_index].tfm = light->tfm;
+		klights[light_index].itfm = transform_inverse(light->tfm);
 
 		light_index++;
 	}
@@ -820,7 +833,7 @@ void LightManager::device_update_points(Device *device,
 	VLOG(1) << "Number of lights without contribution: "
 	        << num_scene_lights - light_index;
 
-	device->tex_alloc("__light_data", dscene->light_data);
+	dscene->lights.copy_to_device();
 }
 
 void LightManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
@@ -853,17 +866,12 @@ void LightManager::device_update(Device *device, DeviceScene *dscene, Scene *sce
 	need_update = false;
 }
 
-void LightManager::device_free(Device *device, DeviceScene *dscene)
+void LightManager::device_free(Device *, DeviceScene *dscene)  /* Device parameter is kept in the signature but no longer used. */
 {
-	device->tex_free(dscene->light_distribution);
-	device->tex_free(dscene->light_data);
-	device->tex_free(dscene->light_background_marginal_cdf);
-	device->tex_free(dscene->light_background_conditional_cdf);
-
-	dscene->light_distribution.clear();
-	dscene->light_data.clear();
-	dscene->light_background_marginal_cdf.clear();
-	dscene->light_background_conditional_cdf.clear();
+	dscene->light_distribution.free();  /* Each free() stands in for the former tex_free() + clear() pair. */
+	dscene->lights.free();  /* Takes the place of the removed light_data array. */
+	dscene->light_background_marginal_cdf.free();
+	dscene->light_background_conditional_cdf.free();
 }
 
 void LightManager::tag_update(Scene * /*scene*/)
diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h
index f56530b6490..97b7b971c73 100644
--- a/intern/cycles/render/light.h
+++ b/intern/cycles/render/light.h
@@ -17,12 +17,12 @@
 #ifndef __LIGHT_H__
 #define __LIGHT_H__
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "node.h"
+#include "graph/node.h"
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -70,6 +70,7 @@ public:
 	Shader *shader;
 	int samples;
 	int max_bounces;
+	uint random_id;
 
 	void tag_update(Scene *scene);
 
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index b7660297f3e..7cfbb7b7c7d 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -14,29 +14,29 @@
  * limitations under the License.
  */
 
-#include "bvh.h"
-#include "bvh_build.h"
-
-#include "camera.h"
-#include "curves.h"
-#include "device.h"
-#include "graph.h"
-#include "shader.h"
-#include "light.h"
-#include "mesh.h"
-#include "nodes.h"
-#include "object.h"
-#include "scene.h"
-
-#include "osl_globals.h"
-
-#include "subd_split.h"
-#include "subd_patch_table.h"
-
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_progress.h"
-#include "util_set.h"
+#include "bvh/bvh.h"
+#include "bvh/bvh_build.h"
+
+#include "render/camera.h"
+#include "render/curves.h"
+#include "device/device.h"
+#include "render/graph.h"
+#include "render/shader.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/nodes.h"
+#include "render/object.h"
+#include "render/scene.h"
+
+#include "kernel/osl/osl_globals.h"
+
+#include "subd/subd_split.h"
+#include "subd/subd_patch_table.h"
+
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_set.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -107,6 +107,26 @@ void Mesh::Triangle::verts_for_step(const float3 *verts,
 	}
 }
 
+float3 Mesh::Triangle::compute_normal(const float3 *verts) const  /* Unit face normal of this triangle, using positions from verts. */
+{
+	const float3& v0 = verts[v[0]];  /* Corner positions, looked up through the triangle's vertex indices. */
+	const float3& v1 = verts[v[1]];
+	const float3& v2 = verts[v[2]];
+	const float3 norm = cross(v1 - v0, v2 - v0);  /* Unnormalized normal from the two edges leaving v0. */
+	const float normlen = len(norm);
+	if(normlen == 0.0f) {  /* Zero-length cross product: return a fixed +X axis instead of dividing by zero. */
+		return make_float3(1.0f, 0.0f, 0.0f);
+	}
+	return norm / normlen;
+}
+
+bool Mesh::Triangle::valid(const float3 *verts) const  /* True only when all three corner positions pass isfinite3_safe(). */
+{
+	return isfinite3_safe(verts[v[0]]) &&
+	       isfinite3_safe(verts[v[1]]) &&
+	       isfinite3_safe(verts[v[2]]);
+}
+
 /* Curve */
 
 void Mesh::Curve::bounds_grow(const int k, const float3 *curve_keys, const float *curve_radius, BoundBox& bounds) const
@@ -416,6 +436,8 @@ Mesh::Mesh()
 	face_offset = 0;
 	corner_offset = 0;
 
+	attr_map_offset = 0;
+
 	num_subd_verts = 0;
 
 	attributes.triangle_mesh = this;
@@ -424,6 +446,7 @@ Mesh::Mesh()
 
 	geometry_flags = GEOMETRY_NONE;
 
+	volume_isovalue = 0.001f;
 	has_volume = false;
 	has_surface_bssrdf = false;
 
@@ -511,7 +534,7 @@ void Mesh::reserve_subd_faces(int numfaces, int num_ngons_, int numcorners)
 	subd_attributes.resize(true);
 }
 
-void Mesh::clear()
+void Mesh::clear(bool preserve_voxel_data)
 {
 	/* clear all verts and triangles */
 	verts.clear();
@@ -534,15 +557,19 @@ void Mesh::clear()
 
 	subd_creases.clear();
 
-	attributes.clear();
 	curve_attributes.clear();
 	subd_attributes.clear();
+	attributes.clear(preserve_voxel_data);
+
 	used_shaders.clear();
 
+	if(!preserve_voxel_data) {
+		geometry_flags = GEOMETRY_NONE;
+	}
+
 	transform_applied = false;
 	transform_negative_scaled = false;
 	transform_normal = transform_identity();
-	geometry_flags = GEOMETRY_NONE;
 
 	delete patch_table;
 	patch_table = NULL;
@@ -701,21 +728,6 @@ void Mesh::compute_bounds()
 	bounds = bnds;
 }
 
-static float3 compute_face_normal(const Mesh::Triangle& t, float3 *verts)
-{
-	float3 v0 = verts[t.v[0]];
-	float3 v1 = verts[t.v[1]];
-	float3 v2 = verts[t.v[2]];
-
-	float3 norm = cross(v1 - v0, v2 - v0);
-	float normlen = len(norm);
-
-	if(normlen == 0.0f)
-		return make_float3(1.0f, 0.0f, 0.0f);
-
-	return norm / normlen;
-}
-
 void Mesh::add_face_normals()
 {
 	/* don't compute if already there */
@@ -733,7 +745,7 @@ void Mesh::add_face_normals()
 		float3 *verts_ptr = verts.data();
 
 		for(size_t i = 0; i < triangles_size; i++) {
-			fN[i] = compute_face_normal(get_triangle(i), verts_ptr);
+			fN[i] = get_triangle(i).compute_normal(verts_ptr);
 		}
 	}
 
@@ -795,7 +807,7 @@ void Mesh::add_vertex_normals()
 
 			for(size_t i = 0; i < triangles_size; i++) {
 				for(size_t j = 0; j < 3; j++) {
-					float3 fN = compute_face_normal(get_triangle(i), mP);
+					float3 fN = get_triangle(i).compute_normal(mP);
 					mN[get_triangle(i).v[j]] += fN;
 				}
 			}
@@ -865,15 +877,8 @@ void Mesh::add_undisplaced()
 	}
 }
 
-void Mesh::pack_normals(Scene *scene, uint *tri_shader, float4 *vnormal)
+void Mesh::pack_shaders(Scene *scene, uint *tri_shader)
 {
-	Attribute *attr_vN = attributes.find(ATTR_STD_VERTEX_NORMAL);
-	if(attr_vN == NULL) {
-		/* Happens on objects with just hair. */
-		return;
-	}
-
-	float3 *vN = attr_vN->data_float3();
 	uint shader_id = 0;
 	uint last_shader = -1;
 	bool last_smooth = false;
@@ -881,10 +886,6 @@ void Mesh::pack_normals(Scene *scene, uint *tri_shader, float4 *vnormal)
 	size_t triangles_size = num_triangles();
 	int *shader_ptr = shader.data();
 
-	bool do_transform = transform_applied;
-	Transform ntfm = transform_normal;
-
-	/* save shader */
 	for(size_t i = 0; i < triangles_size; i++) {
 		if(shader_ptr[i] != last_shader || last_smooth != smooth[i]) {
 			last_shader = shader_ptr[i];
@@ -896,14 +897,27 @@ void Mesh::pack_normals(Scene *scene, uint *tri_shader, float4 *vnormal)
 
 		tri_shader[i] = shader_id;
 	}
+}
+
+void Mesh::pack_normals(float4 *vnormal)
+{
+	Attribute *attr_vN = attributes.find(ATTR_STD_VERTEX_NORMAL);
+	if(attr_vN == NULL) {
+		/* Happens on objects with just hair. */
+		return;
+	}
 
+	bool do_transform = transform_applied;
+	Transform ntfm = transform_normal;
+
+	float3 *vN = attr_vN->data_float3();
 	size_t verts_size = verts.size();
 
 	for(size_t i = 0; i < verts_size; i++) {
 		float3 vNi = vN[i];
 
 		if(do_transform)
-			vNi = normalize(transform_direction(&ntfm, vNi));
+			vNi = safe_normalize(transform_direction(&ntfm, vNi));
 
 		vnormal[i] = make_float4(vNi.x, vNi.y, vNi.z, 0.0f);
 	}
@@ -1016,7 +1030,8 @@ void Mesh::pack_patches(uint *patch_data, uint vert_offset, uint face_offset, ui
 	}
 }
 
-void Mesh::compute_bvh(DeviceScene *dscene,
+void Mesh::compute_bvh(Device *device,
+                       DeviceScene *dscene,
                        SceneParams *params,
                        Progress *progress,
                        int n,
@@ -1050,7 +1065,9 @@ void Mesh::compute_bvh(DeviceScene *dscene,
 
 			BVHParams bparams;
 			bparams.use_spatial_split = params->use_bvh_spatial_split;
-			bparams.use_qbvh = params->use_qbvh;
+			bparams.bvh_layout = BVHParams::best_bvh_layout(
+			        params->bvh_layout,
+			        device->info.bvh_layout_mask);
 			bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
 			                              params->use_bvh_unaligned_nodes;
 			bparams.num_motion_triangle_steps = params->num_bvh_time_steps;
@@ -1102,6 +1119,32 @@ bool Mesh::has_true_displacement() const
 	return false;
 }
 
+float Mesh::motion_time(int step) const  /* Time of a motion step: step 0 maps to -1, step motion_steps-1 maps to +1. */
+{
+	return (motion_steps > 1) ? 2.0f * step / (motion_steps - 1) - 1.0f : 0.0f;  /* 0 when there is at most one step. */
+}
+
+int Mesh::motion_step(float time) const  /* Attribute step index whose motion_time() equals time, or -1 if none does. */
+{
+	if(motion_steps > 1) {
+		int attr_step = 0;  /* Number of non-center steps passed so far. */
+
+		for(int step = 0; step < motion_steps; step++) {
+			float step_time = motion_time(step);
+			if(step_time == time) {  /* Exact float comparison: time has to compare equal to a motion_time() result. */
+				return attr_step;  /* NOTE(review): for the center step this equals the index of the step after it - confirm callers never pass the center time. */
+			}
+
+			/* Center step is stored in a separate attribute. */
+			if(step != motion_steps / 2) {
+				attr_step++;
+			}
+		}
+	}
+
+	return -1;  /* motion_steps <= 1, or no step time matched. */
+}
+
 bool Mesh::need_build_bvh() const
 {
 	return !transform_applied || has_surface_bssrdf;
@@ -1121,14 +1164,12 @@ bool Mesh::is_instanced() const
 
 MeshManager::MeshManager()
 {
-	bvh = NULL;
 	need_update = true;
 	need_flags_update = true;
 }
 
 MeshManager::~MeshManager()
 {
-	delete bvh;
 }
 
 void MeshManager::update_osl_attributes(Device *device, Scene *scene, vector<AttributeRequestSet>& mesh_attributes)
@@ -1248,39 +1289,33 @@ void MeshManager::update_osl_attributes(Device *device, Scene *scene, vector<Att
 #endif
 }
 
-void MeshManager::update_svm_attributes(Device *device, DeviceScene *dscene, Scene *scene, vector<AttributeRequestSet>& mesh_attributes)
+void MeshManager::update_svm_attributes(Device *, DeviceScene *dscene, Scene *scene, vector<AttributeRequestSet>& mesh_attributes)
 {
 	/* for SVM, the attributes_map table is used to lookup the offset of an
 	 * attribute, based on a unique shader attribute id. */
 
 	/* compute array stride */
-	int attr_map_stride = 0;
+	int attr_map_size = 0;
 
-	for(size_t i = 0; i < scene->meshes.size(); i++)
-		attr_map_stride = max(attr_map_stride, (mesh_attributes[i].size() + 1)*ATTR_PRIM_TYPES);
+	for(size_t i = 0; i < scene->meshes.size(); i++) {
+		Mesh *mesh = scene->meshes[i];
+		mesh->attr_map_offset = attr_map_size;
+		attr_map_size += (mesh_attributes[i].size() + 1)*ATTR_PRIM_TYPES;
+	}
 
-	if(attr_map_stride == 0)
+	if(attr_map_size == 0)
 		return;
 
 	/* create attribute map */
-	uint4 *attr_map = dscene->attributes_map.resize(attr_map_stride*scene->objects.size());
-	memset(attr_map, 0, dscene->attributes_map.size()*sizeof(uint));
+	uint4 *attr_map = dscene->attributes_map.alloc(attr_map_size);  /* attr_map_size is already summed over every mesh in the loop above. */
+	memset(attr_map, 0, dscene->attributes_map.size()*sizeof(uint4));  /* Clear whole uint4 elements; sizeof(uint) zeroed only a quarter of the array. */
 
-	for(size_t i = 0; i < scene->objects.size(); i++) {
-		Object *object = scene->objects[i];
-		Mesh *mesh = object->mesh;
-
-		/* find mesh attributes */
-		size_t j;
-
-		for(j = 0; j < scene->meshes.size(); j++)
-			if(scene->meshes[j] == mesh)
-				break;
-
-		AttributeRequestSet& attributes = mesh_attributes[j];
+	for(size_t i = 0; i < scene->meshes.size(); i++) {
+		Mesh *mesh = scene->meshes[i];
+		AttributeRequestSet& attributes = mesh_attributes[i];
 
 		/* set object attributes */
-		int index = i*attr_map_stride;
+		int index = mesh->attr_map_offset;
 
 		foreach(AttributeRequest& req, attributes.requests) {
 			uint id;
@@ -1354,8 +1389,7 @@ void MeshManager::update_svm_attributes(Device *device, DeviceScene *dscene, Sce
 	}
 
 	/* copy to device */
-	dscene->data.bvh.attributes_map_stride = attr_map_stride;
-	device->tex_alloc("__attributes_map", dscene->attributes_map);
+	dscene->attributes_map.copy_to_device();
 }
 
 static void update_attribute_element_size(Mesh *mesh,
@@ -1387,11 +1421,11 @@ static void update_attribute_element_size(Mesh *mesh,
 }
 
 static void update_attribute_element_offset(Mesh *mesh,
-                                            vector<float>& attr_float,
+                                            device_vector<float>& attr_float,
                                             size_t& attr_float_offset,
-                                            vector<float4>& attr_float3,
+                                            device_vector<float4>& attr_float3,
                                             size_t& attr_float3_offset,
-                                            vector<uchar4>& attr_uchar4,
+                                            device_vector<uchar4>& attr_uchar4,
                                             size_t& attr_uchar4_offset,
                                             Attribute *mattr,
                                             AttributePrimitive prim,
@@ -1419,7 +1453,7 @@ static void update_attribute_element_offset(Mesh *mesh,
 			uchar4 *data = mattr->data_uchar4();
 			offset = attr_uchar4_offset;
 
-			assert(attr_uchar4.capacity() >= offset + size);
+			assert(attr_uchar4.size() >= offset + size);
 			for(size_t k = 0; k < size; k++) {
 				attr_uchar4[offset+k] = data[k];
 			}
@@ -1429,7 +1463,7 @@ static void update_attribute_element_offset(Mesh *mesh,
 			float *data = mattr->data_float();
 			offset = attr_float_offset;
 
-			assert(attr_float.capacity() >= offset + size);
+			assert(attr_float.size() >= offset + size);
 			for(size_t k = 0; k < size; k++) {
 				attr_float[offset+k] = data[k];
 			}
@@ -1439,17 +1473,17 @@ static void update_attribute_element_offset(Mesh *mesh,
 			Transform *tfm = mattr->data_transform();
 			offset = attr_float3_offset;
 
-			assert(attr_float3.capacity() >= offset + size * 4);
-			for(size_t k = 0; k < size*4; k++) {
+			assert(attr_float3.size() >= offset + size * 3);
+			for(size_t k = 0; k < size*3; k++) {
 				attr_float3[offset+k] = (&tfm->x)[k];
 			}
-			attr_float3_offset += size * 4;
+			attr_float3_offset += size * 3;
 		}
 		else {
 			float4 *data = mattr->data_float4();
 			offset = attr_float3_offset;
 
-			assert(attr_float3.capacity() >= offset + size);
+			assert(attr_float3.size() >= offset + size);
 			for(size_t k = 0; k < size; k++) {
 				attr_float3[offset+k] = data[k];
 			}
@@ -1550,9 +1584,9 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene,
 		}
 	}
 
-	vector<float> attr_float(attr_float_size);
-	vector<float4> attr_float3(attr_float3_size);
-	vector<uchar4> attr_uchar4(attr_uchar4_size);
+	dscene->attributes_float.alloc(attr_float_size);
+	dscene->attributes_float3.alloc(attr_float3_size);
+	dscene->attributes_uchar4.alloc(attr_uchar4_size);
 
 	size_t attr_float_offset = 0;
 	size_t attr_float3_offset = 0;
@@ -1571,27 +1605,27 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene,
 			Attribute *subd_mattr = mesh->subd_attributes.find(req);
 
 			update_attribute_element_offset(mesh,
-			                                attr_float, attr_float_offset,
-			                                attr_float3, attr_float3_offset,
-			                                attr_uchar4, attr_uchar4_offset,
+			                                dscene->attributes_float, attr_float_offset,
+			                                dscene->attributes_float3, attr_float3_offset,
+			                                dscene->attributes_uchar4, attr_uchar4_offset,
 			                                triangle_mattr,
 			                                ATTR_PRIM_TRIANGLE,
 			                                req.triangle_type,
 			                                req.triangle_desc);
 
 			update_attribute_element_offset(mesh,
-			                                attr_float, attr_float_offset,
-			                                attr_float3, attr_float3_offset,
-			                                attr_uchar4, attr_uchar4_offset,
+			                                dscene->attributes_float, attr_float_offset,
+			                                dscene->attributes_float3, attr_float3_offset,
+			                                dscene->attributes_uchar4, attr_uchar4_offset,
 			                                curve_mattr,
 			                                ATTR_PRIM_CURVE,
 			                                req.curve_type,
 			                                req.curve_desc);
 
 			update_attribute_element_offset(mesh,
-			                                attr_float, attr_float_offset,
-			                                attr_float3, attr_float3_offset,
-			                                attr_uchar4, attr_uchar4_offset,
+			                                dscene->attributes_float, attr_float_offset,
+			                                dscene->attributes_float3, attr_float3_offset,
+			                                dscene->attributes_uchar4, attr_uchar4_offset,
 			                                subd_mattr,
 			                                ATTR_PRIM_SUBD,
 			                                req.subd_type,
@@ -1612,18 +1646,21 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene,
 	/* copy to device */
 	progress.set_status("Updating Mesh", "Copying Attributes to device");
 
-	if(attr_float.size()) {
-		dscene->attributes_float.copy(&attr_float[0], attr_float.size());
-		device->tex_alloc("__attributes_float", dscene->attributes_float);
+	if(dscene->attributes_float.size()) {
+		dscene->attributes_float.copy_to_device();
 	}
-	if(attr_float3.size()) {
-		dscene->attributes_float3.copy(&attr_float3[0], attr_float3.size());
-		device->tex_alloc("__attributes_float3", dscene->attributes_float3);
+	if(dscene->attributes_float3.size()) {
+		dscene->attributes_float3.copy_to_device();
 	}
-	if(attr_uchar4.size()) {
-		dscene->attributes_uchar4.copy(&attr_uchar4[0], attr_uchar4.size());
-		device->tex_alloc("__attributes_uchar4", dscene->attributes_uchar4);
+	if(dscene->attributes_uchar4.size()) {
+		dscene->attributes_uchar4.copy_to_device();
 	}
+
+	if(progress.get_cancel()) return;
+
+	/* After mesh attributes and patch tables have been copied to device memory,
+	 * we need to update offsets in the objects. */
+	scene->object_manager->device_update_mesh_offsets(device, dscene, scene);
 }
 
 void MeshManager::mesh_calc_offset(Scene *scene)
@@ -1670,7 +1707,7 @@ void MeshManager::mesh_calc_offset(Scene *scene)
 	}
 }
 
-void MeshManager::device_update_mesh(Device *device,
+void MeshManager::device_update_mesh(Device *,
                                      DeviceScene *dscene,
                                      Scene *scene,
                                      bool for_displacement,
@@ -1719,10 +1756,9 @@ void MeshManager::device_update_mesh(Device *device,
 		}
 	}
 	else {
-		PackedBVH& pack = bvh->pack;
-		for(size_t i = 0; i < pack.prim_index.size(); ++i) {
-			if((pack.prim_type[i] & PRIMITIVE_ALL_TRIANGLE) != 0) {
-				tri_prim_index[pack.prim_index[i]] = pack.prim_tri_index[i];
+		for(size_t i = 0; i < dscene->prim_index.size(); ++i) {
+			if((dscene->prim_type[i] & PRIMITIVE_ALL_TRIANGLE) != 0) {
+				tri_prim_index[dscene->prim_index[i]] = dscene->prim_tri_index[i];
 			}
 		}
 	}
@@ -1732,16 +1768,16 @@ void MeshManager::device_update_mesh(Device *device,
 		/* normals */
 		progress.set_status("Updating Mesh", "Computing normals");
 
-		uint *tri_shader = dscene->tri_shader.resize(tri_size);
-		float4 *vnormal = dscene->tri_vnormal.resize(vert_size);
-		uint4 *tri_vindex = dscene->tri_vindex.resize(tri_size);
-		uint *tri_patch = dscene->tri_patch.resize(tri_size);
-		float2 *tri_patch_uv = dscene->tri_patch_uv.resize(vert_size);
+		uint *tri_shader = dscene->tri_shader.alloc(tri_size);
+		float4 *vnormal = dscene->tri_vnormal.alloc(vert_size);
+		uint4 *tri_vindex = dscene->tri_vindex.alloc(tri_size);
+		uint *tri_patch = dscene->tri_patch.alloc(tri_size);
+		float2 *tri_patch_uv = dscene->tri_patch_uv.alloc(vert_size);
 
 		foreach(Mesh *mesh, scene->meshes) {
-			mesh->pack_normals(scene,
-			                   &tri_shader[mesh->tri_offset],
-			                   &vnormal[mesh->vert_offset]);
+			mesh->pack_shaders(scene,
+			                   &tri_shader[mesh->tri_offset]);
+			mesh->pack_normals(&vnormal[mesh->vert_offset]);
 			mesh->pack_verts(tri_prim_index,
 			                 &tri_vindex[mesh->tri_offset],
 			                 &tri_patch[mesh->tri_offset],
@@ -1754,32 +1790,32 @@ void MeshManager::device_update_mesh(Device *device,
 		/* vertex coordinates */
 		progress.set_status("Updating Mesh", "Copying Mesh to device");
 
-		device->tex_alloc("__tri_shader", dscene->tri_shader);
-		device->tex_alloc("__tri_vnormal", dscene->tri_vnormal);
-		device->tex_alloc("__tri_vindex", dscene->tri_vindex);
-		device->tex_alloc("__tri_patch", dscene->tri_patch);
-		device->tex_alloc("__tri_patch_uv", dscene->tri_patch_uv);
+		dscene->tri_shader.copy_to_device();
+		dscene->tri_vnormal.copy_to_device();
+		dscene->tri_vindex.copy_to_device();
+		dscene->tri_patch.copy_to_device();
+		dscene->tri_patch_uv.copy_to_device();
 	}
 
 	if(curve_size != 0) {
 		progress.set_status("Updating Mesh", "Copying Strands to device");
 
-		float4 *curve_keys = dscene->curve_keys.resize(curve_key_size);
-		float4 *curves = dscene->curves.resize(curve_size);
+		float4 *curve_keys = dscene->curve_keys.alloc(curve_key_size);
+		float4 *curves = dscene->curves.alloc(curve_size);
 
 		foreach(Mesh *mesh, scene->meshes) {
 			mesh->pack_curves(scene, &curve_keys[mesh->curvekey_offset], &curves[mesh->curve_offset], mesh->curvekey_offset);
 			if(progress.get_cancel()) return;
 		}
 
-		device->tex_alloc("__curve_keys", dscene->curve_keys);
-		device->tex_alloc("__curves", dscene->curves);
+		dscene->curve_keys.copy_to_device();
+		dscene->curves.copy_to_device();
 	}
 
 	if(patch_size != 0) {
 		progress.set_status("Updating Mesh", "Copying Patches to device");
 
-		uint *patch_data = dscene->patches.resize(patch_size);
+		uint *patch_data = dscene->patches.alloc(patch_size);
 
 		foreach(Mesh *mesh, scene->meshes) {
 			mesh->pack_patches(&patch_data[mesh->patch_offset], mesh->vert_offset, mesh->face_offset, mesh->corner_offset);
@@ -1791,11 +1827,11 @@ void MeshManager::device_update_mesh(Device *device,
 			if(progress.get_cancel()) return;
 		}
 
-		device->tex_alloc("__patches", dscene->patches);
+		dscene->patches.copy_to_device();
 	}
 
 	if(for_displacement) {
-		float4 *prim_tri_verts = dscene->prim_tri_verts.resize(tri_size * 3);
+		float4 *prim_tri_verts = dscene->prim_tri_verts.alloc(tri_size * 3);
 		foreach(Mesh *mesh, scene->meshes) {
 			for(size_t i = 0; i < mesh->num_triangles(); ++i) {
 				Mesh::Triangle t = mesh->get_triangle(i);
@@ -1805,7 +1841,7 @@ void MeshManager::device_update_mesh(Device *device,
 				prim_tri_verts[offset + 2] = float3_to_float4(mesh->verts[t.v[2]]);
 			}
 		}
-		device->tex_alloc("__prim_tri_verts", dscene->prim_tri_verts);
+		dscene->prim_tri_verts.copy_to_device();
 	}
 }
 
@@ -1814,23 +1850,27 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene *
 	/* bvh build */
 	progress.set_status("Updating Scene BVH", "Building");
 
-	VLOG(1) << (scene->params.use_qbvh ? "Using QBVH optimization structure"
-	                                   : "Using regular BVH optimization structure");
-
 	BVHParams bparams;
 	bparams.top_level = true;
-	bparams.use_qbvh = scene->params.use_qbvh;
+	bparams.bvh_layout = BVHParams::best_bvh_layout(
+	        scene->params.bvh_layout,
+	        device->info.bvh_layout_mask);
 	bparams.use_spatial_split = scene->params.use_bvh_spatial_split;
 	bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
 	                              scene->params.use_bvh_unaligned_nodes;
 	bparams.num_motion_triangle_steps = scene->params.num_bvh_time_steps;
 	bparams.num_motion_curve_steps = scene->params.num_bvh_time_steps;
 
-	delete bvh;
-	bvh = BVH::create(bparams, scene->objects);
+	VLOG(1) << "Using " << bvh_layout_name(bparams.bvh_layout)
+	        << " layout.";
+
+	BVH *bvh = BVH::create(bparams, scene->objects);
 	bvh->build(progress);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel()) {
+		delete bvh;
+		return;
+	}
 
 	/* copy to device */
 	progress.set_status("Updating Scene BVH", "Copying BVH to device");
@@ -1838,62 +1878,69 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene *
 	PackedBVH& pack = bvh->pack;
 
 	if(pack.nodes.size()) {
-		dscene->bvh_nodes.reference((float4*)&pack.nodes[0], pack.nodes.size());
-		device->tex_alloc("__bvh_nodes", dscene->bvh_nodes);
+		dscene->bvh_nodes.steal_data(pack.nodes);
+		dscene->bvh_nodes.copy_to_device();
 	}
 	if(pack.leaf_nodes.size()) {
-		dscene->bvh_leaf_nodes.reference((float4*)&pack.leaf_nodes[0], pack.leaf_nodes.size());
-		device->tex_alloc("__bvh_leaf_nodes", dscene->bvh_leaf_nodes);
+		dscene->bvh_leaf_nodes.steal_data(pack.leaf_nodes);
+		dscene->bvh_leaf_nodes.copy_to_device();
 	}
 	if(pack.object_node.size()) {
-		dscene->object_node.reference((uint*)&pack.object_node[0], pack.object_node.size());
-		device->tex_alloc("__object_node", dscene->object_node);
+		dscene->object_node.steal_data(pack.object_node);
+		dscene->object_node.copy_to_device();
 	}
 	if(pack.prim_tri_index.size()) {
-		dscene->prim_tri_index.reference((uint*)&pack.prim_tri_index[0], pack.prim_tri_index.size());
-		device->tex_alloc("__prim_tri_index", dscene->prim_tri_index);
+		dscene->prim_tri_index.steal_data(pack.prim_tri_index);
+		dscene->prim_tri_index.copy_to_device();
 	}
 	if(pack.prim_tri_verts.size()) {
-		dscene->prim_tri_verts.reference((float4*)&pack.prim_tri_verts[0], pack.prim_tri_verts.size());
-		device->tex_alloc("__prim_tri_verts", dscene->prim_tri_verts);
+		dscene->prim_tri_verts.steal_data(pack.prim_tri_verts);
+		dscene->prim_tri_verts.copy_to_device();
 	}
 	if(pack.prim_type.size()) {
-		dscene->prim_type.reference((uint*)&pack.prim_type[0], pack.prim_type.size());
-		device->tex_alloc("__prim_type", dscene->prim_type);
+		dscene->prim_type.steal_data(pack.prim_type);
+		dscene->prim_type.copy_to_device();
 	}
 	if(pack.prim_visibility.size()) {
-		dscene->prim_visibility.reference((uint*)&pack.prim_visibility[0], pack.prim_visibility.size());
-		device->tex_alloc("__prim_visibility", dscene->prim_visibility);
+		dscene->prim_visibility.steal_data(pack.prim_visibility);
+		dscene->prim_visibility.copy_to_device();
 	}
 	if(pack.prim_index.size()) {
-		dscene->prim_index.reference((uint*)&pack.prim_index[0], pack.prim_index.size());
-		device->tex_alloc("__prim_index", dscene->prim_index);
+		dscene->prim_index.steal_data(pack.prim_index);
+		dscene->prim_index.copy_to_device();
 	}
 	if(pack.prim_object.size()) {
-		dscene->prim_object.reference((uint*)&pack.prim_object[0], pack.prim_object.size());
-		device->tex_alloc("__prim_object", dscene->prim_object);
+		dscene->prim_object.steal_data(pack.prim_object);
+		dscene->prim_object.copy_to_device();
 	}
 	if(pack.prim_time.size()) {
-		dscene->prim_time.reference((float2*)&pack.prim_time[0], pack.prim_time.size());
-		device->tex_alloc("__prim_time", dscene->prim_time);
+		dscene->prim_time.steal_data(pack.prim_time);
+		dscene->prim_time.copy_to_device();
 	}
 
 	dscene->data.bvh.root = pack.root_index;
-	dscene->data.bvh.use_qbvh = scene->params.use_qbvh;
+	dscene->data.bvh.bvh_layout = bparams.bvh_layout;
 	dscene->data.bvh.use_bvh_steps = (scene->params.num_bvh_time_steps != 0);
+
+	delete bvh;
 }
 
-void MeshManager::device_update_flags(Device * /*device*/,
-                                      DeviceScene * /*dscene*/,
-                                      Scene * scene,
-                                      Progress& /*progress*/)
+void MeshManager::device_update_preprocess(Device *device,
+                                           Scene *scene,
+                                           Progress& progress)
 {
 	if(!need_update && !need_flags_update) {
 		return;
 	}
-	/* update flags */
+
+	progress.set_status("Updating Meshes Flags");
+
+	/* Update flags. */
+	bool volume_images_updated = false;
+
 	foreach(Mesh *mesh, scene->meshes) {
 		mesh->has_volume = false;
+
 		foreach(const Shader *shader, mesh->used_shaders) {
 			if(shader->has_volume) {
 				mesh->has_volume = true;
@@ -1902,12 +1949,33 @@ void MeshManager::device_update_flags(Device * /*device*/,
 				mesh->has_surface_bssrdf = true;
 			}
 		}
+
+		if(need_update && mesh->has_volume) {
+			/* Create volume meshes if there is voxel data. */
+			bool has_voxel_attributes = false;
+
+			foreach(Attribute& attr, mesh->attributes.attributes) {
+				if(attr.element == ATTR_ELEMENT_VOXEL) {
+					has_voxel_attributes = true;
+				}
+			}
+
+			if(has_voxel_attributes) {
+				if(!volume_images_updated) {
+					progress.set_status("Updating Meshes Volume Bounds");
+					device_update_volume_images(device, scene, progress);
+					volume_images_updated = true;
+				}
+
+				create_volume_mesh(scene, mesh, progress);
+			}
+		}
 	}
+
 	need_flags_update = false;
 }
 
 void MeshManager::device_update_displacement_images(Device *device,
-                                                    DeviceScene *dscene,
                                                     Scene *scene,
                                                     Progress& progress)
 {
@@ -1925,16 +1993,7 @@ void MeshManager::device_update_displacement_images(Device *device,
 					if(node->special_type != SHADER_SPECIAL_TYPE_IMAGE_SLOT) {
 						continue;
 					}
-					if(device->info.pack_images) {
-						/* If device requires packed images we need to update all
-						 * images now, even if they're not used for displacement.
-						 */
-						image_manager->device_update(device,
-						                             dscene,
-						                             scene,
-						                             progress);
-						return;
-					}
+
 					ImageSlotTextureNode *image_node = static_cast<ImageSlotTextureNode*>(node);
 					int slot = image_node->slot;
 					if(slot != -1) {
@@ -1948,7 +2007,6 @@ void MeshManager::device_update_displacement_images(Device *device,
 		pool.push(function_bind(&ImageManager::device_update_slot,
 		                        image_manager,
 		                        device,
-		                        dscene,
 		                        scene,
 		                        slot,
 		                        &progress));
@@ -1956,6 +2014,44 @@ void MeshManager::device_update_displacement_images(Device *device,
 	pool.wait_work();
 }
 
+void MeshManager::device_update_volume_images(Device *device,
+											  Scene *scene,
+											  Progress& progress)
+{
+	progress.set_status("Updating Volume Images");
+	TaskPool pool;
+	ImageManager *image_manager = scene->image_manager;
+	set<int> volume_images;
+
+	foreach(Mesh *mesh, scene->meshes) {
+		if(!mesh->need_update) {
+			continue;
+		}
+
+		foreach(Attribute& attr, mesh->attributes.attributes) {
+			if(attr.element != ATTR_ELEMENT_VOXEL) {
+				continue;
+			}
+
+			VoxelAttribute *voxel = attr.data_voxel();
+
+			if(voxel->slot != -1) {
+				volume_images.insert(voxel->slot);
+			}
+		}
+	}
+
+	foreach(int slot, volume_images) {
+		pool.push(function_bind(&ImageManager::device_update_slot,
+								image_manager,
+								device,
+								scene,
+								slot,
+								&progress));
+	}
+	pool.wait_work();
+}
+
 void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
 	if(!need_update)
@@ -1963,14 +2059,17 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 
 	VLOG(1) << "Total " << scene->meshes.size() << " meshes.";
 
-	/* Update normals. */
+	bool true_displacement_used = false;
+	size_t total_tess_needed = 0;
+
 	foreach(Mesh *mesh, scene->meshes) {
 		foreach(Shader *shader, mesh->used_shaders) {
-			if(shader->need_update_attributes)
+			if(shader->need_update_mesh)
 				mesh->need_update = true;
 		}
 
 		if(mesh->need_update) {
+			/* Update normals. */
 			mesh->add_face_normals();
 			mesh->add_vertex_normals();
 
@@ -1978,60 +2077,56 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 				mesh->add_undisplaced();
 			}
 
+			/* Test if we need tessellation. */
+			if(mesh->subdivision_type != Mesh::SUBDIVISION_NONE &&
+			   mesh->num_subd_verts == 0 &&
+			   mesh->subd_params)
+			{
+				total_tess_needed++;
+			}
+
+			/* Test if we need displacement. */
+			if(mesh->has_true_displacement()) {
+				true_displacement_used = true;
+			}
+
 			if(progress.get_cancel()) return;
 		}
 	}
 
 	/* Tessellate meshes that are using subdivision */
-	size_t total_tess_needed = 0;
-	foreach(Mesh *mesh, scene->meshes) {
-		if(mesh->need_update &&
-		   mesh->subdivision_type != Mesh::SUBDIVISION_NONE &&
-		   mesh->num_subd_verts == 0 &&
-		   mesh->subd_params)
-		{
-			total_tess_needed++;
-		}
-	}
+	if(total_tess_needed) {
+		size_t i = 0;
+		foreach(Mesh *mesh, scene->meshes) {
+			if(mesh->need_update &&
+			   mesh->subdivision_type != Mesh::SUBDIVISION_NONE &&
+			   mesh->num_subd_verts == 0 &&
+			   mesh->subd_params)
+			{
+				string msg = "Tessellating ";
+				if(mesh->name == "")
+					msg += string_printf("%u/%u", (uint)(i+1), (uint)total_tess_needed);
+				else
+					msg += string_printf("%s %u/%u", mesh->name.c_str(), (uint)(i+1), (uint)total_tess_needed);
 
-	size_t i = 0;
-	foreach(Mesh *mesh, scene->meshes) {
-		if(mesh->need_update &&
-		   mesh->subdivision_type != Mesh::SUBDIVISION_NONE &&
-		   mesh->num_subd_verts == 0 &&
-		   mesh->subd_params)
-		{
-			string msg = "Tessellating ";
-			if(mesh->name == "")
-				msg += string_printf("%u/%u", (uint)(i+1), (uint)total_tess_needed);
-			else
-				msg += string_printf("%s %u/%u", mesh->name.c_str(), (uint)(i+1), (uint)total_tess_needed);
+				progress.set_status("Updating Mesh", msg);
 
-			progress.set_status("Updating Mesh", msg);
+				DiagSplit dsplit(*mesh->subd_params);
+				mesh->tessellate(&dsplit);
 
-			DiagSplit dsplit(*mesh->subd_params);
-			mesh->tessellate(&dsplit);
+				i++;
 
-			i++;
+				if(progress.get_cancel()) return;
+			}
 
-			if(progress.get_cancel()) return;
 		}
 	}
 
 	/* Update images needed for true displacement. */
-	bool true_displacement_used = false;
 	bool old_need_object_flags_update = false;
-	foreach(Mesh *mesh, scene->meshes) {
-		if(mesh->need_update &&
-		   mesh->has_true_displacement())
-		{
-			true_displacement_used = true;
-			break;
-		}
-	}
 	if(true_displacement_used) {
 		VLOG(1) << "Updating images used for true displacement.";
-		device_update_displacement_images(device, dscene, scene, progress);
+		device_update_displacement_images(device, scene, progress);
 		old_need_object_flags_update = scene->object_manager->need_flags_update;
 		scene->object_manager->device_update_flags(device,
 		                                           dscene,
@@ -2049,20 +2144,22 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 	}
 	if(progress.get_cancel()) return;
 
-	/* after mesh data has been copied to device memory we need to update
-	 * offsets for patch tables as this can't be known before hand */
-	scene->object_manager->device_update_patch_map_offsets(device, dscene, scene);
-
 	device_update_attributes(device, dscene, scene, progress);
 	if(progress.get_cancel()) return;
 
 	/* Update displacement. */
 	bool displacement_done = false;
+	size_t num_bvh = 0;
+
 	foreach(Mesh *mesh, scene->meshes) {
-		if(mesh->need_update &&
-		   displace(device, dscene, scene, mesh, progress))
-		{
-			displacement_done = true;
+		if(mesh->need_update) {
+			if(displace(device, dscene, scene, mesh, progress)) {
+				displacement_done = true;
+			}
+
+			if(mesh->need_build_bvh()) {
+				num_bvh++;
+			}
 		}
 	}
 
@@ -2077,21 +2174,14 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 		if(progress.get_cancel()) return;
 	}
 
-	/* Update bvh. */
-	size_t num_bvh = 0;
-	foreach(Mesh *mesh, scene->meshes) {
-		if(mesh->need_update && mesh->need_build_bvh()) {
-			num_bvh++;
-		}
-	}
-
 	TaskPool pool;
 
-	i = 0;
+	size_t i = 0;
 	foreach(Mesh *mesh, scene->meshes) {
 		if(mesh->need_update) {
 			pool.push(function_bind(&Mesh::compute_bvh,
 			                        mesh,
+			                        device,
 			                        dscene,
 			                        &scene->params,
 			                        &progress,
@@ -2109,15 +2199,11 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 	        << summary.full_report();
 
 	foreach(Shader *shader, scene->shaders) {
-		shader->need_update_attributes = false;
+		shader->need_update_mesh = false;
 	}
 
-#ifdef __OBJECT_MOTION__
-	Scene::MotionType need_motion = scene->need_motion(device->info.advanced_shading);
+	Scene::MotionType need_motion = scene->need_motion();
 	bool motion_blur = need_motion == Scene::MOTION_BLUR;
-#else
-	bool motion_blur = false;
-#endif
 
 	/* Update objects. */
 	vector<Object *> volume_objects;
@@ -2148,50 +2234,28 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 
 void MeshManager::device_free(Device *device, DeviceScene *dscene)
 {
-	device->tex_free(dscene->bvh_nodes);
-	device->tex_free(dscene->bvh_leaf_nodes);
-	device->tex_free(dscene->object_node);
-	device->tex_free(dscene->prim_tri_verts);
-	device->tex_free(dscene->prim_tri_index);
-	device->tex_free(dscene->prim_type);
-	device->tex_free(dscene->prim_visibility);
-	device->tex_free(dscene->prim_index);
-	device->tex_free(dscene->prim_object);
-	device->tex_free(dscene->prim_time);
-	device->tex_free(dscene->tri_shader);
-	device->tex_free(dscene->tri_vnormal);
-	device->tex_free(dscene->tri_vindex);
-	device->tex_free(dscene->tri_patch);
-	device->tex_free(dscene->tri_patch_uv);
-	device->tex_free(dscene->curves);
-	device->tex_free(dscene->curve_keys);
-	device->tex_free(dscene->patches);
-	device->tex_free(dscene->attributes_map);
-	device->tex_free(dscene->attributes_float);
-	device->tex_free(dscene->attributes_float3);
-	device->tex_free(dscene->attributes_uchar4);
-
-	dscene->bvh_nodes.clear();
-	dscene->object_node.clear();
-	dscene->prim_tri_verts.clear();
-	dscene->prim_tri_index.clear();
-	dscene->prim_type.clear();
-	dscene->prim_visibility.clear();
-	dscene->prim_index.clear();
-	dscene->prim_object.clear();
-	dscene->prim_time.clear();
-	dscene->tri_shader.clear();
-	dscene->tri_vnormal.clear();
-	dscene->tri_vindex.clear();
-	dscene->tri_patch.clear();
-	dscene->tri_patch_uv.clear();
-	dscene->curves.clear();
-	dscene->curve_keys.clear();
-	dscene->patches.clear();
-	dscene->attributes_map.clear();
-	dscene->attributes_float.clear();
-	dscene->attributes_float3.clear();
-	dscene->attributes_uchar4.clear();
+	dscene->bvh_nodes.free();
+	dscene->bvh_leaf_nodes.free();
+	dscene->object_node.free();
+	dscene->prim_tri_verts.free();
+	dscene->prim_tri_index.free();
+	dscene->prim_type.free();
+	dscene->prim_visibility.free();
+	dscene->prim_index.free();
+	dscene->prim_object.free();
+	dscene->prim_time.free();
+	dscene->tri_shader.free();
+	dscene->tri_vnormal.free();
+	dscene->tri_vindex.free();
+	dscene->tri_patch.free();
+	dscene->tri_patch_uv.free();
+	dscene->curves.free();
+	dscene->curve_keys.free();
+	dscene->patches.free();
+	dscene->attributes_map.free();
+	dscene->attributes_float.free();
+	dscene->attributes_float3.free();
+	dscene->attributes_uchar4.free();
 
 #ifdef WITH_OSL
 	OSLGlobals *og = (OSLGlobals*)device->osl_memory();
@@ -2201,6 +2265,8 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene)
 		og->attribute_map.clear();
 		og->object_names.clear();
 	}
+#else
+	(void)device;
 #endif
 }
 
diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h
index 5f33e30eac2..e370f8a2021 100644
--- a/intern/cycles/render/mesh.h
+++ b/intern/cycles/render/mesh.h
@@ -17,17 +17,18 @@
 #ifndef __MESH_H__
 #define __MESH_H__
 
-#include "attribute.h"
-#include "node.h"
-#include "shader.h"
-
-#include "util_boundbox.h"
-#include "util_list.h"
-#include "util_map.h"
-#include "util_param.h"
-#include "util_transform.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "graph/node.h"
+
+#include "render/attribute.h"
+#include "render/shader.h"
+
+#include "util/util_boundbox.h"
+#include "util/util_list.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_transform.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -48,7 +49,7 @@ struct PackedPatchTable;
 
 class Mesh : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	/* Mesh Triangle */
 	struct Triangle {
@@ -69,6 +70,10 @@ public:
 		                    size_t num_steps,
 		                    size_t step,
 		                    float3 r_verts[3]) const;
+
+		float3 compute_normal(const float3 *verts) const;
+
+		bool valid(const float3 *verts) const;
 	};
 
 	Triangle get_triangle(size_t i) const
@@ -197,7 +202,8 @@ public:
 	array<int> triangle_patch; /* must be < 0 for non subd triangles */
 	array<float2> vert_patch_uv;
 
-	bool has_volume;  /* Set in the device_update_flags(). */
+	float volume_isovalue;
+	bool has_volume;          /* Set in the device_update_preprocess(). */
 	bool has_surface_bssrdf;  /* Set in the device_update_flags(). */
 
 	array<float3> curve_keys;
@@ -245,6 +251,8 @@ public:
 	size_t face_offset;
 	size_t corner_offset;
 
+	size_t attr_map_offset;
+
 	size_t num_subd_verts;
 
 	/* Functions */
@@ -257,7 +265,7 @@ public:
 	void reserve_curves(int numcurves, int numkeys);
 	void resize_subd_faces(int numfaces, int num_ngons, int numcorners);
 	void reserve_subd_faces(int numfaces, int num_ngons, int numcorners);
-	void clear();
+	void clear(bool preserve_voxel_data = false);
 	void add_vertex(float3 P);
 	void add_vertex_slow(float3 P);
 	void add_triangle(int v0, int v1, int v2, int shader, bool smooth);
@@ -271,7 +279,8 @@ public:
 	void add_vertex_normals();
 	void add_undisplaced();
 
-	void pack_normals(Scene *scene, uint *shader, float4 *vnormal);
+	void pack_shaders(Scene *scene, uint *shader);
+	void pack_normals(float4 *vnormal);
 	void pack_verts(const vector<uint>& tri_prim_index,
 	                uint4 *tri_vindex,
 	                uint *tri_patch,
@@ -281,7 +290,8 @@ public:
 	void pack_curves(Scene *scene, float4 *curve_key_co, float4 *curve_data, size_t curvekey_offset);
 	void pack_patches(uint *patch_data, uint vert_offset, uint face_offset, uint corner_offset);
 
-	void compute_bvh(DeviceScene *dscene,
+	void compute_bvh(Device *device,
+	                 DeviceScene *dscene,
 	                 SceneParams *params,
 	                 Progress *progress,
 	                 int n,
@@ -295,6 +305,11 @@ public:
 	bool has_motion_blur() const;
 	bool has_true_displacement() const;
 
+	/* Convert between normalized -1..1 motion time and index
+	 * in the VERTEX_MOTION attribute. */
+	float motion_time(int step) const;
+	int motion_step(float time) const;
+
 	/* Check whether the mesh should have own BVH built separately. Briefly,
 	 * own BVH is needed for mesh, if:
 	 *
@@ -315,8 +330,6 @@ public:
 
 class MeshManager {
 public:
-	BVH *bvh;
-
 	bool need_update;
 	bool need_flags_update;
 
@@ -329,13 +342,15 @@ public:
 	void update_osl_attributes(Device *device, Scene *scene, vector<AttributeRequestSet>& mesh_attributes);
 	void update_svm_attributes(Device *device, DeviceScene *dscene, Scene *scene, vector<AttributeRequestSet>& mesh_attributes);
 
+	void device_update_preprocess(Device *device, Scene *scene, Progress& progress);
 	void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
-	void device_update_flags(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
 
 	void device_free(Device *device, DeviceScene *dscene);
 
 	void tag_update(Scene *scene);
 
+	void create_volume_mesh(Scene *scene, Mesh *mesh, Progress &progress);
+
 protected:
 	/* Calculate verts/triangles/curves offsets in global arrays. */
 	void mesh_calc_offset(Scene *scene);
@@ -362,9 +377,12 @@ protected:
 	                       Progress& progress);
 
 	void device_update_displacement_images(Device *device,
-	                                       DeviceScene *dscene,
 	                                       Scene *scene,
 	                                       Progress& progress);
+
+	void device_update_volume_images(Device *device,
+									 Scene *scene,
+									 Progress& progress);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp
index adc5b820298..ad2a5713bcb 100644
--- a/intern/cycles/render/mesh_displace.cpp
+++ b/intern/cycles/render/mesh_displace.cpp
@@ -14,15 +14,15 @@
  * limitations under the License.
  */
 
-#include "device.h"
+#include "device/device.h"
 
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "shader.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/shader.h"
 
-#include "util_foreach.h"
-#include "util_progress.h"
+#include "util/util_foreach.h"
+#include "util/util_progress.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -64,8 +64,8 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
 	/* setup input for device task */
 	const size_t num_verts = mesh->verts.size();
 	vector<bool> done(num_verts, false);
-	device_vector<uint4> d_input;
-	uint4 *d_input_data = d_input.resize(num_verts);
+	device_vector<uint4> d_input(device, "displace_input", MEM_READ_ONLY);
+	uint4 *d_input_data = d_input.alloc(num_verts);
 	size_t d_input_size = 0;
 
 	size_t num_triangles = mesh->num_triangles();
@@ -115,16 +115,14 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
 		return false;
 	
 	/* run device task */
-	device_vector<float4> d_output;
-	d_output.resize(d_input_size);
+	device_vector<float4> d_output(device, "displace_output", MEM_READ_WRITE);
+	d_output.alloc(d_input_size);
+	d_output.zero_to_device();
+	d_input.copy_to_device();
 
 	/* needs to be up to data for attribute access */
 	device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
 
-	device->mem_alloc(d_input, MEM_READ_ONLY);
-	device->mem_copy_to(d_input);
-	device->mem_alloc(d_output, MEM_WRITE_ONLY);
-
 	DeviceTask task(DeviceTask::SHADER);
 	task.shader_input = d_input.device_pointer;
 	task.shader_output = d_output.device_pointer;
@@ -138,21 +136,20 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
 	device->task_wait();
 
 	if(progress.get_cancel()) {
-		device->mem_free(d_input);
-		device->mem_free(d_output);
+		d_input.free();
+		d_output.free();
 		return false;
 	}
 
-	device->mem_copy_from(d_output, 0, 1, d_output.size(), sizeof(float4));
-	device->mem_free(d_input);
-	device->mem_free(d_output);
+	d_output.copy_from_device(0, 1, d_output.size());
+	d_input.free();
 
 	/* read result */
 	done.clear();
 	done.resize(num_verts, false);
 	int k = 0;
 
-	float4 *offset = (float4*)d_output.data_pointer;
+	float4 *offset = d_output.data();
 
 	Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
 	for(size_t i = 0; i < num_triangles; i++) {
@@ -169,6 +166,8 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
 			if(!done[t.v[j]]) {
 				done[t.v[j]] = true;
 				float3 off = float4_to_float3(offset[k++]);
+				/* Avoid illegal vertex coordinates. */
+				off = ensure_finite3(off);
 				mesh->verts[t.v[j]] += off;
 				if(attr_mP != NULL) {
 					for(int step = 0; step < mesh->motion_steps - 1; step++) {
@@ -180,6 +179,8 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
 		}
 	}
 
+	d_output.free();
+
 	/* for displacement method both, we only need to recompute the face
 	 * normals, as bump mapping in the shader will already alter the
 	 * vertex normal, so we start from the non-displaced vertex normals
diff --git a/intern/cycles/render/mesh_subdivision.cpp b/intern/cycles/render/mesh_subdivision.cpp
index 57c76a9f1c8..9dd81eb6700 100644
--- a/intern/cycles/render/mesh_subdivision.cpp
+++ b/intern/cycles/render/mesh_subdivision.cpp
@@ -14,16 +14,16 @@
  * limitations under the License.
  */
 
-#include "mesh.h"
-#include "attribute.h"
-#include "camera.h"
+#include "render/mesh.h"
+#include "render/attribute.h"
+#include "render/camera.h"
 
-#include "subd_split.h"
-#include "subd_patch.h"
-#include "subd_patch_table.h"
+#include "subd/subd_split.h"
+#include "subd/subd_patch.h"
+#include "subd/subd_patch_table.h"
 
-#include "util_foreach.h"
-#include "util_algorithm.h"
+#include "util/util_foreach.h"
+#include "util/util_algorithm.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -204,7 +204,9 @@ public:
 			src = dest;
 		}
 
-		patch_table->ComputeLocalPointValues(&verts[0], &verts[num_refiner_verts]);
+		if(num_local_points) {
+			patch_table->ComputeLocalPointValues(&verts[0], &verts[num_refiner_verts]);
+		}
 
 		/* create patch map */
 		patch_map = new Far::PatchMap(*patch_table);
@@ -236,13 +238,15 @@ public:
 				src = dest;
 			}
 
-			if(attr.same_storage(attr.type, TypeDesc::TypeFloat)) {
-				patch_table->ComputeLocalPointValues((OsdValue<float>*)&attr.buffer[0],
-					                                 (OsdValue<float>*)&attr.buffer[num_refiner_verts * attr.data_sizeof()]);
-			}
-			else {
-				patch_table->ComputeLocalPointValues((OsdValue<float4>*)&attr.buffer[0],
-					                                 (OsdValue<float4>*)&attr.buffer[num_refiner_verts * attr.data_sizeof()]);
+			if(num_local_points) {
+				if(attr.same_storage(attr.type, TypeDesc::TypeFloat)) {
+					patch_table->ComputeLocalPointValues((OsdValue<float>*)&attr.buffer[0],
+							                             (OsdValue<float>*)&attr.buffer[num_refiner_verts * attr.data_sizeof()]);
+				}
+				else {
+					patch_table->ComputeLocalPointValues((OsdValue<float4>*)&attr.buffer[0],
+							                             (OsdValue<float4>*)&attr.buffer[num_refiner_verts * attr.data_sizeof()]);
+				}
 			}
 		}
 		else if(attr.element == ATTR_ELEMENT_CORNER || attr.element == ATTR_ELEMENT_CORNER_BYTE) {
diff --git a/intern/cycles/render/mesh_volume.cpp b/intern/cycles/render/mesh_volume.cpp
new file mode 100644
index 00000000000..d1c49b456ff
--- /dev/null
+++ b/intern/cycles/render/mesh_volume.cpp
@@ -0,0 +1,546 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/mesh.h"
+#include "render/attribute.h"
+#include "render/scene.h"
+
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+static size_t compute_voxel_index(const int3 &resolution, size_t x, size_t y, size_t z)
+{
+	if(x == -1 || x >= resolution.x) {
+		return -1;
+	}
+
+	if(y == -1 || y >= resolution.y) {
+		return -1;
+	}
+
+	if(z == -1 || z >= resolution.z) {
+		return -1;
+	}
+
+	return x + y*resolution.x + z*resolution.x*resolution.y;
+}
+
+struct QuadData {
+	int v0, v1, v2, v3;
+
+	float3 normal;
+};
+
+enum {
+	QUAD_X_MIN = 0,
+	QUAD_X_MAX = 1,
+	QUAD_Y_MIN = 2,
+	QUAD_Y_MAX = 3,
+	QUAD_Z_MIN = 4,
+	QUAD_Z_MAX = 5,
+};
+
+const int quads_indices[6][4] = {
+	/* QUAD_X_MIN */
+	{ 4, 0, 3, 7 },
+	/* QUAD_X_MAX */
+	{ 1, 5, 6, 2 },
+	/* QUAD_Y_MIN */
+	{ 4, 5, 1, 0 },
+	/* QUAD_Y_MAX */
+	{ 3, 2, 6, 7 },
+	/* QUAD_Z_MIN */
+	{ 0, 1, 2, 3 },
+	/* QUAD_Z_MAX */
+	{ 5, 4, 7, 6 },
+};
+
+const float3 quads_normals[6] = {
+	/* QUAD_X_MIN */
+	make_float3(-1.0f, 0.0f, 0.0f),
+	/* QUAD_X_MAX */
+	make_float3(1.0f, 0.0f, 0.0f),
+	/* QUAD_Y_MIN */
+	make_float3(0.0f, -1.0f, 0.0f),
+	/* QUAD_Y_MAX */
+	make_float3(0.0f, 1.0f, 0.0f),
+	/* QUAD_Z_MIN */
+	make_float3(0.0f, 0.0f, -1.0f),
+	/* QUAD_Z_MAX */
+	make_float3(0.0f, 0.0f, 1.0f),
+};
+
+static void create_quad(int3 corners[8], vector<int3> &vertices, vector<QuadData> &quads, int face_index)
+{
+	size_t vertex_offset = vertices.size();
+
+	QuadData quad;
+	quad.v0 = vertex_offset + 0;
+	quad.v1 = vertex_offset + 1;
+	quad.v2 = vertex_offset + 2;
+	quad.v3 = vertex_offset + 3;
+	quad.normal = quads_normals[face_index];
+
+	quads.push_back(quad);
+
+	vertices.push_back(corners[quads_indices[face_index][0]]);
+	vertices.push_back(corners[quads_indices[face_index][1]]);
+	vertices.push_back(corners[quads_indices[face_index][2]]);
+	vertices.push_back(corners[quads_indices[face_index][3]]);
+}
+
+struct VolumeParams {
+	int3 resolution;
+	float3 cell_size;
+	float3 start_point;
+	int pad_size;
+};
+
+static const int CUBE_SIZE = 8;
+
+/* Create a mesh from a volume.
+ *
+ * The way the algorithm works is as follows:
+ *
+ * - the coordinates of active voxels from a dense volume (or 3d image) are
+ * gathered inside an auxiliary volume.
+ * - each set of coordinates of a CUBE_SIZE cube is mapped to the same
+ * coordinate of the auxiliary volume.
+ * - quads are created between active and non-active voxels in the auxiliary
+ * volume to generate a tight mesh around the volume.
+ */
+class VolumeMeshBuilder {
+	/* Auxiliary volume that is used to check if a node was already added. */
+	vector<char> grid;
+
+	/* The resolution of the auxiliary volume, set to be equal to 1/CUBE_SIZE
+	 * of the original volume on each axis. */
+	int3 res;
+
+	size_t number_of_nodes;
+
+	/* Offset due to padding in the original grid. Padding will transform the
+	 * coordinates of the original grid from 0...res to -padding...res+padding,
+	 * so some coordinates are negative, and we need to properly account for
+	 * them. */
+	int3 pad_offset;
+
+	VolumeParams *params;
+
+public:
+	VolumeMeshBuilder(VolumeParams *volume_params);
+
+	void add_node(int x, int y, int z);
+
+	void add_node_with_padding(int x, int y, int z);
+
+	void create_mesh(vector<float3> &vertices,
+	                 vector<int> &indices,
+	                 vector<float3> &face_normals);
+
+private:
+	void generate_vertices_and_quads(vector<int3> &vertices_is,
+	                                 vector<QuadData> &quads);
+
+	void deduplicate_vertices(vector<int3> &vertices,
+	                          vector<QuadData> &quads);
+
+	void convert_object_space(const vector<int3> &vertices,
+	                          vector<float3> &out_vertices);
+
+	void convert_quads_to_tris(const vector<QuadData> &quads,
+	                           vector<int> &tris,
+	                           vector<float3> &face_normals);
+};
+
+VolumeMeshBuilder::VolumeMeshBuilder(VolumeParams *volume_params)
+{
+	params = volume_params;
+	number_of_nodes = 0;
+
+	const size_t x = divide_up(params->resolution.x, CUBE_SIZE);
+	const size_t y = divide_up(params->resolution.y, CUBE_SIZE);
+	const size_t z = divide_up(params->resolution.z, CUBE_SIZE);
+
+	/* Adding 2*pad_size since we pad in both positive and negative directions
+	 * along the axis. */
+	const size_t px = divide_up(params->resolution.x + 2*params->pad_size, CUBE_SIZE);
+	const size_t py = divide_up(params->resolution.y + 2*params->pad_size, CUBE_SIZE);
+	const size_t pz = divide_up(params->resolution.z + 2*params->pad_size, CUBE_SIZE);
+
+	res = make_int3(px, py, pz);
+	pad_offset = make_int3(px - x, py - y, pz - z);
+
+	grid.resize(px*py*pz, 0);
+}
+
+void VolumeMeshBuilder::add_node(int x, int y, int z)
+{
+	/* Activate the auxiliary grid node that contains voxel (x, y, z). */
+	const int index_x = (x/CUBE_SIZE) + pad_offset.x;
+	const int index_y = (y/CUBE_SIZE) + pad_offset.y;
+	const int index_z = (z/CUBE_SIZE) + pad_offset.z;
+
+	assert((index_x >= 0) && (index_y >= 0) && (index_z >= 0));
+
+	const size_t index = compute_voxel_index(res, index_x, index_y, index_z);
+
+	/* Padded coordinates can land outside the grid, compute_voxel_index()
+	 * then returns -1. Skip those, and nodes which are already marked. */
+	if(index == (size_t)-1 || grid[index] == 1) {
+		return;
+	}
+
+	++number_of_nodes;
+	grid[index] = 1;
+}
+
+void VolumeMeshBuilder::add_node_with_padding(int x, int y, int z)
+{
+	for(int px = x - params->pad_size; px < x + params->pad_size; ++px) {
+		for(int py = y - params->pad_size; py < y + params->pad_size; ++py) {
+			for(int pz = z - params->pad_size; pz < z + params->pad_size; ++pz) {
+				add_node(px, py, pz);
+			}
+		}
+	}
+}
+
+void VolumeMeshBuilder::create_mesh(vector<float3> &vertices,
+                                    vector<int> &indices,
+                                    vector<float3> &face_normals)
+{
+	/* We create vertices in index space (is), and only convert them to object
+	 * space when done. */
+	vector<int3> vertices_is;
+	vector<QuadData> quads;
+
+	generate_vertices_and_quads(vertices_is, quads);
+
+	deduplicate_vertices(vertices_is, quads);
+
+	convert_object_space(vertices_is, vertices);
+
+	convert_quads_to_tris(quads, indices, face_normals);
+}
+
+void VolumeMeshBuilder::generate_vertices_and_quads(
+		vector<ccl::int3> &vertices_is,
+		vector<QuadData> &quads)
+{
+	/* Overallocation, we could count the number of quads and vertices to create
+	 * in a pre-pass if memory becomes an issue. */
+	vertices_is.reserve(number_of_nodes*8);
+	quads.reserve(number_of_nodes*6);
+
+	for(int z = 0; z < res.z; ++z) {
+		for(int y = 0; y < res.y; ++y) {
+			for(int x = 0; x < res.x; ++x) {
+				size_t voxel_index = compute_voxel_index(res, x, y, z);
+				if(grid[voxel_index] == 0) {
+					continue;
+				}
+
+				/* Compute min and max coords of the node in index space. */
+				int3 min = make_int3((x - pad_offset.x)*CUBE_SIZE,
+				                     (y - pad_offset.y)*CUBE_SIZE,
+				                     (z - pad_offset.z)*CUBE_SIZE);
+
+				/* Maximum is just CUBE_SIZE voxels away from minimum on each axis. */
+				int3 max = make_int3(min.x + CUBE_SIZE, min.y + CUBE_SIZE, min.z + CUBE_SIZE);
+
+				int3 corners[8] = {
+					make_int3(min[0], min[1], min[2]),
+					make_int3(max[0], min[1], min[2]),
+					make_int3(max[0], max[1], min[2]),
+					make_int3(min[0], max[1], min[2]),
+					make_int3(min[0], min[1], max[2]),
+					make_int3(max[0], min[1], max[2]),
+					make_int3(max[0], max[1], max[2]),
+					make_int3(min[0], max[1], max[2]),
+				};
+
+				/* Only create a quad if on the border between an active and
+				 * an inactive node.
+				 */
+
+				voxel_index = compute_voxel_index(res, x - 1, y, z);
+				if(voxel_index == -1 || grid[voxel_index] == 0) {
+					create_quad(corners, vertices_is, quads, QUAD_X_MIN);
+				}
+
+				voxel_index = compute_voxel_index(res, x + 1, y, z);
+				if(voxel_index == -1 || grid[voxel_index] == 0) {
+					create_quad(corners, vertices_is, quads, QUAD_X_MAX);
+				}
+
+				voxel_index = compute_voxel_index(res, x, y - 1, z);
+				if(voxel_index == -1 || grid[voxel_index] == 0) {
+					create_quad(corners, vertices_is, quads, QUAD_Y_MIN);
+				}
+
+				voxel_index = compute_voxel_index(res, x, y + 1, z);
+				if(voxel_index == -1 || grid[voxel_index] == 0) {
+					create_quad(corners, vertices_is, quads, QUAD_Y_MAX);
+				}
+
+				voxel_index = compute_voxel_index(res, x, y, z - 1);
+				if(voxel_index == -1 || grid[voxel_index] == 0) {
+					create_quad(corners, vertices_is, quads, QUAD_Z_MIN);
+				}
+
+				voxel_index = compute_voxel_index(res, x, y, z + 1);
+				if(voxel_index == -1 || grid[voxel_index] == 0) {
+					create_quad(corners, vertices_is, quads, QUAD_Z_MAX);
+				}
+			}
+		}
+	}
+}
+
+void VolumeMeshBuilder::deduplicate_vertices(vector<int3> &vertices,
+                                             vector<QuadData> &quads)
+{
+	vector<int3> sorted_vertices = vertices;
+	std::sort(sorted_vertices.begin(), sorted_vertices.end());
+	vector<int3>::iterator it = std::unique(sorted_vertices.begin(), sorted_vertices.end());
+	sorted_vertices.resize(std::distance(sorted_vertices.begin(), it));
+
+	vector<QuadData> new_quads = quads;
+
+	for(size_t i = 0; i < vertices.size(); ++i) {
+		for(size_t j = 0; j < sorted_vertices.size(); ++j) {
+			if(vertices[i] != sorted_vertices[j]) {
+				continue;
+			}
+
+			for(int k = 0; k < quads.size(); ++k) {
+				if(quads[k].v0 == i) {
+					new_quads[k].v0 = j;
+				}
+				else if(quads[k].v1 == i) {
+					new_quads[k].v1 = j;
+				}
+				else if(quads[k].v2 == i) {
+					new_quads[k].v2 = j;
+				}
+				else if(quads[k].v3 == i) {
+					new_quads[k].v3 = j;
+				}
+			}
+
+			break;
+		}
+	}
+
+	vertices = sorted_vertices;
+	quads = new_quads;
+}
+
+void VolumeMeshBuilder::convert_object_space(const vector<int3> &vertices,
+	                                         vector<float3> &out_vertices)
+{
+	out_vertices.reserve(vertices.size());
+
+	for(size_t i = 0; i < vertices.size(); ++i) {
+		float3 vertex = make_float3(vertices[i].x, vertices[i].y, vertices[i].z);
+		vertex *= params->cell_size;
+		vertex += params->start_point;
+
+		out_vertices.push_back(vertex);
+	}
+}
+
+void VolumeMeshBuilder::convert_quads_to_tris(const vector<QuadData> &quads,
+                                              vector<int> &tris,
+                                              vector<float3> &face_normals)
+{
+	int index_offset = 0;
+	tris.resize(quads.size()*6);
+	face_normals.reserve(quads.size()*2);
+
+	for(size_t i = 0; i < quads.size(); ++i) {
+		tris[index_offset++] = quads[i].v0;
+		tris[index_offset++] = quads[i].v2;
+		tris[index_offset++] = quads[i].v1;
+
+		face_normals.push_back(quads[i].normal);
+
+		tris[index_offset++] = quads[i].v0;
+		tris[index_offset++] = quads[i].v3;
+		tris[index_offset++] = quads[i].v2;
+
+		face_normals.push_back(quads[i].normal);
+	}
+}
+
+/* ************************************************************************** */
+
+struct VoxelAttributeGrid {
+	float *data;
+	int channels;
+};
+
+void MeshManager::create_volume_mesh(Scene *scene,
+                                     Mesh *mesh,
+                                     Progress& progress)
+{
+	string msg = string_printf("Computing Volume Mesh %s", mesh->name.c_str());
+	progress.set_status("Updating Mesh", msg);
+
+	vector<VoxelAttributeGrid> voxel_grids;
+
+	/* Compute volume parameters. */
+	VolumeParams volume_params;
+	volume_params.resolution = make_int3(0, 0, 0);
+
+	foreach(Attribute& attr, mesh->attributes.attributes) {
+		if(attr.element != ATTR_ELEMENT_VOXEL) {
+			continue;
+		}
+
+		VoxelAttribute *voxel = attr.data_voxel();
+		device_memory *image_memory = scene->image_manager->image_memory(voxel->slot);
+		int3 resolution = make_int3(image_memory->data_width,
+                                    image_memory->data_height,
+		                            image_memory->data_depth);
+
+		if(volume_params.resolution == make_int3(0, 0, 0)) {
+			volume_params.resolution = resolution;
+		}
+		else if(volume_params.resolution != resolution) {
+			VLOG(1) << "Can't create volume mesh, all voxel grid resolutions must be equal\n";
+			return;
+		}
+
+		VoxelAttributeGrid voxel_grid;
+		voxel_grid.data = static_cast<float*>(image_memory->host_pointer);
+		voxel_grid.channels = image_memory->data_elements;
+		voxel_grids.push_back(voxel_grid);
+	}
+
+	if(voxel_grids.empty()) {
+		return;
+	}
+
+	/* Compute padding. */
+	Shader *volume_shader = NULL;
+	int pad_size = 0;
+
+	foreach(Shader *shader, mesh->used_shaders) {
+		if(!shader->has_volume) {
+			continue;
+		}
+
+		volume_shader = shader;
+
+		if(shader->volume_interpolation_method == VOLUME_INTERPOLATION_LINEAR) {
+			pad_size = max(1, pad_size);
+		}
+		else if(shader->volume_interpolation_method == VOLUME_INTERPOLATION_CUBIC) {
+			pad_size = max(2, pad_size);
+		}
+
+		break;
+	}
+
+	if(!volume_shader) {
+		return;
+	}
+
+	/* Compute start point and cell size from transform. */
+	Attribute *attr = mesh->attributes.find(ATTR_STD_GENERATED_TRANSFORM);
+	const int3 resolution = volume_params.resolution;
+	float3 start_point = make_float3(0.0f, 0.0f, 0.0f);
+	float3 cell_size = make_float3(1.0f/resolution.x,
+	                               1.0f/resolution.y,
+	                               1.0f/resolution.z);
+
+	if(attr) {
+		const Transform *tfm = attr->data_transform();
+		const Transform itfm = transform_inverse(*tfm);
+		start_point = transform_point(&itfm, start_point);
+		cell_size = transform_direction(&itfm, cell_size);
+	}
+
+	volume_params.start_point = start_point;
+	volume_params.cell_size = cell_size;
+	volume_params.pad_size = pad_size;
+
+	/* Build bounding mesh around non-empty volume cells. */
+	VolumeMeshBuilder builder(&volume_params);
+	const float isovalue = mesh->volume_isovalue;
+
+	for(int z = 0; z < resolution.z; ++z) {
+		for(int y = 0; y < resolution.y; ++y) {
+			for(int x = 0; x < resolution.x; ++x) {
+				size_t voxel_index = compute_voxel_index(resolution, x, y, z);
+
+				for(size_t i = 0; i < voxel_grids.size(); ++i) {
+					const VoxelAttributeGrid &voxel_grid = voxel_grids[i];
+					const int channels = voxel_grid.channels;
+
+					for(int c = 0; c < channels; c++) {
+						if(voxel_grid.data[voxel_index * channels + c] >= isovalue) {
+							builder.add_node_with_padding(x, y, z);
+							break;
+						}
+					}
+				}
+			}
+		}
+	}
+
+	/* Create mesh. */
+	vector<float3> vertices;
+	vector<int> indices;
+	vector<float3> face_normals;
+	builder.create_mesh(vertices, indices, face_normals);
+
+	mesh->clear(true);
+	mesh->reserve_mesh(vertices.size(), indices.size()/3);
+	mesh->used_shaders.push_back(volume_shader);
+
+	for(size_t i = 0; i < vertices.size(); ++i) {
+		mesh->add_vertex(vertices[i]);
+	}
+
+	for(size_t i = 0; i < indices.size(); i += 3) {
+		mesh->add_triangle(indices[i], indices[i + 1], indices[i + 2], 0, false);
+	}
+
+	Attribute *attr_fN = mesh->attributes.add(ATTR_STD_FACE_NORMAL);
+	float3 *fN = attr_fN->data_float3();
+
+	for(size_t i = 0; i < face_normals.size(); ++i) {
+		fN[i] = face_normals[i];
+	}
+
+	/* Print stats. */
+	VLOG(1) << "Memory usage volume mesh: "
+	        << ((vertices.size() + face_normals.size())*sizeof(float3) + indices.size()*sizeof(int))/(1024.0*1024.0)
+	        << "Mb.";
+
+	VLOG(1) << "Memory usage volume grid: "
+	        << (resolution.x*resolution.y*resolution.z*sizeof(float))/(1024.0*1024.0)
+	        << "Mb.";
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index 13b149eddfa..01399c85bc0 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -14,20 +14,21 @@
  * limitations under the License.
  */
 
-#include "image.h"
-#include "integrator.h"
-#include "nodes.h"
-#include "scene.h"
-#include "svm.h"
-#include "svm_color_util.h"
-#include "svm_ramp_util.h"
-#include "svm_math_util.h"
-#include "osl.h"
-#include "constant_fold.h"
-
-#include "util_sky_model.h"
-#include "util_foreach.h"
-#include "util_transform.h"
+#include "render/image.h"
+#include "render/integrator.h"
+#include "render/nodes.h"
+#include "render/scene.h"
+#include "render/svm.h"
+#include "kernel/svm/svm_color_util.h"
+#include "kernel/svm/svm_ramp_util.h"
+#include "kernel/svm/svm_math_util.h"
+#include "render/osl.h"
+#include "render/constant_fold.h"
+
+#include "util/util_sky_model.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_transform.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -116,8 +117,7 @@ Transform TextureMapping::compute_transform()
 		case NORMAL:
 			/* no translation for normals, and inverse transpose */
 			mat = rmat*smat;
-			mat = transform_inverse(mat);
-			mat = transform_transpose(mat);
+			mat = transform_transposed_inverse(mat);
 			break;
 	}
 
@@ -152,7 +152,6 @@ void TextureMapping::compile(SVMCompiler& compiler, int offset_in, int offset_ou
 	compiler.add_node(tfm.x);
 	compiler.add_node(tfm.y);
 	compiler.add_node(tfm.z);
-	compiler.add_node(tfm.w);
 
 	if(use_minmax) {
 		compiler.add_node(NODE_MIN_MAX, offset_out, offset_out);
@@ -192,9 +191,7 @@ void TextureMapping::compile_end(SVMCompiler& compiler, ShaderInput *vector_in,
 void TextureMapping::compile(OSLCompiler &compiler)
 {
 	if(!skip()) {
-		Transform tfm = transform_transpose(compute_transform());
-
-		compiler.parameter("mapping", tfm);
+		compiler.parameter("mapping", compute_transform());
 		compiler.parameter("use_mapping", 1);
 	}
 }
@@ -207,7 +204,7 @@ NODE_DEFINE(ImageTextureNode)
 
 	TEXTURE_MAPPING_DEFINE(ImageTextureNode);
 
-	SOCKET_STRING(filename, "Filename", ustring(""));
+	SOCKET_STRING(filename, "Filename", ustring());
 
 	static NodeEnum color_space_enum;
 	color_space_enum.insert("none", NODE_COLOR_SPACE_NONE);
@@ -301,17 +298,17 @@ void ImageTextureNode::compile(SVMCompiler& compiler)
 
 	image_manager = compiler.image_manager;
 	if(is_float == -1) {
-		bool is_float_bool;
+		ImageMetaData metadata;
 		slot = image_manager->add_image(filename.string(),
 		                                builtin_data,
 		                                animated,
 		                                0,
-		                                is_float_bool,
-		                                is_linear,
 		                                interpolation,
 		                                extension,
-		                                use_alpha);
-		is_float = (int)is_float_bool;
+		                                use_alpha,
+		                                metadata);
+		is_float = metadata.is_float;
+		is_linear = metadata.is_linear;
 	}
 
 	if(slot != -1) {
@@ -362,25 +359,22 @@ void ImageTextureNode::compile(OSLCompiler& compiler)
 
 	image_manager = compiler.image_manager;
 	if(is_float == -1) {
+		ImageMetaData metadata;
 		if(builtin_data == NULL) {
-			ImageManager::ImageDataType type;
-			type = image_manager->get_image_metadata(filename.string(), NULL, is_linear);
-			if(type == ImageManager::IMAGE_DATA_TYPE_FLOAT || type == ImageManager::IMAGE_DATA_TYPE_FLOAT4)
-				is_float = 1;
+			image_manager->get_image_metadata(filename.string(), NULL, metadata);
 		}
 		else {
-			bool is_float_bool;
 			slot = image_manager->add_image(filename.string(),
 			                                builtin_data,
 			                                animated,
 			                                0,
-			                                is_float_bool,
-			                                is_linear,
 			                                interpolation,
 			                                extension,
-			                                use_alpha);
-			is_float = (int)is_float_bool;
+			                                use_alpha,
+			                                metadata);
 		}
+		is_float = metadata.is_float;
+		is_linear = metadata.is_linear;
 	}
 
 	if(slot == -1) {
@@ -417,7 +411,7 @@ NODE_DEFINE(EnvironmentTextureNode)
 
 	TEXTURE_MAPPING_DEFINE(EnvironmentTextureNode);
 
-	SOCKET_STRING(filename, "Filename", ustring(""));
+	SOCKET_STRING(filename, "Filename", ustring());
 
 	static NodeEnum color_space_enum;
 	color_space_enum.insert("none", NODE_COLOR_SPACE_NONE);
@@ -499,17 +493,17 @@ void EnvironmentTextureNode::compile(SVMCompiler& compiler)
 
 	image_manager = compiler.image_manager;
 	if(slot == -1) {
-		bool is_float_bool;
+		ImageMetaData metadata;
 		slot = image_manager->add_image(filename.string(),
 		                                builtin_data,
 		                                animated,
 		                                0,
-		                                is_float_bool,
-		                                is_linear,
 		                                interpolation,
 		                                EXTENSION_REPEAT,
-		                                use_alpha);
-		is_float = (int)is_float_bool;
+		                                use_alpha,
+		                                metadata);
+		is_float = metadata.is_float;
+		is_linear = metadata.is_linear;
 	}
 
 	if(slot != -1) {
@@ -551,25 +545,22 @@ void EnvironmentTextureNode::compile(OSLCompiler& compiler)
 	 */
 	image_manager = compiler.image_manager;
 	if(is_float == -1) {
+		ImageMetaData metadata;
 		if(builtin_data == NULL) {
-			ImageManager::ImageDataType type;
-			type = image_manager->get_image_metadata(filename.string(), NULL, is_linear);
-			if(type == ImageManager::IMAGE_DATA_TYPE_FLOAT || type == ImageManager::IMAGE_DATA_TYPE_FLOAT4)
-				is_float = 1;
+			image_manager->get_image_metadata(filename.string(), NULL, metadata);
 		}
 		else {
-			bool is_float_bool;
 			slot = image_manager->add_image(filename.string(),
 			                                builtin_data,
 			                                animated,
 			                                0,
-			                                is_float_bool,
-			                                is_linear,
 			                                interpolation,
 			                                EXTENSION_REPEAT,
-			                                use_alpha);
-			is_float = (int)is_float_bool;
+			                                use_alpha,
+			                                metadata);
 		}
+		is_float = metadata.is_float;
+		is_linear = metadata.is_linear;
 	}
 
 	if(slot == -1) {
@@ -1345,7 +1336,7 @@ NODE_DEFINE(PointDensityTextureNode)
 {
 	NodeType* type = NodeType::add("point_density_texture", create, NodeType::SHADER);
 
-	SOCKET_STRING(filename, "Filename", ustring(""));
+	SOCKET_STRING(filename, "Filename", ustring());
 
 	static NodeEnum space_enum;
 	space_enum.insert("object", NODE_TEX_VOXEL_SPACE_OBJECT);
@@ -1418,13 +1409,13 @@ void PointDensityTextureNode::compile(SVMCompiler& compiler)
 
 	if(use_density || use_color) {
 		if(slot == -1) {
-			bool is_float, is_linear;
+			ImageMetaData metadata;
 			slot = image_manager->add_image(filename.string(), builtin_data,
 			                                false, 0,
-			                                is_float, is_linear,
 			                                interpolation,
 			                                EXTENSION_CLIP,
-			                                true);
+			                                true,
+			                                metadata);
 		}
 
 		if(slot != -1) {
@@ -1439,7 +1430,6 @@ void PointDensityTextureNode::compile(SVMCompiler& compiler)
 				compiler.add_node(tfm.x);
 				compiler.add_node(tfm.y);
 				compiler.add_node(tfm.z);
-				compiler.add_node(tfm.w);
 			}
 		}
 		else {
@@ -1470,20 +1460,20 @@ void PointDensityTextureNode::compile(OSLCompiler& compiler)
 
 	if(use_density || use_color) {
 		if(slot == -1) {
-			bool is_float, is_linear;
+			ImageMetaData metadata;
 			slot = image_manager->add_image(filename.string(), builtin_data,
 			                                false, 0,
-			                                is_float, is_linear,
 			                                interpolation,
 			                                EXTENSION_CLIP,
-			                                true);
+			                                true,
+			                                metadata);
 		}
 
 		if(slot != -1) {
 			compiler.parameter("filename", string_printf("@%d", slot).c_str());
 		}
 		if(space == NODE_TEX_VOXEL_SPACE_WORLD) {
-			compiler.parameter("mapping", transform_transpose(tfm));
+			compiler.parameter("mapping", tfm);
 			compiler.parameter("use_mapping", 1);
 		}
 		compiler.parameter(this, "interpolation");
@@ -1563,8 +1553,7 @@ void MappingNode::compile(SVMCompiler& compiler)
 
 void MappingNode::compile(OSLCompiler& compiler)
 {
-	Transform tfm = transform_transpose(tex_mapping.compute_transform());
-	compiler.parameter("Matrix", tfm);
+	compiler.parameter("Matrix", tex_mapping.compute_transform());
 	compiler.parameter_point("mapping_min", tex_mapping.min);
 	compiler.parameter_point("mapping_max", tex_mapping.max);
 	compiler.parameter("use_minmax", tex_mapping.use_minmax);
@@ -1790,12 +1779,27 @@ void ConvertNode::compile(OSLCompiler& compiler)
 		assert(0);
 }
 
+/* Base type for all closure-type nodes */
+
+BsdfBaseNode::BsdfBaseNode(const NodeType *node_type)
+        : ShaderNode(node_type)
+{
+	special_type = SHADER_SPECIAL_TYPE_CLOSURE;
+}
+
+bool BsdfBaseNode::has_bump()
+{
+	/* Bump is present when "Normal" is linked to anything but a geometry-type node. */
+	ShaderInput *normal_in = input("Normal");
+	return (normal_in && normal_in->link &&
+	        normal_in->link->parent->special_type != SHADER_SPECIAL_TYPE_GEOMETRY);
+}
+
 /* BSDF Closure */
 
 BsdfNode::BsdfNode(const NodeType *node_type)
-: ShaderNode(node_type)
+: BsdfBaseNode(node_type)
 {
-	special_type = SHADER_SPECIAL_TYPE_CLOSURE;
 }
 
 void BsdfNode::compile(SVMCompiler& compiler, ShaderInput *param1, ShaderInput *param2, ShaderInput *param3, ShaderInput *param4)
@@ -1854,7 +1858,7 @@ NODE_DEFINE(AnisotropicBsdfNode)
 
 	SOCKET_IN_VECTOR(tangent, "Tangent", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_TANGENT);
 
-	SOCKET_IN_FLOAT(roughness, "Roughness", 0.2f);
+	SOCKET_IN_FLOAT(roughness, "Roughness", 0.5f);
 	SOCKET_IN_FLOAT(anisotropy, "Anisotropy", 0.5f);
 	SOCKET_IN_FLOAT(rotation, "Rotation", 0.0f);
 
@@ -1914,7 +1918,7 @@ NODE_DEFINE(GlossyBsdfNode)
 	distribution_enum.insert("ashikhmin_shirley", CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID);
 	distribution_enum.insert("Multiscatter GGX", CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID);
 	SOCKET_ENUM(distribution, "Distribution", distribution_enum, CLOSURE_BSDF_MICROFACET_GGX_ID);
-	SOCKET_IN_FLOAT(roughness, "Roughness", 0.2f);
+	SOCKET_IN_FLOAT(roughness, "Roughness", 0.5f);
 
 	SOCKET_OUT_CLOSURE(BSDF, "BSDF");
 
@@ -1931,21 +1935,38 @@ GlossyBsdfNode::GlossyBsdfNode()
 void GlossyBsdfNode::simplify_settings(Scene *scene)
 {
 	if(distribution_orig == NBUILTIN_CLOSURES) {
+		roughness_orig = roughness;
 		distribution_orig = distribution;
 	}
+	else {
+		/* By default we use original values, so we don't need to worry about
+		 * restoring defaults later on and only override them when needed.
+		 */
+		roughness = roughness_orig;
+		distribution = distribution_orig;
+	}
 	Integrator *integrator = scene->integrator;
+	ShaderInput *roughness_input = input("Roughness");
 	if(integrator->filter_glossy == 0.0f) {
 		/* Fallback to Sharp closure for Roughness close to 0.
 		 * Note: Keep the epsilon in sync with kernel!
 		 */
-		ShaderInput *roughness_input = input("Roughness");
 		if(!roughness_input->link && roughness <= 1e-4f) {
+			VLOG(1) << "Using sharp glossy BSDF.";
 			distribution = CLOSURE_BSDF_REFLECTION_ID;
 		}
 	}
 	else {
-		/* Rollback to original distribution when filter glossy is used. */
-		distribution = distribution_orig;
+		/* If filter glossy is used we replace Sharp glossy with GGX so we can
+		 * benefit from closure blur to remove unwanted noise.
+		 */
+		if(roughness_input->link == NULL &&
+		   distribution == CLOSURE_BSDF_REFLECTION_ID)
+		{
+			VLOG(1) << "Using GGX glossy with filter glossy.";
+			distribution = CLOSURE_BSDF_MICROFACET_GGX_ID;
+			roughness = 0.0f;
+		}
 	}
 	closure = distribution;
 }
@@ -1953,7 +1974,8 @@ void GlossyBsdfNode::simplify_settings(Scene *scene)
 bool GlossyBsdfNode::has_integrator_dependency()
 {
 	ShaderInput *roughness_input = input("Roughness");
-	return !roughness_input->link && roughness <= 1e-4f;
+	return !roughness_input->link &&
+	       (distribution == CLOSURE_BSDF_REFLECTION_ID || roughness <= 1e-4f);
 }
 
 void GlossyBsdfNode::compile(SVMCompiler& compiler)
@@ -2008,21 +2030,38 @@ GlassBsdfNode::GlassBsdfNode()
 void GlassBsdfNode::simplify_settings(Scene *scene)
 {
 	if(distribution_orig == NBUILTIN_CLOSURES) {
+		roughness_orig = roughness;
 		distribution_orig = distribution;
 	}
+	else {
+		/* By default we use original values, so we don't need to worry about
+		 * restoring defaults later on and only override them when needed.
+		 */
+		roughness = roughness_orig;
+		distribution = distribution_orig;
+	}
 	Integrator *integrator = scene->integrator;
+	ShaderInput *roughness_input = input("Roughness");
 	if(integrator->filter_glossy == 0.0f) {
 		/* Fallback to Sharp closure for Roughness close to 0.
 		 * Note: Keep the epsilon in sync with kernel!
 		 */
-		ShaderInput *roughness_input = input("Roughness");
 		if(!roughness_input->link && roughness <= 1e-4f) {
+			VLOG(1) << "Using sharp glass BSDF.";
 			distribution = CLOSURE_BSDF_SHARP_GLASS_ID;
 		}
 	}
 	else {
-		/* Rollback to original distribution when filter glossy is used. */
-		distribution = distribution_orig;
+		/* If filter glossy is used we replace sharp glass with GGX glass so we
+		 * can benefit from closure blur to remove unwanted noise.
+		 */
+		if(roughness_input->link == NULL &&
+		   distribution == CLOSURE_BSDF_SHARP_GLASS_ID)
+		{
+			VLOG(1) << "Using GGX glass with filter glossy.";
+			distribution = CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID;
+			roughness = 0.0f;
+		}
 	}
 	closure = distribution;
 }
@@ -2030,7 +2069,8 @@ void GlassBsdfNode::simplify_settings(Scene *scene)
 bool GlassBsdfNode::has_integrator_dependency()
 {
 	ShaderInput *roughness_input = input("Roughness");
-	return !roughness_input->link && roughness <= 1e-4f;
+	return !roughness_input->link &&
+	       (distribution == CLOSURE_BSDF_SHARP_GLASS_ID || roughness <= 1e-4f);
 }
 
 void GlassBsdfNode::compile(SVMCompiler& compiler)
@@ -2085,21 +2125,38 @@ RefractionBsdfNode::RefractionBsdfNode()
 void RefractionBsdfNode::simplify_settings(Scene *scene)
 {
 	if(distribution_orig == NBUILTIN_CLOSURES) {
+		roughness_orig = roughness;
 		distribution_orig = distribution;
 	}
+	else {
+		/* By default we use original values, so we don't need to worry about
+		 * restoring defaults later on and only override them when needed.
+		 */
+		roughness = roughness_orig;
+		distribution = distribution_orig;
+	}
 	Integrator *integrator = scene->integrator;
+	ShaderInput *roughness_input = input("Roughness");
 	if(integrator->filter_glossy == 0.0f) {
 		/* Fallback to Sharp closure for Roughness close to 0.
 		 * Note: Keep the epsilon in sync with kernel!
 		 */
-		ShaderInput *roughness_input = input("Roughness");
 		if(!roughness_input->link && roughness <= 1e-4f) {
+			VLOG(1) << "Using sharp refraction BSDF.";
 			distribution = CLOSURE_BSDF_REFRACTION_ID;
 		}
 	}
 	else {
-		/* Rollback to original distribution when filter glossy is used. */
-		distribution = distribution_orig;
+		/* If filter glossy is used we replace sharp refraction with GGX refraction
+		 * so we can benefit from closure blur to remove unwanted noise.
+		 */
+		if(roughness_input->link == NULL &&
+		   distribution == CLOSURE_BSDF_REFRACTION_ID)
+		{
+			VLOG(1) << "Using GGX refraction with filter glossy.";
+			distribution = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
+			roughness = 0.0f;
+		}
 	}
 	closure = distribution;
 }
@@ -2107,7 +2164,8 @@ void RefractionBsdfNode::simplify_settings(Scene *scene)
 bool RefractionBsdfNode::has_integrator_dependency()
 {
 	ShaderInput *roughness_input = input("Roughness");
-	return !roughness_input->link && roughness <= 1e-4f;
+	return !roughness_input->link &&
+	       (distribution == CLOSURE_BSDF_REFRACTION_ID || roughness <= 1e-4f);
 }
 
 void RefractionBsdfNode::compile(SVMCompiler& compiler)
@@ -2231,6 +2289,160 @@ void DiffuseBsdfNode::compile(OSLCompiler& compiler)
 	compiler.add(this, "node_diffuse_bsdf");
 }
 
+/* Disney principled BSDF Closure */
+NODE_DEFINE(PrincipledBsdfNode)
+{
+	NodeType* type = NodeType::add("principled_bsdf", create, NodeType::SHADER);
+
+	static NodeEnum distribution_enum;
+	distribution_enum.insert("GGX", CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID);
+	distribution_enum.insert("Multiscatter GGX", CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID);
+	SOCKET_ENUM(distribution, "Distribution", distribution_enum, CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID);
+
+	static NodeEnum subsurface_method_enum;
+	subsurface_method_enum.insert("burley", CLOSURE_BSSRDF_PRINCIPLED_ID);
+	subsurface_method_enum.insert("random_walk", CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID);
+	SOCKET_ENUM(subsurface_method, "Subsurface Method", subsurface_method_enum, CLOSURE_BSSRDF_PRINCIPLED_ID);
+
+	SOCKET_IN_COLOR(base_color, "Base Color", make_float3(0.8f, 0.8f, 0.8f));
+	SOCKET_IN_COLOR(subsurface_color, "Subsurface Color", make_float3(0.8f, 0.8f, 0.8f));
+	SOCKET_IN_FLOAT(metallic, "Metallic", 0.0f);
+	SOCKET_IN_FLOAT(subsurface, "Subsurface", 0.0f);
+	SOCKET_IN_VECTOR(subsurface_radius, "Subsurface Radius", make_float3(0.1f, 0.1f, 0.1f));
+	SOCKET_IN_FLOAT(specular, "Specular", 0.0f);
+	SOCKET_IN_FLOAT(roughness, "Roughness", 0.5f);
+	SOCKET_IN_FLOAT(specular_tint, "Specular Tint", 0.0f);
+	SOCKET_IN_FLOAT(anisotropic, "Anisotropic", 0.0f);
+	SOCKET_IN_FLOAT(sheen, "Sheen", 0.0f);
+	SOCKET_IN_FLOAT(sheen_tint, "Sheen Tint", 0.0f);
+	SOCKET_IN_FLOAT(clearcoat, "Clearcoat", 0.0f);
+	SOCKET_IN_FLOAT(clearcoat_roughness, "Clearcoat Roughness", 0.03f);
+	SOCKET_IN_FLOAT(ior, "IOR", 0.0f);
+	SOCKET_IN_FLOAT(transmission, "Transmission", 0.0f);
+	SOCKET_IN_FLOAT(transmission_roughness, "Transmission Roughness", 0.0f);
+	SOCKET_IN_FLOAT(anisotropic_rotation, "Anisotropic Rotation", 0.0f);
+	SOCKET_IN_NORMAL(normal, "Normal", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL);
+	SOCKET_IN_NORMAL(clearcoat_normal, "Clearcoat Normal", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL);
+	SOCKET_IN_NORMAL(tangent, "Tangent", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_TANGENT);
+	SOCKET_IN_FLOAT(surface_mix_weight, "SurfaceMixWeight", 0.0f, SocketType::SVM_INTERNAL);
+
+	SOCKET_OUT_CLOSURE(BSDF, "BSDF");
+
+	return type;
+}
+
+PrincipledBsdfNode::PrincipledBsdfNode()
+	: BsdfBaseNode(node_type)
+{
+	closure = CLOSURE_BSDF_PRINCIPLED_ID;
+	distribution = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID;
+	distribution_orig = NBUILTIN_CLOSURES;
+}
+
+bool PrincipledBsdfNode::has_surface_bssrdf()
+{
+	ShaderInput *subsurface_in = input("Subsurface");
+	return (subsurface_in->link != NULL || subsurface > CLOSURE_WEIGHT_CUTOFF);
+}
+
+void PrincipledBsdfNode::attributes(Shader *shader, AttributeRequestSet *attributes)
+{
+	if(shader->has_surface) {
+		ShaderInput *tangent_in = input("Tangent");
+
+		if(!tangent_in->link)
+			attributes->add(ATTR_STD_GENERATED);
+	}
+
+	ShaderNode::attributes(shader, attributes);
+}
+
+void PrincipledBsdfNode::compile(SVMCompiler& compiler, ShaderInput *p_metallic, ShaderInput *p_subsurface, ShaderInput *p_subsurface_radius,
+	ShaderInput *p_specular, ShaderInput *p_roughness, ShaderInput *p_specular_tint, ShaderInput *p_anisotropic,
+	ShaderInput *p_sheen, ShaderInput *p_sheen_tint, ShaderInput *p_clearcoat, ShaderInput *p_clearcoat_roughness,
+	ShaderInput *p_ior, ShaderInput *p_transmission, ShaderInput *p_anisotropic_rotation, ShaderInput *p_transmission_roughness)
+{
+	ShaderInput *base_color_in = input("Base Color");
+	ShaderInput *subsurface_color_in = input("Subsurface Color");
+	ShaderInput *normal_in = input("Normal");
+	ShaderInput *clearcoat_normal_in = input("Clearcoat Normal");
+	ShaderInput *tangent_in = input("Tangent");
+
+	float3 weight = make_float3(1.0f, 1.0f, 1.0f);
+
+	compiler.add_node(NODE_CLOSURE_SET_WEIGHT, weight);
+
+	int normal_offset = compiler.stack_assign_if_linked(normal_in);
+	int clearcoat_normal_offset = compiler.stack_assign_if_linked(clearcoat_normal_in);
+	int tangent_offset = compiler.stack_assign_if_linked(tangent_in);
+	int specular_offset = compiler.stack_assign(p_specular);
+	int roughness_offset = compiler.stack_assign(p_roughness);
+	int specular_tint_offset = compiler.stack_assign(p_specular_tint);
+	int anisotropic_offset = compiler.stack_assign(p_anisotropic);
+	int sheen_offset = compiler.stack_assign(p_sheen);
+	int sheen_tint_offset = compiler.stack_assign(p_sheen_tint);
+	int clearcoat_offset = compiler.stack_assign(p_clearcoat);
+	int clearcoat_roughness_offset = compiler.stack_assign(p_clearcoat_roughness);
+	int ior_offset = compiler.stack_assign(p_ior);
+	int transmission_offset = compiler.stack_assign(p_transmission);
+	int transmission_roughness_offset = compiler.stack_assign(p_transmission_roughness);
+	int anisotropic_rotation_offset = compiler.stack_assign(p_anisotropic_rotation);
+	int subsurface_radius_offset = compiler.stack_assign(p_subsurface_radius);
+
+	compiler.add_node(NODE_CLOSURE_BSDF,
+		compiler.encode_uchar4(closure,
+		compiler.stack_assign(p_metallic),
+		compiler.stack_assign(p_subsurface),
+		compiler.closure_mix_weight_offset()),
+		__float_as_int((p_metallic) ? get_float(p_metallic->socket_type) : 0.0f),
+		__float_as_int((p_subsurface) ? get_float(p_subsurface->socket_type) : 0.0f));
+
+	compiler.add_node(normal_offset, tangent_offset,
+		compiler.encode_uchar4(specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset),
+		compiler.encode_uchar4(sheen_offset, sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset));
+
+	compiler.add_node(compiler.encode_uchar4(ior_offset, transmission_offset, anisotropic_rotation_offset, transmission_roughness_offset),
+		distribution, subsurface_method, SVM_STACK_INVALID);
+
+	float3 bc_default = get_float3(base_color_in->socket_type);
+
+	compiler.add_node(((base_color_in->link) ? compiler.stack_assign(base_color_in) : SVM_STACK_INVALID),
+		__float_as_int(bc_default.x), __float_as_int(bc_default.y), __float_as_int(bc_default.z));
+
+	compiler.add_node(clearcoat_normal_offset, subsurface_radius_offset, SVM_STACK_INVALID, SVM_STACK_INVALID);
+
+	float3 ss_default = get_float3(subsurface_color_in->socket_type);
+
+	compiler.add_node(((subsurface_color_in->link) ? compiler.stack_assign(subsurface_color_in) : SVM_STACK_INVALID),
+		__float_as_int(ss_default.x), __float_as_int(ss_default.y), __float_as_int(ss_default.z));
+}
+
+bool PrincipledBsdfNode::has_integrator_dependency()
+{
+	ShaderInput *roughness_input = input("Roughness");
+	return !roughness_input->link && roughness <= 1e-4f;
+}
+
+void PrincipledBsdfNode::compile(SVMCompiler& compiler)
+{
+	compile(compiler, input("Metallic"), input("Subsurface"), input("Subsurface Radius"), input("Specular"),
+		input("Roughness"), input("Specular Tint"), input("Anisotropic"), input("Sheen"), input("Sheen Tint"),
+		input("Clearcoat"), input("Clearcoat Roughness"), input("IOR"), input("Transmission"),
+		input("Anisotropic Rotation"), input("Transmission Roughness"));
+}
+
+void PrincipledBsdfNode::compile(OSLCompiler& compiler)
+{
+	compiler.parameter(this, "distribution");
+	compiler.parameter(this, "subsurface_method");
+	compiler.add(this, "node_principled_bsdf");
+}
+
+bool PrincipledBsdfNode::has_bssrdf_bump()
+{
+	return has_surface_bssrdf() && has_bump();
+}
+
 /* Translucent BSDF Closure */
 
 NODE_DEFINE(TranslucentBsdfNode)
@@ -2306,6 +2518,7 @@ NODE_DEFINE(SubsurfaceScatteringNode)
 	falloff_enum.insert("cubic", CLOSURE_BSSRDF_CUBIC_ID);
 	falloff_enum.insert("gaussian", CLOSURE_BSSRDF_GAUSSIAN_ID);
 	falloff_enum.insert("burley", CLOSURE_BSSRDF_BURLEY_ID);
+	falloff_enum.insert("random_walk", CLOSURE_BSSRDF_RANDOM_WALK_ID);
 	SOCKET_ENUM(falloff, "Falloff", falloff_enum, CLOSURE_BSSRDF_BURLEY_ID);
 	SOCKET_IN_FLOAT(scale, "Scale", 0.01f);
 	SOCKET_IN_VECTOR(radius, "Radius", make_float3(0.1f, 0.1f, 0.1f));
@@ -2617,6 +2830,120 @@ void ScatterVolumeNode::compile(OSLCompiler& compiler)
 	compiler.add(this, "node_scatter_volume");
 }
 
+/* Principled Volume Closure */
+
+NODE_DEFINE(PrincipledVolumeNode)
+{
+	NodeType* type = NodeType::add("principled_volume", create, NodeType::SHADER);
+
+	SOCKET_IN_STRING(density_attribute, "Density Attribute", ustring());
+	SOCKET_IN_STRING(color_attribute, "Color Attribute", ustring());
+	SOCKET_IN_STRING(temperature_attribute, "Temperature Attribute", ustring());
+
+	SOCKET_IN_COLOR(color, "Color", make_float3(0.5f, 0.5f, 0.5f));
+	SOCKET_IN_FLOAT(density, "Density", 1.0f);
+	SOCKET_IN_FLOAT(anisotropy, "Anisotropy", 0.0f);
+	SOCKET_IN_COLOR(absorption_color, "Absorption Color", make_float3(0.0f, 0.0f, 0.0f));
+	SOCKET_IN_FLOAT(emission_strength, "Emission Strength", 0.0f);
+	SOCKET_IN_COLOR(emission_color, "Emission Color", make_float3(1.0f, 1.0f, 1.0f));
+	SOCKET_IN_FLOAT(blackbody_intensity, "Blackbody Intensity", 0.0f);
+	SOCKET_IN_COLOR(blackbody_tint, "Blackbody Tint", make_float3(1.0f, 1.0f, 1.0f));
+	SOCKET_IN_FLOAT(temperature, "Temperature", 1500.0f);
+	SOCKET_IN_FLOAT(volume_mix_weight, "VolumeMixWeight", 0.0f, SocketType::SVM_INTERNAL);
+
+	SOCKET_OUT_CLOSURE(volume, "Volume");
+
+	return type;
+}
+
+PrincipledVolumeNode::PrincipledVolumeNode()
+: VolumeNode(node_type)
+{
+	closure = CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID;
+}
+
+void PrincipledVolumeNode::attributes(Shader *shader, AttributeRequestSet *attributes)
+{
+	if(shader->has_volume) {
+		ShaderInput *density_in = input("Density");
+		ShaderInput *blackbody_in = input("Blackbody Intensity");
+
+		if(density_in->link || density > 0.0f) {
+			attributes->add_standard(density_attribute);
+			attributes->add_standard(color_attribute);
+		}
+
+		if(blackbody_in->link || blackbody_intensity > 0.0f) {
+			attributes->add_standard(temperature_attribute);
+		}
+
+		attributes->add(ATTR_STD_GENERATED_TRANSFORM);
+	}
+
+	ShaderNode::attributes(shader, attributes);
+}
+
+void PrincipledVolumeNode::compile(SVMCompiler& compiler)
+{
+	ShaderInput *color_in = input("Color");
+	ShaderInput *density_in = input("Density");
+	ShaderInput *anisotropy_in = input("Anisotropy");
+	ShaderInput *absorption_color_in = input("Absorption Color");
+	ShaderInput *emission_in = input("Emission Strength");
+	ShaderInput *emission_color_in = input("Emission Color");
+	ShaderInput *blackbody_in = input("Blackbody Intensity");
+	ShaderInput *blackbody_tint_in = input("Blackbody Tint");
+	ShaderInput *temperature_in = input("Temperature");
+
+	if(color_in->link)
+		compiler.add_node(NODE_CLOSURE_WEIGHT, compiler.stack_assign(color_in));
+	else
+		compiler.add_node(NODE_CLOSURE_SET_WEIGHT, color);
+
+	compiler.add_node(NODE_PRINCIPLED_VOLUME,
+		compiler.encode_uchar4(
+			compiler.stack_assign_if_linked(density_in),
+			compiler.stack_assign_if_linked(anisotropy_in),
+			compiler.stack_assign(absorption_color_in),
+			compiler.closure_mix_weight_offset()),
+		compiler.encode_uchar4(
+			compiler.stack_assign_if_linked(emission_in),
+			compiler.stack_assign(emission_color_in),
+			compiler.stack_assign_if_linked(blackbody_in),
+			compiler.stack_assign(temperature_in)),
+		compiler.stack_assign(blackbody_tint_in));
+
+	int attr_density = compiler.attribute_standard(density_attribute);
+	int attr_color = compiler.attribute_standard(color_attribute);
+	int attr_temperature = compiler.attribute_standard(temperature_attribute);
+
+	compiler.add_node(
+		__float_as_int(density),
+		__float_as_int(anisotropy),
+		__float_as_int(emission_strength),
+		__float_as_int(blackbody_intensity));
+
+	compiler.add_node(
+		attr_density,
+		attr_color,
+		attr_temperature);
+}
+
+void PrincipledVolumeNode::compile(OSLCompiler& compiler)
+{
+	if(Attribute::name_standard(density_attribute.c_str())) {
+		density_attribute = ustring("geom:" + density_attribute.string());
+	}
+	if(Attribute::name_standard(color_attribute.c_str())) {
+		color_attribute = ustring("geom:" + color_attribute.string());
+	}
+	if(Attribute::name_standard(temperature_attribute.c_str())) {
+		temperature_attribute = ustring("geom:" + temperature_attribute.string());
+	}
+
+	compiler.add(this, "node_principled_volume");
+}
+
 /* Hair BSDF Closure */
 
 NODE_DEFINE(HairBsdfNode)
@@ -2887,7 +3214,6 @@ void TextureCoordinateNode::compile(SVMCompiler& compiler)
 			compiler.add_node(ob_itfm.x);
 			compiler.add_node(ob_itfm.y);
 			compiler.add_node(ob_itfm.z);
-			compiler.add_node(ob_itfm.w);
 		}
 	}
 
@@ -2926,7 +3252,7 @@ void TextureCoordinateNode::compile(OSLCompiler& compiler)
 	if(compiler.output_type() == SHADER_TYPE_VOLUME)
 		compiler.parameter("is_volume", true);
 	compiler.parameter(this, "use_transform");
-	Transform ob_itfm = transform_transpose(transform_inverse(ob_tfm));
+	Transform ob_itfm = transform_transposed_inverse(ob_tfm);
 	compiler.parameter("object_itfm", ob_itfm);
 
 	compiler.parameter(this, "from_dupli");
@@ -2940,7 +3266,7 @@ NODE_DEFINE(UVMapNode)
 {
 	NodeType* type = NodeType::add("uvmap", create, NodeType::SHADER);
 
-	SOCKET_IN_STRING(attribute, "attribute", ustring(""));
+	SOCKET_STRING(attribute, "attribute", ustring());
 	SOCKET_IN_BOOLEAN(from_dupli, "from dupli", false);
 
 	SOCKET_OUT_POINT(UV, "UV");
@@ -3237,6 +3563,7 @@ NODE_DEFINE(ParticleInfoNode)
 	NodeType* type = NodeType::add("particle_info", create, NodeType::SHADER);
 
 	SOCKET_OUT_FLOAT(index, "Index");
+	SOCKET_OUT_FLOAT(random, "Random");
 	SOCKET_OUT_FLOAT(age, "Age");
 	SOCKET_OUT_FLOAT(lifetime, "Lifetime");
 	SOCKET_OUT_POINT(location, "Location");
@@ -3259,6 +3586,8 @@ void ParticleInfoNode::attributes(Shader *shader, AttributeRequestSet *attribute
 {
 	if(!output("Index")->links.empty())
 		attributes->add(ATTR_STD_PARTICLE);
+	if(!output("Random")->links.empty())
+		attributes->add(ATTR_STD_PARTICLE);
 	if(!output("Age")->links.empty())
 		attributes->add(ATTR_STD_PARTICLE);
 	if(!output("Lifetime")->links.empty())
@@ -3287,6 +3616,11 @@ void ParticleInfoNode::compile(SVMCompiler& compiler)
 	if(!out->links.empty()) {
 		compiler.add_node(NODE_PARTICLE_INFO, NODE_INFO_PAR_INDEX, compiler.stack_assign(out));
 	}
+
+	out = output("Random");
+	if(!out->links.empty()) {
+		compiler.add_node(NODE_PARTICLE_INFO, NODE_INFO_PAR_RANDOM, compiler.stack_assign(out));
+	}
 	
 	out = output("Age");
 	if(!out->links.empty()) {
@@ -3341,10 +3675,11 @@ NODE_DEFINE(HairInfoNode)
 	SOCKET_OUT_FLOAT(is_strand, "Is Strand");
 	SOCKET_OUT_FLOAT(intercept, "Intercept");
 	SOCKET_OUT_FLOAT(thickness, "Thickness");
-	SOCKET_OUT_NORMAL(tangent Normal, "Tangent Normal");
+	SOCKET_OUT_NORMAL(tangent_normal, "Tangent Normal");
 #if 0 /*output for minimum hair width transparency - deactivated */
 	SOCKET_OUT_FLOAT(fade, "Fade");
 #endif
+	SOCKET_OUT_FLOAT(index, "Random");
 
 	return type;
 }
@@ -3361,6 +3696,9 @@ void HairInfoNode::attributes(Shader *shader, AttributeRequestSet *attributes)
 
 		if(!intercept_out->links.empty())
 			attributes->add(ATTR_STD_CURVE_INTERCEPT);
+
+		if(!output("Random")->links.empty())
+			attributes->add(ATTR_STD_CURVE_RANDOM);
 	}
 
 	ShaderNode::attributes(shader, attributes);
@@ -3396,6 +3734,11 @@ void HairInfoNode::compile(SVMCompiler& compiler)
 		compiler.add_node(NODE_HAIR_INFO, NODE_INFO_CURVE_FADE, compiler.stack_assign(out));
 	}*/
 
+	out = output("Random");
+	if(!out->links.empty()) {
+		int attr = compiler.attribute(ATTR_STD_CURVE_RANDOM);
+		compiler.add_node(NODE_ATTR, attr, compiler.stack_assign(out), NODE_ATTR_FLOAT);
+	}
 }
 
 void HairInfoNode::compile(OSLCompiler& compiler)
@@ -4221,7 +4564,7 @@ NODE_DEFINE(AttributeNode)
 {
 	NodeType* type = NodeType::add("attribute", create, NodeType::SHADER);
 
-	SOCKET_STRING(attribute, "Attribute", ustring(""));
+	SOCKET_STRING(attribute, "Attribute", ustring());
 
 	SOCKET_OUT_COLOR(color, "Color");
 	SOCKET_OUT_VECTOR(vector, "Vector");
@@ -4242,16 +4585,12 @@ void AttributeNode::attributes(Shader *shader, AttributeRequestSet *attributes)
 	ShaderOutput *fac_out = output("Fac");
 
 	if(!color_out->links.empty() || !vector_out->links.empty() || !fac_out->links.empty()) {
-		AttributeStandard std = Attribute::name_standard(attribute.c_str());
-
-		if(std != ATTR_STD_NONE)
-			attributes->add(std);
-		else
-			attributes->add(attribute);
+		attributes->add_standard(attribute);
 	}
 
-	if(shader->has_volume)
+	if(shader->has_volume) {
 		attributes->add(ATTR_STD_GENERATED_TRANSFORM);
+	}
 
 	ShaderNode::attributes(shader, attributes);
 }
@@ -4262,13 +4601,7 @@ void AttributeNode::compile(SVMCompiler& compiler)
 	ShaderOutput *vector_out = output("Vector");
 	ShaderOutput *fac_out = output("Fac");
 	ShaderNodeType attr_node = NODE_ATTR;
-	AttributeStandard std = Attribute::name_standard(attribute.c_str());
-	int attr;
-
-	if(std != ATTR_STD_NONE)
-		attr = compiler.attribute(std);
-	else
-		attr = compiler.attribute(attribute);
+	int attr = compiler.attribute_standard(attribute);
 
 	if(bump == SHADER_BUMP_DX)
 		attr_node = NODE_ATTR_BUMP_DX;
@@ -4561,7 +4894,7 @@ NODE_DEFINE(OutputNode)
 
 	SOCKET_IN_CLOSURE(surface, "Surface");
 	SOCKET_IN_CLOSURE(volume, "Volume");
-	SOCKET_IN_FLOAT(displacement, "Displacement", 0.0f);
+	SOCKET_IN_VECTOR(displacement, "Displacement", make_float3(0.0f, 0.0f, 0.0f));
 	SOCKET_IN_NORMAL(normal, "Normal", make_float3(0.0f, 0.0f, 0.0f));
 
 	return type;
@@ -5226,7 +5559,7 @@ NODE_DEFINE(NormalMapNode)
 	space_enum.insert("blender_world", NODE_NORMAL_MAP_BLENDER_WORLD);
 	SOCKET_ENUM(space, "Space", space_enum, NODE_TANGENT_RADIAL);
 
-	SOCKET_STRING(attribute, "Attribute", ustring(""));
+	SOCKET_STRING(attribute, "Attribute", ustring());
 
 	SOCKET_IN_NORMAL(normal_osl, "NormalIn", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL | SocketType::OSL_INTERNAL);
 	SOCKET_IN_FLOAT(strength, "Strength", 1.0f);
@@ -5245,7 +5578,7 @@ NormalMapNode::NormalMapNode()
 void NormalMapNode::attributes(Shader *shader, AttributeRequestSet *attributes)
 {
 	if(shader->has_surface && space == NODE_NORMAL_MAP_TANGENT) {
-		if(attribute == ustring("")) {
+		if(attribute.empty()) {
 			attributes->add(ATTR_STD_UV_TANGENT);
 			attributes->add(ATTR_STD_UV_TANGENT_SIGN);
 		}
@@ -5268,7 +5601,7 @@ void NormalMapNode::compile(SVMCompiler& compiler)
 	int attr = 0, attr_sign = 0;
 
 	if(space == NODE_NORMAL_MAP_TANGENT) {
-		if(attribute == ustring("")) {
+		if(attribute.empty()) {
 			attr = compiler.attribute(ATTR_STD_UV_TANGENT);
 			attr_sign = compiler.attribute(ATTR_STD_UV_TANGENT_SIGN);
 		}
@@ -5290,7 +5623,7 @@ void NormalMapNode::compile(SVMCompiler& compiler)
 void NormalMapNode::compile(OSLCompiler& compiler)
 {
 	if(space == NODE_NORMAL_MAP_TANGENT) {
-		if(attribute == ustring("")) {
+		if(attribute.empty()) {
 			compiler.parameter("attr_name", ustring("geom:tangent"));
 			compiler.parameter("attr_sign_name", ustring("geom:tangent_sign"));
 		}
@@ -5321,7 +5654,7 @@ NODE_DEFINE(TangentNode)
 	axis_enum.insert("z", NODE_TANGENT_AXIS_Z);
 	SOCKET_ENUM(axis, "Axis", axis_enum, NODE_TANGENT_AXIS_X);
 
-	SOCKET_STRING(attribute, "Attribute", ustring(""));
+	SOCKET_STRING(attribute, "Attribute", ustring());
 
 	SOCKET_IN_NORMAL(normal_osl, "NormalIn", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL | SocketType::OSL_INTERNAL);
 	SOCKET_OUT_NORMAL(tangent, "Tangent");
@@ -5338,7 +5671,7 @@ void TangentNode::attributes(Shader *shader, AttributeRequestSet *attributes)
 {
 	if(shader->has_surface) {
 		if(direction_type == NODE_TANGENT_UVMAP) {
-			if(attribute == ustring(""))
+			if(attribute.empty())
 				attributes->add(ATTR_STD_UV_TANGENT);
 			else
 				attributes->add(ustring((string(attribute.c_str()) + ".tangent").c_str()));
@@ -5356,7 +5689,7 @@ void TangentNode::compile(SVMCompiler& compiler)
 	int attr;
 
 	if(direction_type == NODE_TANGENT_UVMAP) {
-		if(attribute == ustring(""))
+		if(attribute.empty())
 			attr = compiler.attribute(ATTR_STD_UV_TANGENT);
 		else
 			attr = compiler.attribute(ustring((string(attribute.c_str()) + ".tangent").c_str()));
@@ -5374,7 +5707,7 @@ void TangentNode::compile(SVMCompiler& compiler)
 void TangentNode::compile(OSLCompiler& compiler)
 {
 	if(direction_type == NODE_TANGENT_UVMAP) {
-		if(attribute == ustring(""))
+		if(attribute.empty())
 			compiler.parameter("attr_name", ustring("geom:tangent"));
 		else
 			compiler.parameter("attr_name", ustring((string(attribute.c_str()) + ".tangent").c_str()));
@@ -5385,4 +5718,205 @@ void TangentNode::compile(OSLCompiler& compiler)
 	compiler.add(this, "node_tangent"); 
 }
 
+/* Bevel */
+
+NODE_DEFINE(BevelNode)
+{
+	NodeType* type = NodeType::add("bevel", create, NodeType::SHADER);
+
+	SOCKET_INT(samples, "Samples", 4);
+
+	SOCKET_IN_FLOAT(radius, "Radius", 0.05f);
+	SOCKET_IN_NORMAL(normal, "Normal", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL);
+
+	SOCKET_OUT_NORMAL(bevel, "Normal");
+
+	return type;
+}
+
+BevelNode::BevelNode()
+: ShaderNode(node_type)
+{
+}
+
+void BevelNode::compile(SVMCompiler& compiler)
+{
+	ShaderInput *radius_in = input("Radius");
+	ShaderInput *normal_in = input("Normal");
+	ShaderOutput *normal_out = output("Normal");
+
+	compiler.add_node(NODE_BEVEL,
+		compiler.encode_uchar4(samples,
+		                       compiler.stack_assign(radius_in),
+		                       compiler.stack_assign_if_linked(normal_in),
+		                       compiler.stack_assign(normal_out)));
+}
+
+void BevelNode::compile(OSLCompiler& compiler)
+{
+	compiler.parameter(this, "samples");
+	compiler.add(this, "node_bevel");
+}
+
+/* Displacement */
+
+NODE_DEFINE(DisplacementNode)
+{
+	NodeType* type = NodeType::add("displacement", create, NodeType::SHADER);
+	/* Only object and world space are registered here, so default to one of them. */
+	static NodeEnum space_enum;
+	space_enum.insert("object", NODE_NORMAL_MAP_OBJECT);
+	space_enum.insert("world", NODE_NORMAL_MAP_WORLD);
+
+	SOCKET_ENUM(space, "Space", space_enum, NODE_NORMAL_MAP_OBJECT);
+
+	SOCKET_IN_FLOAT(height, "Height", 0.0f);
+	SOCKET_IN_FLOAT(midlevel, "Midlevel", 0.5f);
+	SOCKET_IN_FLOAT(scale, "Scale", 1.0f);
+	SOCKET_IN_NORMAL(normal, "Normal", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL);
+
+	SOCKET_OUT_VECTOR(displacement, "Displacement");
+
+	return type;
+}
+
+DisplacementNode::DisplacementNode()
+: ShaderNode(node_type)
+{
+}
+
+void DisplacementNode::constant_fold(const ConstantFolder& folder)
+{
+	if(folder.all_inputs_constant()) {
+		if((height - midlevel == 0.0f) || (scale == 0.0f)) {
+			folder.make_zero();
+		}
+	}
+}
+
+void DisplacementNode::compile(SVMCompiler& compiler)
+{
+	ShaderInput *height_in = input("Height");
+	ShaderInput *midlevel_in = input("Midlevel");
+	ShaderInput *scale_in = input("Scale");
+	ShaderInput *normal_in = input("Normal");
+	ShaderOutput *displacement_out = output("Displacement");
+
+	compiler.add_node(NODE_DISPLACEMENT,
+		compiler.encode_uchar4(compiler.stack_assign(height_in),
+		                       compiler.stack_assign(midlevel_in),
+		                       compiler.stack_assign(scale_in),
+		                       compiler.stack_assign_if_linked(normal_in)),
+	    compiler.stack_assign(displacement_out),
+		space);
+}
+
+void DisplacementNode::compile(OSLCompiler& compiler)
+{
+	compiler.parameter(this, "space");
+	compiler.add(this, "node_displacement");
+}
+
+/* Vector Displacement */
+
+NODE_DEFINE(VectorDisplacementNode)
+{
+	NodeType* type = NodeType::add("vector_displacement", create, NodeType::SHADER);
+
+	static NodeEnum space_enum;
+	space_enum.insert("tangent", NODE_NORMAL_MAP_TANGENT);
+	space_enum.insert("object", NODE_NORMAL_MAP_OBJECT);
+	space_enum.insert("world", NODE_NORMAL_MAP_WORLD);
+
+	SOCKET_ENUM(space, "Space", space_enum, NODE_NORMAL_MAP_TANGENT);
+	SOCKET_STRING(attribute, "Attribute", ustring());
+
+	SOCKET_IN_COLOR(vector, "Vector", make_float3(0.0f, 0.0f, 0.0f));
+	SOCKET_IN_FLOAT(midlevel, "Midlevel", 0.0f);
+	SOCKET_IN_FLOAT(scale, "Scale", 1.0f);
+
+	SOCKET_OUT_VECTOR(displacement, "Displacement");
+
+	return type;
+}
+
+VectorDisplacementNode::VectorDisplacementNode()
+: ShaderNode(node_type)
+{
+}
+
+void VectorDisplacementNode::constant_fold(const ConstantFolder& folder)
+{
+	if(folder.all_inputs_constant()) {
+		if((vector == make_float3(0.0f, 0.0f, 0.0f) && midlevel == 0.0f) ||
+		   (scale == 0.0f)) {
+			folder.make_zero();
+		}
+	}
+}
+
+void VectorDisplacementNode::attributes(Shader *shader, AttributeRequestSet *attributes)
+{
+	if(shader->has_surface && space == NODE_NORMAL_MAP_TANGENT) {
+		if(attribute.empty()) {
+			attributes->add(ATTR_STD_UV_TANGENT);
+			attributes->add(ATTR_STD_UV_TANGENT_SIGN);
+		}
+		else {
+			attributes->add(ustring((string(attribute.c_str()) + ".tangent").c_str()));
+			attributes->add(ustring((string(attribute.c_str()) + ".tangent_sign").c_str()));
+		}
+
+		attributes->add(ATTR_STD_VERTEX_NORMAL);
+	}
+
+	ShaderNode::attributes(shader, attributes);
+}
+
+void VectorDisplacementNode::compile(SVMCompiler& compiler)
+{
+	ShaderInput *vector_in = input("Vector");
+	ShaderInput *midlevel_in = input("Midlevel");
+	ShaderInput *scale_in = input("Scale");
+	ShaderOutput *displacement_out = output("Displacement");
+	int attr = 0, attr_sign = 0;
+
+	if(space == NODE_NORMAL_MAP_TANGENT) {
+		if(attribute.empty()) {
+			attr = compiler.attribute(ATTR_STD_UV_TANGENT);
+			attr_sign = compiler.attribute(ATTR_STD_UV_TANGENT_SIGN);
+		}
+		else {
+			attr = compiler.attribute(ustring((string(attribute.c_str()) + ".tangent").c_str()));
+			attr_sign = compiler.attribute(ustring((string(attribute.c_str()) + ".tangent_sign").c_str()));
+		}
+	}
+
+	compiler.add_node(NODE_VECTOR_DISPLACEMENT,
+		compiler.encode_uchar4(compiler.stack_assign(vector_in),
+		                       compiler.stack_assign(midlevel_in),
+		                       compiler.stack_assign(scale_in),
+		                       compiler.stack_assign(displacement_out)),
+		attr, attr_sign);
+
+	compiler.add_node(space);
+}
+
+void VectorDisplacementNode::compile(OSLCompiler& compiler)
+{
+	if(space == NODE_NORMAL_MAP_TANGENT) {
+		if(attribute.empty()) {
+			compiler.parameter("attr_name", ustring("geom:tangent"));
+			compiler.parameter("attr_sign_name", ustring("geom:tangent_sign"));
+		}
+		else {
+			compiler.parameter("attr_name", ustring((string(attribute.c_str()) + ".tangent").c_str()));
+			compiler.parameter("attr_sign_name", ustring((string(attribute.c_str()) + ".tangent_sign").c_str()));
+		}
+	}
+
+	compiler.parameter(this, "space");
+	compiler.add(this, "node_vector_displacement");
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index eb0f7977dd1..58c3d472cd3 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -17,10 +17,10 @@
 #ifndef __NODES_H__
 #define __NODES_H__
 
-#include "graph.h"
-#include "node.h"
+#include "render/graph.h"
+#include "graph/node.h"
 
-#include "util_string.h"
+#include "util/util_string.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -82,6 +82,7 @@ public:
 	~ImageTextureNode();
 	ShaderNode *clone() const;
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_attribute_dependency() { return true; }
 
 	ImageManager *image_manager;
 	int is_float;
@@ -112,6 +113,7 @@ public:
 	~EnvironmentTextureNode();
 	ShaderNode *clone() const;
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_attribute_dependency() { return true; }
 	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
 
 	ImageManager *image_manager;
@@ -154,7 +156,7 @@ public:
 
 	void *surface;
 	void *volume;
-	float displacement;
+	float3 displacement;
 	float3 normal;
 
 	/* Don't allow output node de-duplication. */
@@ -252,10 +254,12 @@ public:
 class PointDensityTextureNode : public ShaderNode {
 public:
 	SHADER_NODE_NO_CLONE_CLASS(PointDensityTextureNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 
 	~PointDensityTextureNode();
 	ShaderNode *clone() const;
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_attribute_dependency() { return true; }
 
 	bool has_spatial_varying() { return true; }
 	bool has_object_dependency() { return true; }
@@ -321,25 +325,33 @@ private:
 	static bool initialized;
 };
 
-class BsdfNode : public ShaderNode {
+class BsdfBaseNode : public ShaderNode {
 public:
-	explicit BsdfNode(const NodeType *node_type);
-	SHADER_NODE_BASE_CLASS(BsdfNode);
+	BsdfBaseNode(const NodeType *node_type);
 
 	bool has_spatial_varying() { return true; }
-	void compile(SVMCompiler& compiler, ShaderInput *param1, ShaderInput *param2, ShaderInput *param3 = NULL, ShaderInput *param4 = NULL);
 	virtual ClosureType get_closure_type() { return closure; }
-
-	float3 color;
-	float3 normal;
-	float surface_mix_weight;
-	ClosureType closure;
+	virtual bool has_bump();
 
 	virtual bool equals(const ShaderNode& /*other*/)
 	{
 		/* TODO(sergey): With some care BSDF nodes can be de-duplicated. */
 		return false;
 	}
+
+	ClosureType closure;
+};
+
+class BsdfNode : public BsdfBaseNode {
+public:
+	explicit BsdfNode(const NodeType *node_type);
+	SHADER_NODE_BASE_CLASS(BsdfNode)
+
+	void compile(SVMCompiler& compiler, ShaderInput *param1, ShaderInput *param2, ShaderInput *param3 = NULL, ShaderInput *param4 = NULL);
+
+	float3 color;
+	float3 normal;
+	float surface_mix_weight;
 };
 
 class AnisotropicBsdfNode : public BsdfNode {
@@ -352,6 +364,7 @@ public:
 
 	ClosureType get_closure_type() { return distribution; }
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_attribute_dependency() { return true; }
 };
 
 class DiffuseBsdfNode : public BsdfNode {
@@ -361,6 +374,33 @@ public:
 	float roughness;
 };
 
+/* Disney principled BRDF */
+class PrincipledBsdfNode : public BsdfBaseNode {
+public:
+	SHADER_NODE_CLASS(PrincipledBsdfNode)
+
+	bool has_surface_bssrdf();
+	bool has_bssrdf_bump();
+	void compile(SVMCompiler& compiler, ShaderInput *metallic, ShaderInput *subsurface, ShaderInput *subsurface_radius,
+		ShaderInput *specular, ShaderInput *roughness, ShaderInput *specular_tint, ShaderInput *anisotropic,
+		ShaderInput *sheen, ShaderInput *sheen_tint, ShaderInput *clearcoat, ShaderInput *clearcoat_roughness,
+		ShaderInput *ior, ShaderInput *transmission, ShaderInput *anisotropic_rotation, ShaderInput *transmission_roughness);
+
+	float3 base_color;
+	float3 subsurface_color, subsurface_radius;
+	float metallic, subsurface, specular, roughness, specular_tint, anisotropic,
+		sheen, sheen_tint, clearcoat, clearcoat_roughness, ior, transmission,
+		anisotropic_rotation, transmission_roughness;
+	float3 normal, clearcoat_normal, tangent;
+	float surface_mix_weight;
+	ClosureType distribution, distribution_orig;
+	ClosureType subsurface_method;
+
+	bool has_integrator_dependency();
+	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_attribute_dependency() { return true; }
+};
+
 class TranslucentBsdfNode : public BsdfNode {
 public:
 	SHADER_NODE_CLASS(TranslucentBsdfNode)
@@ -388,7 +428,7 @@ public:
 	bool has_integrator_dependency();
 	ClosureType get_closure_type() { return distribution; }
 
-	float roughness;
+	float roughness, roughness_orig;
 	ClosureType distribution, distribution_orig;
 };
 
@@ -400,7 +440,7 @@ public:
 	bool has_integrator_dependency();
 	ClosureType get_closure_type() { return distribution; }
 
-	float roughness, IOR;
+	float roughness, roughness_orig, IOR;
 	ClosureType distribution, distribution_orig;
 };
 
@@ -412,7 +452,7 @@ public:
 	bool has_integrator_dependency();
 	ClosureType get_closure_type() { return distribution; }
 
-	float roughness, IOR;
+	float roughness, roughness_orig, IOR;
 	ClosureType distribution, distribution_orig;
 };
 
@@ -442,9 +482,9 @@ class EmissionNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(EmissionNode)
 	void constant_fold(const ConstantFolder& folder);
-	virtual ClosureType get_closure_type() { return CLOSURE_EMISSION_ID; }
 
 	bool has_surface_emission() { return true; }
+	bool has_volume_support() { return true; }
 
 	float3 color;
 	float strength;
@@ -455,7 +495,6 @@ class BackgroundNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(BackgroundNode)
 	void constant_fold(const ConstantFolder& folder);
-	virtual ClosureType get_closure_type() { return CLOSURE_BACKGROUND_ID; }
 
 	float3 color;
 	float strength;
@@ -496,6 +535,7 @@ public:
 		return ShaderNode::get_feature() | NODE_FEATURE_VOLUME;
 	}
 	virtual ClosureType get_closure_type() { return closure; }
+	virtual bool has_volume_support() { return true; }
 
 	float3 color;
 	float density;
@@ -521,6 +561,25 @@ public:
 	float anisotropy;
 };
 
+class PrincipledVolumeNode : public VolumeNode {
+public:
+	SHADER_NODE_CLASS(PrincipledVolumeNode)
+	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_attribute_dependency() { return true; }
+
+	ustring density_attribute;
+	ustring color_attribute;
+	ustring temperature_attribute;
+
+	float anisotropy;
+	float3 absorption_color;
+	float emission_strength;
+	float3 emission_color;
+	float blackbody_intensity;
+	float3 blackbody_tint;
+	float temperature;
+};
+
 class HairBsdfNode : public BsdfNode {
 public:
 	SHADER_NODE_CLASS(HairBsdfNode)
@@ -537,6 +596,7 @@ class GeometryNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(GeometryNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_attribute_dependency() { return true; }
 	bool has_spatial_varying() { return true; }
 
 	float3 normal_osl;
@@ -546,6 +606,7 @@ class TextureCoordinateNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(TextureCoordinateNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_attribute_dependency() { return true; }
 	bool has_spatial_varying() { return true; }
 	bool has_object_dependency() { return use_transform; }
 
@@ -559,6 +620,7 @@ class UVMapNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(UVMapNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_attribute_dependency() { return true; }
 	bool has_spatial_varying() { return true; }
 	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 
@@ -592,6 +654,7 @@ class ParticleInfoNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(ParticleInfoNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_attribute_dependency() { return true; }
 	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
@@ -600,6 +663,7 @@ public:
 	SHADER_NODE_CLASS(HairInfoNode)
 
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_attribute_dependency() { return true; }
 	bool has_spatial_varying() { return true; }
 	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 	virtual int get_feature() {
@@ -641,7 +705,7 @@ public:
 
 class MixClosureWeightNode : public ShaderNode {
 public:
-	SHADER_NODE_CLASS(MixClosureWeightNode);
+	SHADER_NODE_CLASS(MixClosureWeightNode)
 
 	float weight;
 	float fac;
@@ -761,6 +825,7 @@ class AttributeNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(AttributeNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_attribute_dependency() { return true; }
 	bool has_spatial_varying() { return true; }
 
 	ustring attribute;
@@ -887,7 +952,7 @@ public:
 class CurvesNode : public ShaderNode {
 public:
 	explicit CurvesNode(const NodeType *node_type);
-	SHADER_NODE_BASE_CLASS(CurvesNode);
+	SHADER_NODE_BASE_CLASS(CurvesNode)
 
 	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 
@@ -946,6 +1011,8 @@ public:
 
 	/* ideally we could beter detect this, but we can't query this now */
 	bool has_spatial_varying() { return true; }
+	bool has_volume_support() { return true; }
+
 	virtual bool equals(const ShaderNode& /*other*/) { return false; }
 
 	string filepath;
@@ -956,6 +1023,7 @@ class NormalMapNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(NormalMapNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_attribute_dependency() { return true; }
 	bool has_spatial_varying() { return true; }
 	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 
@@ -970,6 +1038,7 @@ class TangentNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(TangentNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_attribute_dependency() { return true; }
 	bool has_spatial_varying() { return true; }
 	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 
@@ -979,6 +1048,50 @@ public:
 	float3 normal_osl;
 };
 
+class BevelNode : public ShaderNode {
+public:
+	SHADER_NODE_CLASS(BevelNode)
+	bool has_spatial_varying() { return true; }
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
+	virtual bool has_raytrace() { return true; }
+
+	float radius;
+	float3 normal;
+	int samples;
+};
+
+class DisplacementNode : public ShaderNode {
+public:
+	SHADER_NODE_CLASS(DisplacementNode)
+	void constant_fold(const ConstantFolder& folder);
+	virtual int get_feature() {
+		return NODE_FEATURE_BUMP;
+	}
+
+	NodeNormalMapSpace space;
+	float height;
+	float midlevel;
+	float scale;
+	float3 normal;
+};
+
+class VectorDisplacementNode : public ShaderNode {
+public:
+	SHADER_NODE_CLASS(VectorDisplacementNode)
+	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_attribute_dependency() { return true; }
+	void constant_fold(const ConstantFolder& folder);
+	virtual int get_feature() {
+		return NODE_FEATURE_BUMP;
+	}
+
+	NodeNormalMapSpace space;
+	ustring attribute;
+	float3 vector;
+	float midlevel;
+	float scale;
+};
+
 CCL_NAMESPACE_END
 
 #endif /* __NODES_H__ */
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index 8342f376836..4d64d841206 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -14,25 +14,71 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "device.h"
-#include "light.h"
-#include "mesh.h"
-#include "curves.h"
-#include "object.h"
-#include "particles.h"
-#include "scene.h"
-
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_progress.h"
-#include "util_vector.h"
-
-#include "subd_patch_table.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/curves.h"
+#include "render/object.h"
+#include "render/particles.h"
+#include "render/scene.h"
+
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_progress.h"
+#include "util/util_vector.h"
+
+#include "subd/subd_patch_table.h"
 
 CCL_NAMESPACE_BEGIN
 
+/* Global state of object transform update. */
+
+struct UpdateObjectTransformState {
+	/* Shared state passed to device_update_object_transform().
+	 * The same instance serves both the threaded and non-threaded update.
+	 */
+
+	/* Motion type for this update, as returned by scene->need_motion(). */
+	Scene::MotionType need_motion;
+
+	/* Mapping from particle system to an index in packed particle array.
+	 * Only used for read.
+	 */
+	map<ParticleSystem*, int> particle_offset;
+
+	/* Mesh area.
+	 * Caches the area per mesh so it is not calculated multiple times. Both
+	 * read and written. Acquire surface_area_lock to keep it all thread safe.
+	 */
+	map<Mesh*, float> surface_area_map;
+
+	/* Per-object start index into object_motion (set for MOTION_BLUR). */
+	array<uint> motion_offset;
+
+	/* Packed output arrays to fill in; the two motion arrays may be NULL. */
+	uint *object_flag;
+	KernelObject *objects;
+	Transform *object_motion_pass;
+	DecomposedTransform *object_motion;
+
+	/* Result flags; have_motion is copied to dscene->data.bvh afterwards. */
+	bool have_motion;
+	bool have_curves;
+
+	/* ** Scheduling queue. ** */
+
+	Scene *scene;
+
+	/* Some locks to keep everything thread-safe. */
+	thread_spin_lock queue_lock;
+	thread_spin_lock surface_area_lock;
+
+	/* First unused object index in the queue. */
+	int queue_start_object;
+};
+
 /* Object */
 
 NODE_DEFINE(Object)
@@ -48,6 +94,9 @@ NODE_DEFINE(Object)
 	SOCKET_BOOLEAN(hide_on_missing_motion, "Hide on Missing Motion", false);
 	SOCKET_POINT(dupli_generated, "Dupli Generated", make_float3(0.0f, 0.0f, 0.0f));
 	SOCKET_POINT2(dupli_uv, "Dupli UV", make_float2(0.0f, 0.0f));
+	SOCKET_TRANSFORM_ARRAY(motion, "Motion", array<Transform>());
+
+	SOCKET_BOOLEAN(is_shadow_catcher, "Shadow Catcher", false);
 
 	return type;
 }
@@ -58,45 +107,54 @@ Object::Object()
 	particle_system = NULL;
 	particle_index = 0;
 	bounds = BoundBox::empty;
-	motion.pre = transform_empty();
-	motion.mid = transform_empty();
-	motion.post = transform_empty();
-	use_motion = false;
 }
 
 Object::~Object()
 {
 }
 
-void Object::compute_bounds(bool motion_blur)
+void Object::update_motion()
 {
-	BoundBox mbounds = mesh->bounds;
-
-	if(motion_blur && use_motion) {
-		MotionTransform mtfm = motion;
+	if(!use_motion()) {
+		return;
+	}
 
-		if(hide_on_missing_motion) {
-			/* Hide objects that have no valid previous or next transform, for
-			 * example particle that stop existing. TODO: add support for this
-			 * case in the kernel so we don't get render artifacts. */
-			if(mtfm.pre == transform_empty() ||
-			   mtfm.post == transform_empty()) {
-				bounds = BoundBox::empty;
+	bool have_motion = false;
+
+	for(size_t i = 0; i < motion.size(); i++) {
+		if(motion[i] == transform_empty()) {
+			if(hide_on_missing_motion) {
+				/* Hide objects that have no valid previous or next
+				 * transform, for example particles that stop existing. It
+				 * would be better to handle this in the kernel and make
+				 * objects invisible outside certain motion steps. */
+				tfm = transform_empty();
+				motion.clear();
 				return;
 			}
+			else {
+				/* Otherwise just copy center motion. */
+				motion[i] = tfm;
+			}
 		}
 
-		/* In case of missing motion information for previous/next frame,
-		 * assume there is no motion. */
-		if(mtfm.pre == transform_empty()) {
-			mtfm.pre = tfm;
-		}
-		if(mtfm.post == transform_empty()) {
-			mtfm.post = tfm;
-		}
+		/* Test if any of the transforms are actually different. */
+		have_motion = have_motion || motion[i] != tfm;
+	}
+
+	/* Clear motion array if there is no actual motion. */
+	if(!have_motion) {
+		motion.clear();
+	}
+}
+
+void Object::compute_bounds(bool motion_blur)
+{
+	BoundBox mbounds = mesh->bounds;
 
-		DecompMotionTransform decomp;
-		transform_motion_decompose(&decomp, &mtfm, &tfm);
+	if(motion_blur && use_motion()) {
+		array<DecomposedTransform> decomp(motion.size());
+		transform_motion_decompose(decomp.data(), motion.data(), motion.size());
 
 		bounds = BoundBox::empty;
 
@@ -106,11 +164,12 @@ void Object::compute_bounds(bool motion_blur)
 		for(float t = 0.0f; t < 1.0f; t += (1.0f/128.0f)) {
 			Transform ttfm;
 
-			transform_motion_interpolate(&ttfm, &decomp, t);
+			transform_motion_array_interpolate(&ttfm, decomp.data(), motion.size(), t);
 			bounds.grow(mbounds.transformed(&ttfm));
 		}
 	}
 	else {
+		/* No motion blur case. */
 		if(mesh->transform_applied) {
 			bounds = mbounds;
 		}
@@ -130,7 +189,7 @@ void Object::apply_transform(bool apply_to_motion)
 		/* store matrix to transform later. when accessing these as attributes we
 		 * do not want the transform to be applied for consistency between static
 		 * and dynamic BVH, so we do it on packing. */
-		mesh->transform_normal = transform_transpose(transform_inverse(tfm));
+		mesh->transform_normal = transform_transposed_inverse(tfm);
 
 		/* apply to mesh vertices */
 		for(size_t i = 0; i < mesh->verts.size(); i++)
@@ -230,27 +289,30 @@ void Object::tag_update(Scene *scene)
 	scene->object_manager->need_update = true;
 }
 
-vector<float> Object::motion_times()
+bool Object::use_motion() const
 {
-	/* compute times at which we sample motion for this object */
-	vector<float> times;
-
-	if(!mesh || mesh->motion_steps == 1)
-		return times;
+	return (motion.size() > 1);
+}
 
-	int motion_steps = mesh->motion_steps;
+float Object::motion_time(int step) const
+{
+	return (use_motion()) ? 2.0f * step / (motion.size() - 1) - 1.0f : 0.0f;
+}
 
-	for(int step = 0; step < motion_steps; step++) {
-		if(step != motion_steps / 2) {
-			float time = 2.0f * step / (motion_steps - 1) - 1.0f;
-			times.push_back(time);
+int Object::motion_step(float time) const
+{
+	if(use_motion()) {
+		for(size_t step = 0; step < motion.size(); step++) {
+			if(time == motion_time(step)) {
+				return step;
+			}
 		}
 	}
 
-	return times;
+	return -1;
 }
 
-bool Object::is_traceable()
+bool Object::is_traceable() const
 {
 	/* Mesh itself can be empty,can skip all such objects. */
 	if(!bounds.valid() || bounds.size() == make_float3(0.0f, 0.0f, 0.0f)) {
@@ -260,6 +322,17 @@ bool Object::is_traceable()
 	return true;
 }
 
+uint Object::visibility_for_tracing() const {
+	/* Start from the object's own visibility mask and clear exactly one
+	 * PATH_RAY_SHADOW_* mask: shadow catchers clear the NON_CATCHER bits,
+	 * every other object clears the CATCHER bits. */
+	const uint cleared_bits = (is_shadow_catcher)
+	        ? PATH_RAY_SHADOW_NON_CATCHER
+	        : PATH_RAY_SHADOW_CATCHER;
+
+	return visibility & ~cleared_bits;
+}
+
 /* Object Manager */
 
 ObjectManager::ObjectManager()
@@ -272,12 +345,12 @@ ObjectManager::~ObjectManager()
 {
 }
 
-void ObjectManager::device_update_object_transform(UpdateObejctTransformState *state,
+void ObjectManager::device_update_object_transform(UpdateObjectTransformState *state,
                                                    Object *ob,
                                                    int object_index)
 {
-	float4 *objects = state->objects;
-	float4 *objects_vector = state->objects_vector;
+	KernelObject& kobject = state->objects[object_index];
+	Transform *object_motion_pass = state->object_motion_pass;
 
 	Mesh *mesh = ob->mesh;
 	uint flag = 0;
@@ -344,69 +417,72 @@ void ObjectManager::device_update_object_transform(UpdateObejctTransformState *s
 		}
 	}
 
-	/* Pack in texture. */
-	int offset = object_index*OBJECT_SIZE;
+	kobject.tfm = tfm;
+	kobject.itfm = itfm;
+	kobject.surface_area = surface_area;
+	kobject.pass_id = pass_id;
+	kobject.random_number = random_number;
+	kobject.particle_index = particle_index;
+	kobject.motion_offset = 0;
 
-	/* OBJECT_TRANSFORM */
-	memcpy(&objects[offset], &tfm, sizeof(float4)*3);
-	/* OBJECT_INVERSE_TRANSFORM */
-	memcpy(&objects[offset+4], &itfm, sizeof(float4)*3);
-	/* OBJECT_PROPERTIES */
-	objects[offset+8] = make_float4(surface_area, pass_id, random_number, __int_as_float(particle_index));
+	if(mesh->use_motion_blur) {
+		state->have_motion = true;
+	}
+	if(mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) {
+		flag |= SD_OBJECT_HAS_VERTEX_MOTION;
+	}
 
 	if(state->need_motion == Scene::MOTION_PASS) {
-		/* Motion transformations, is world/object space depending if mesh
-		 * comes with deformed position in object space, or if we transform
-		 * the shading point in world space.
-		 */
-		MotionTransform mtfm = ob->motion;
-
-		/* In case of missing motion information for previous/next frame,
-		 * assume there is no motion. */
-		if(!ob->use_motion || mtfm.pre == transform_empty()) {
-			mtfm.pre = ob->tfm;
+		/* Clear motion array if there is no actual motion. */
+		ob->update_motion();
+
+		/* Compute motion transforms. */
+		Transform tfm_pre, tfm_post;
+		if(ob->use_motion()) {
+			tfm_pre = ob->motion[0];
+			tfm_post = ob->motion[ob->motion.size() - 1];
 		}
-		if(!ob->use_motion || mtfm.post == transform_empty()) {
-			mtfm.post = ob->tfm;
+		else {
+			tfm_pre = tfm;
+			tfm_post = tfm;
 		}
 
+		/* Motion transformations, is world/object space depending if mesh
+		 * comes with deformed position in object space, or if we transform
+		 * the shading point in world space. */
 		if(!mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) {
-			mtfm.pre = mtfm.pre * itfm;
-			mtfm.post = mtfm.post * itfm;
-		}
-		else {
-			flag |= SD_OBJECT_HAS_VERTEX_MOTION;
+			tfm_pre = tfm_pre * itfm;
+			tfm_post = tfm_post * itfm;
 		}
 
-		memcpy(&objects_vector[object_index*OBJECT_VECTOR_SIZE+0], &mtfm.pre, sizeof(float4)*3);
-		memcpy(&objects_vector[object_index*OBJECT_VECTOR_SIZE+3], &mtfm.post, sizeof(float4)*3);
+		int motion_pass_offset = object_index*OBJECT_MOTION_PASS_SIZE;
+		object_motion_pass[motion_pass_offset + 0] = tfm_pre;
+		object_motion_pass[motion_pass_offset + 1] = tfm_post;
 	}
-#ifdef __OBJECT_MOTION__
 	else if(state->need_motion == Scene::MOTION_BLUR) {
-		if(ob->use_motion) {
-			/* decompose transformations for interpolation. */
-			DecompMotionTransform decomp;
+		if(ob->use_motion()) {
+			kobject.motion_offset = state->motion_offset[object_index];
 
-			transform_motion_decompose(&decomp, &ob->motion, &ob->tfm);
-			memcpy(&objects[offset], &decomp, sizeof(float4)*8);
+			/* Decompose transforms for interpolation. */
+			DecomposedTransform *decomp = state->object_motion + kobject.motion_offset;
+			transform_motion_decompose(decomp, ob->motion.data(), ob->motion.size());
 			flag |= SD_OBJECT_MOTION;
 			state->have_motion = true;
 		}
 	}
-#endif
-
-	if(mesh->use_motion_blur) {
-		state->have_motion = true;
-	}
 
 	/* Dupli object coords and motion info. */
+	kobject.dupli_generated[0] = ob->dupli_generated[0];
+	kobject.dupli_generated[1] = ob->dupli_generated[1];
+	kobject.dupli_generated[2] = ob->dupli_generated[2];
+	kobject.numkeys = mesh->curve_keys.size();
+	kobject.dupli_uv[0] = ob->dupli_uv[0];
+	kobject.dupli_uv[1] = ob->dupli_uv[1];
 	int totalsteps = mesh->motion_steps;
-	int numsteps = (totalsteps - 1)/2;
-	int numverts = mesh->verts.size();
-	int numkeys = mesh->curve_keys.size();
-
-	objects[offset+9] = make_float4(ob->dupli_generated[0], ob->dupli_generated[1], ob->dupli_generated[2], __int_as_float(numkeys));
-	objects[offset+10] = make_float4(ob->dupli_uv[0], ob->dupli_uv[1], __int_as_float(numsteps), __int_as_float(numverts));
+	kobject.numsteps = (totalsteps - 1)/2;
+	kobject.numverts = mesh->verts.size();
+	kobject.patch_map_offset = 0;
+	kobject.attribute_map_offset = 0;
 
 	/* Object flag. */
 	if(ob->use_holdout) {
@@ -421,7 +497,7 @@ void ObjectManager::device_update_object_transform(UpdateObejctTransformState *s
 }
 
 bool ObjectManager::device_update_object_transform_pop_work(
-        UpdateObejctTransformState *state,
+        UpdateObjectTransformState *state,
         int *start_index,
         int *num_objects)
 {
@@ -446,7 +522,7 @@ bool ObjectManager::device_update_object_transform_pop_work(
 }
 
 void ObjectManager::device_update_object_transform_task(
-        UpdateObejctTransformState *state)
+        UpdateObjectTransformState *state)
 {
 	int start_index, num_objects;
 	while(device_update_object_transform_pop_work(state,
@@ -461,26 +537,40 @@ void ObjectManager::device_update_object_transform_task(
 	}
 }
 
-void ObjectManager::device_update_transforms(Device *device,
-                                             DeviceScene *dscene,
+void ObjectManager::device_update_transforms(DeviceScene *dscene,
                                              Scene *scene,
-                                             uint *object_flag,
                                              Progress& progress)
 {
-	UpdateObejctTransformState state;
-	state.need_motion = scene->need_motion(device->info.advanced_shading);
+	UpdateObjectTransformState state;
+	state.need_motion = scene->need_motion();
 	state.have_motion = false;
 	state.have_curves = false;
 	state.scene = scene;
 	state.queue_start_object = 0;
 
-	state.object_flag = object_flag;
-	state.objects = dscene->objects.resize(OBJECT_SIZE*scene->objects.size());
+	state.objects = dscene->objects.alloc(scene->objects.size());
+	state.object_flag = dscene->object_flag.alloc(scene->objects.size());
+	state.object_motion = NULL;
+	state.object_motion_pass = NULL;
+
 	if(state.need_motion == Scene::MOTION_PASS) {
-		state.objects_vector = dscene->objects_vector.resize(OBJECT_VECTOR_SIZE*scene->objects.size());
+		state.object_motion_pass = dscene->object_motion_pass.alloc(OBJECT_MOTION_PASS_SIZE*scene->objects.size());
 	}
-	else {
-		state.objects_vector = NULL;
+	else if(state.need_motion == Scene::MOTION_BLUR) {
+		/* Set object offsets into global object motion array. */
+		uint *motion_offsets = state.motion_offset.resize(scene->objects.size());
+		uint motion_offset = 0;
+
+		foreach(Object *ob, scene->objects) {
+			*motion_offsets = motion_offset;
+			motion_offsets++;
+
+			/* Clear motion array if there is no actual motion. */
+			ob->update_motion();
+			motion_offset += ob->motion.size();
+		}
+
+		state.object_motion = dscene->object_motion.alloc(motion_offset);
 	}
 
 	/* Particle system device offsets
@@ -521,9 +611,12 @@ void ObjectManager::device_update_transforms(Device *device,
 		}
 	}
 
-	device->tex_alloc("__objects", dscene->objects);
+	dscene->objects.copy_to_device();
 	if(state.need_motion == Scene::MOTION_PASS) {
-		device->tex_alloc("__objects_vector", dscene->objects_vector);
+		dscene->object_motion_pass.copy_to_device();
+	}
+	else if(state.need_motion == Scene::MOTION_BLUR) {
+		dscene->object_motion.copy_to_device();
 	}
 
 	dscene->data.bvh.have_motion = state.have_motion;
@@ -543,12 +636,9 @@ void ObjectManager::device_update(Device *device, DeviceScene *dscene, Scene *sc
 	if(scene->objects.size() == 0)
 		return;
 
-	/* object info flag */
-	uint *object_flag = dscene->object_flag.resize(scene->objects.size());
-
 	/* set object transform matrices, before applying static transforms */
 	progress.set_status("Updating Objects", "Copying Transformations to device");
-	device_update_transforms(device, dscene, scene, object_flag, progress);
+	device_update_transforms(dscene, scene, progress);
 
 	if(progress.get_cancel()) return;
 
@@ -556,11 +646,11 @@ void ObjectManager::device_update(Device *device, DeviceScene *dscene, Scene *sc
 	/* todo: do before to support getting object level coords? */
 	if(scene->params.bvh_type == SceneParams::BVH_STATIC) {
 		progress.set_status("Updating Objects", "Applying Static Transformations");
-		apply_static_transforms(dscene, scene, object_flag, progress);
+		apply_static_transforms(dscene, scene, progress);
 	}
 }
 
-void ObjectManager::device_update_flags(Device *device,
+void ObjectManager::device_update_flags(Device *,
                                         DeviceScene *dscene,
                                         Scene *scene,
                                         Progress& /*progress*/,
@@ -575,9 +665,10 @@ void ObjectManager::device_update_flags(Device *device,
 	if(scene->objects.size() == 0)
 		return;
 
-	/* object info flag */
-	uint *object_flag = dscene->object_flag.get_data();
+	/* Object info flag. */
+	uint *object_flag = dscene->object_flag.data();
 
+	/* Object volume intersection. */
 	vector<Object *> volume_objects;
 	bool has_volume_objects = false;
 	foreach(Object *object, scene->objects) {
@@ -593,9 +684,22 @@ void ObjectManager::device_update_flags(Device *device,
 	foreach(Object *object, scene->objects) {
 		if(object->mesh->has_volume) {
 			object_flag[object_index] |= SD_OBJECT_HAS_VOLUME;
+			object_flag[object_index] &= ~SD_OBJECT_HAS_VOLUME_ATTRIBUTES;
+
+			foreach(Attribute& attr, object->mesh->attributes.attributes) {
+				if(attr.element == ATTR_ELEMENT_VOXEL) {
+					object_flag[object_index] |= SD_OBJECT_HAS_VOLUME_ATTRIBUTES;
+				}
+			}
 		}
 		else {
-			object_flag[object_index] &= ~SD_OBJECT_HAS_VOLUME;
+			object_flag[object_index] &= ~(SD_OBJECT_HAS_VOLUME|SD_OBJECT_HAS_VOLUME_ATTRIBUTES);
+		}
+		if(object->is_shadow_catcher) {
+			object_flag[object_index] |= SD_OBJECT_SHADOW_CATCHER;
+		}
+		else {
+			object_flag[object_index] &= ~SD_OBJECT_SHADOW_CATCHER;
 		}
 
 		if(bounds_valid) {
@@ -618,72 +722,65 @@ void ObjectManager::device_update_flags(Device *device,
 		++object_index;
 	}
 
-	/* allocate object flag */
-	device->tex_alloc("__object_flag", dscene->object_flag);
+	/* Copy object flag. */
+	dscene->object_flag.copy_to_device();
 }
 
-void ObjectManager::device_update_patch_map_offsets(Device *device, DeviceScene *dscene, Scene *scene)
+void ObjectManager::device_update_mesh_offsets(Device *, DeviceScene *dscene, Scene *scene)
 {
-	if(scene->objects.size() == 0) {
+	if(dscene->objects.size() == 0) {
 		return;
 	}
 
-	uint4* objects = (uint4*)dscene->objects.get_data();
+	KernelObject *kobjects = dscene->objects.data();
 
 	bool update = false;
-
 	int object_index = 0;
-	foreach(Object *object, scene->objects) {
-		int offset = object_index*OBJECT_SIZE + 11;
 
+	foreach(Object *object, scene->objects) {
 		Mesh* mesh = object->mesh;
 
 		if(mesh->patch_table) {
 			uint patch_map_offset = 2*(mesh->patch_table_offset + mesh->patch_table->total_size() -
 			                           mesh->patch_table->num_nodes * PATCH_NODE_SIZE) - mesh->patch_offset;
 
-			if(objects[offset].x != patch_map_offset) {
-				objects[offset].x = patch_map_offset;
+			if(kobjects[object_index].patch_map_offset != patch_map_offset) {
+				kobjects[object_index].patch_map_offset = patch_map_offset;
 				update = true;
 			}
 		}
 
+		if(kobjects[object_index].attribute_map_offset != mesh->attr_map_offset) {
+			kobjects[object_index].attribute_map_offset = mesh->attr_map_offset;
+			update = true;
+		}
+
 		object_index++;
 	}
 
 	if(update) {
-		device->tex_free(dscene->objects);
-		device->tex_alloc("__objects", dscene->objects);
+		dscene->objects.copy_to_device();
 	}
 }
 
-void ObjectManager::device_free(Device *device, DeviceScene *dscene)
+void ObjectManager::device_free(Device *, DeviceScene *dscene)
 {
-	device->tex_free(dscene->objects);
-	dscene->objects.clear();
-
-	device->tex_free(dscene->objects_vector);
-	dscene->objects_vector.clear();
-
-	device->tex_free(dscene->object_flag);
-	dscene->object_flag.clear();
+	dscene->objects.free();
+	dscene->object_motion_pass.free();
+	dscene->object_motion.free();
+	dscene->object_flag.free();
 }
 
-void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, uint *object_flag, Progress& progress)
+void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, Progress& progress)
 {
 	/* todo: normals and displacement should be done before applying transform! */
 	/* todo: create objects/meshes in right order! */
 
 	/* counter mesh users */
 	map<Mesh*, int> mesh_users;
-#ifdef __OBJECT_MOTION__
 	Scene::MotionType need_motion = scene->need_motion();
 	bool motion_blur = need_motion == Scene::MOTION_BLUR;
 	bool apply_to_motion = need_motion != Scene::MOTION_PASS;
-#else
-	bool motion_blur = false;
-	bool apply_to_motion = false;
-#endif
 	int i = 0;
 	bool have_instancing = false;
 
@@ -698,6 +795,8 @@ void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, u
 
 	if(progress.get_cancel()) return;
 
+	uint *object_flag = dscene->object_flag.data();
+
 	/* apply transforms for objects with single user meshes */
 	foreach(Object *object, scene->objects) {
 		/* Annoying feedback loop here: we can't use is_instanced() because
@@ -708,7 +807,7 @@ void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, u
 		if((mesh_users[object->mesh] == 1 && !object->mesh->has_surface_bssrdf) &&
 		   !object->mesh->has_true_displacement() && object->mesh->subdivision_type == Mesh::SUBDIVISION_NONE)
 		{
-			if(!(motion_blur && object->use_motion)) {
+			if(!(motion_blur && object->use_motion())) {
 				if(!object->mesh->transform_applied) {
 					object->apply_transform(apply_to_motion);
 					object->mesh->transform_applied = true;
diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h
index 7e306fab2a8..c7212ae25f9 100644
--- a/intern/cycles/render/object.h
+++ b/intern/cycles/render/object.h
@@ -17,14 +17,14 @@
 #ifndef __OBJECT_H__
 #define __OBJECT_H__
 
-#include "node.h"
-#include "scene.h"
+#include "graph/node.h"
+#include "render/scene.h"
 
-#include "util_boundbox.h"
-#include "util_param.h"
-#include "util_transform.h"
-#include "util_thread.h"
-#include "util_types.h"
+#include "util/util_boundbox.h"
+#include "util/util_param.h"
+#include "util/util_transform.h"
+#include "util/util_thread.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -35,12 +35,13 @@ class ParticleSystem;
 class Progress;
 class Scene;
 struct Transform;
+struct UpdateObjectTransformState;
 
 /* Object */
 
 class Object : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	Mesh *mesh;
 	Transform tfm;
@@ -49,17 +50,17 @@ public:
 	int pass_id;
 	vector<ParamValue> attributes;
 	uint visibility;
-	MotionTransform motion;
-	bool use_motion;
+	array<Transform> motion;
 	bool hide_on_missing_motion;
 	bool use_holdout;
+	bool is_shadow_catcher;
 
 	float3 dupli_generated;
 	float2 dupli_uv;
 
 	ParticleSystem *particle_system;
 	int particle_index;
-	
+
 	Object();
 	~Object();
 
@@ -68,12 +69,22 @@ public:
 	void compute_bounds(bool motion_blur);
 	void apply_transform(bool apply_to_motion);
 
-	vector<float> motion_times();
+	/* Convert between normalized -1..1 motion time and index
+	 * in the motion array. */
+	bool use_motion() const;
+	float motion_time(int step) const;
+	int motion_step(float time) const;
+	void update_motion();
 
 	/* Check whether object is traceable and it worth adding it to
 	 * kernel scene.
 	 */
-	bool is_traceable();
+	bool is_traceable() const;
+
+	/* Combine object's visibility with all possible internal run-time
+	 * determined flags which denote trace-time visibility.
+	 */
+	uint visibility_for_tracing() const;
 };
 
 /* Object Manager */
@@ -87,10 +98,8 @@ public:
 	~ObjectManager();
 
 	void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
-	void device_update_transforms(Device *device,
-	                              DeviceScene *dscene,
+	void device_update_transforms(DeviceScene *dscene,
 	                              Scene *scene,
-	                              uint *object_flag,
 	                              Progress& progress);
 
 	void device_update_flags(Device *device,
@@ -98,61 +107,21 @@ public:
 	                         Scene *scene,
 	                         Progress& progress,
 	                         bool bounds_valid = true);
-	void device_update_patch_map_offsets(Device *device, DeviceScene *dscene, Scene *scene);
+	void device_update_mesh_offsets(Device *device, DeviceScene *dscene, Scene *scene);
 
 	void device_free(Device *device, DeviceScene *dscene);
 
 	void tag_update(Scene *scene);
 
-	void apply_static_transforms(DeviceScene *dscene, Scene *scene, uint *object_flag, Progress& progress);
+	void apply_static_transforms(DeviceScene *dscene, Scene *scene, Progress& progress);
 
 protected:
-	/* Global state of object transform update. */
-	struct UpdateObejctTransformState {
-		/* Global state used by device_update_object_transform().
-		 * Common for both threaded and non-threaded update.
-		 */
-
-		/* Type of the motion required by the scene settings. */
-		Scene::MotionType need_motion;
-
-		/* Mapping from particle system to a index in packed particle array.
-		 * Only used for read.
-		 */
-		map<ParticleSystem*, int> particle_offset;
-
-		/* Mesh area.
-		 * Used to avoid calculation of mesh area multiple times. Used for both
-		 * read and write. Acquire surface_area_lock to keep it all thread safe.
-		 */
-		map<Mesh*, float> surface_area_map;
-
-		/* Packed object arrays. Those will be filled in. */
-		uint *object_flag;
-		float4 *objects;
-		float4 *objects_vector;
-
-		/* Flags which will be synchronized to Integrator. */
-		bool have_motion;
-		bool have_curves;
-
-		/* ** Scheduling queue. ** */
-
-		Scene *scene;
-
-		/* Some locks to keep everything thread-safe. */
-		thread_spin_lock queue_lock;
-		thread_spin_lock surface_area_lock;
-
-		/* First unused object index in the queue. */
-		int queue_start_object;
-	};
-	void device_update_object_transform(UpdateObejctTransformState *state,
+	void device_update_object_transform(UpdateObjectTransformState *state,
 	                                    Object *ob,
 	                                    const int object_index);
-	void device_update_object_transform_task(UpdateObejctTransformState *state);
+	void device_update_object_transform_task(UpdateObjectTransformState *state);
 	bool device_update_object_transform_pop_work(
-	        UpdateObejctTransformState *state,
+	        UpdateObjectTransformState *state,
 	        int *start_index,
 	        int *num_objects);
 };
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index 67b68e63cb2..f1a22350060 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -14,26 +14,27 @@
  * limitations under the License.
  */
 
-#include "device.h"
+#include "device/device.h"
 
-#include "graph.h"
-#include "light.h"
-#include "osl.h"
-#include "scene.h"
-#include "shader.h"
-#include "nodes.h"
+#include "render/graph.h"
+#include "render/light.h"
+#include "render/osl.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/nodes.h"
 
 #ifdef WITH_OSL
 
-#include "osl_globals.h"
-#include "osl_services.h"
-#include "osl_shader.h"
+#include "kernel/osl/osl_globals.h"
+#include "kernel/osl/osl_services.h"
+#include "kernel/osl/osl_shader.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_md5.h"
-#include "util_path.h"
-#include "util_progress.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_progress.h"
+#include "util/util_projection.h"
 
 #endif
 
@@ -156,6 +157,7 @@ void OSLShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *s
 	og->surface_state.clear();
 	og->volume_state.clear();
 	og->displacement_state.clear();
+	og->bump_state.clear();
 	og->background_state.reset();
 }
 
@@ -232,16 +234,25 @@ void OSLShaderManager::shading_system_init()
 			"glossy",			/* PATH_RAY_GLOSSY */
 			"singular",			/* PATH_RAY_SINGULAR */
 			"transparent",		/* PATH_RAY_TRANSPARENT */
-			"shadow",			/* PATH_RAY_SHADOW_OPAQUE */
-			"shadow",			/* PATH_RAY_SHADOW_TRANSPARENT */
+
+			"shadow",			/* PATH_RAY_SHADOW_OPAQUE_NON_CATCHER */
+			"shadow",			/* PATH_RAY_SHADOW_OPAQUE_CATCHER */
+			"shadow",			/* PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER */
+			"shadow",			/* PATH_RAY_SHADOW_TRANSPARENT_CATCHER */
 
 			"__unused__",
+			"volume_scatter",	/* PATH_RAY_VOLUME_SCATTER */
+			"__unused__",
+
 			"__unused__",
 			"diffuse_ancestor",	/* PATH_RAY_DIFFUSE_ANCESTOR */
 			"__unused__",
 			"__unused__",
-			"__unused__",		/* PATH_RAY_SINGLE_PASS_DONE */
-			"volume_scatter",	/* PATH_RAY_VOLUME_SCATTER */
+			"__unused__",
+			"__unused__",
+			"__unused__",
+			"__unused__",
+			"__unused__",
 		};
 
 		const int nraytypes = sizeof(raytypes)/sizeof(raytypes[0]);
@@ -718,6 +729,7 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath)
 				current_shader->has_surface_bssrdf = true;
 				current_shader->has_bssrdf_bump = true; /* can't detect yet */
 			}
+			current_shader->has_bump = true; /* can't detect yet */
 		}
 
 		if(node->has_spatial_varying()) {
@@ -733,6 +745,10 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath)
 		current_shader->has_object_dependency = true;
 	}
 
+	if(node->has_attribute_dependency()) {
+		current_shader->has_attribute_dependency = true;
+	}
+
 	if(node->has_integrator_dependency()) {
 		current_shader->has_integrator_dependency = true;
 	}
@@ -817,7 +833,9 @@ void OSLCompiler::parameter(ShaderNode* node, const char *name)
 		case SocketType::TRANSFORM:
 		{
 			Transform value = node->get_transform(socket);
-			ss->Parameter(uname, TypeDesc::TypeMatrix, &value);
+			ProjectionTransform projection(value);
+			projection = projection_transpose(projection);
+			ss->Parameter(uname, TypeDesc::TypeMatrix, &projection);
 			break;
 		}
 		case SocketType::BOOLEAN_ARRAY:
@@ -885,7 +903,11 @@ void OSLCompiler::parameter(ShaderNode* node, const char *name)
 		case SocketType::TRANSFORM_ARRAY:
 		{
 			const array<Transform>& value = node->get_transform_array(socket);
-			ss->Parameter(uname, array_typedesc(TypeDesc::TypeMatrix, value.size()), value.data());
+			array<ProjectionTransform> fvalue(value.size());
+			for(size_t i = 0; i < value.size(); i++) {
+				fvalue[i] = projection_transpose(ProjectionTransform(value[i]));
+			}
+			ss->Parameter(uname, array_typedesc(TypeDesc::TypeMatrix, fvalue.size()), fvalue.data());
 			break;
 		}
 		case SocketType::CLOSURE:
@@ -952,7 +974,9 @@ void OSLCompiler::parameter(const char *name, ustring s)
 void OSLCompiler::parameter(const char *name, const Transform& tfm)
 {
 	OSL::ShadingSystem *ss = (OSL::ShadingSystem*)shadingsys;
-	ss->Parameter(name, TypeDesc::TypeMatrix, (float*)&tfm);
+	ProjectionTransform projection(tfm);
+	projection = projection_transpose(projection);
+	ss->Parameter(name, TypeDesc::TypeMatrix, (float*)&projection);
 }
 
 void OSLCompiler::parameter_array(const char *name, const float f[], int arraylen)
@@ -980,6 +1004,14 @@ void OSLCompiler::parameter_color_array(const char *name, const array<float3>& f
 	ss->Parameter(name, type, table.data());
 }
 
+void OSLCompiler::parameter_attribute(const char *name, ustring s)
+{
+	if(Attribute::name_standard(s.c_str()))
+		parameter(name, (string("geom:") + s.c_str()).c_str());
+	else
+		parameter(name, s.c_str());
+}
+
 void OSLCompiler::find_dependencies(ShaderNodeSet& dependencies, ShaderInput *input)
 {
 	ShaderNode *node = (input->link)? input->link->parent: NULL;
@@ -1026,6 +1058,9 @@ void OSLCompiler::generate_nodes(const ShaderNodeSet& nodes)
 							if(node->has_bssrdf_bump())
 								current_shader->has_bssrdf_bump = true;
 						}
+						if(node->has_bump()) {
+							current_shader->has_bump = true;
+						}
 					}
 					else if(current_type == SHADER_TYPE_VOLUME) {
 						if(node->has_spatial_varying())
@@ -1088,23 +1123,14 @@ void OSLCompiler::compile(Scene *scene, OSLGlobals *og, Shader *shader)
 		ShaderGraph *graph = shader->graph;
 		ShaderNode *output = (graph)? graph->output(): NULL;
 
-		/* copy graph for shader with bump mapping */
-		if(output->input("Surface")->link && output->input("Displacement")->link)
-			if(!shader->graph_bump)
-				shader->graph_bump = shader->graph->copy();
+		bool has_bump = (shader->displacement_method != DISPLACE_TRUE) &&
+		                output->input("Surface")->link && output->input("Displacement")->link;
 
 		/* finalize */
 		shader->graph->finalize(scene,
-		                        false,
-		                        true,
-		                        shader->has_integrator_dependency);
-		if(shader->graph_bump) {
-			shader->graph_bump->finalize(scene,
-			                             true,
-			                             true,
-			                             shader->has_integrator_dependency,
-			                             shader->displacement_method == DISPLACE_BOTH);
-		}
+		                        has_bump,
+		                        shader->has_integrator_dependency,
+		                        shader->displacement_method == DISPLACE_BOTH);
 
 		current_shader = shader;
 
@@ -1112,20 +1138,22 @@ void OSLCompiler::compile(Scene *scene, OSLGlobals *og, Shader *shader)
 		shader->has_surface_emission = false;
 		shader->has_surface_transparent = false;
 		shader->has_surface_bssrdf = false;
-		shader->has_bssrdf_bump = false;
+		shader->has_bump = has_bump;
+		shader->has_bssrdf_bump = has_bump;
 		shader->has_volume = false;
 		shader->has_displacement = false;
 		shader->has_surface_spatial_varying = false;
 		shader->has_volume_spatial_varying = false;
 		shader->has_object_dependency = false;
+		shader->has_attribute_dependency = false;
 		shader->has_integrator_dependency = false;
 
 		/* generate surface shader */
 		if(shader->used && graph && output->input("Surface")->link) {
 			shader->osl_surface_ref = compile_type(shader, shader->graph, SHADER_TYPE_SURFACE);
 
-			if(shader->graph_bump && shader->displacement_method != DISPLACE_TRUE)
-				shader->osl_surface_bump_ref = compile_type(shader, shader->graph_bump, SHADER_TYPE_BUMP);
+			if(has_bump)
+				shader->osl_surface_bump_ref = compile_type(shader, shader->graph, SHADER_TYPE_BUMP);
 			else
 				shader->osl_surface_bump_ref = OSL::ShaderGroupRef();
 
diff --git a/intern/cycles/render/osl.h b/intern/cycles/render/osl.h
index b131b672b8c..95e35dd857b 100644
--- a/intern/cycles/render/osl.h
+++ b/intern/cycles/render/osl.h
@@ -17,13 +17,13 @@
 #ifndef __OSL_H__
 #define __OSL_H__
 
-#include "util_set.h"
-#include "util_string.h"
-#include "util_thread.h"
+#include "util/util_set.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
 
-#include "graph.h"
-#include "nodes.h"
-#include "shader.h"
+#include "render/graph.h"
+#include "render/nodes.h"
+#include "render/shader.h"
 
 #ifdef WITH_OSL
 #include <OSL/oslcomp.h>
@@ -140,6 +140,8 @@ public:
 	void parameter_array(const char *name, const float f[], int arraylen);
 	void parameter_color_array(const char *name, const array<float3>& f);
 
+	void parameter_attribute(const char *name, ustring s);
+
 	ShaderType output_type() { return current_type; }
 
 	bool background;
diff --git a/intern/cycles/render/particles.cpp b/intern/cycles/render/particles.cpp
index 1a35d60fb4b..e4be3306d7e 100644
--- a/intern/cycles/render/particles.cpp
+++ b/intern/cycles/render/particles.cpp
@@ -14,15 +14,16 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "particles.h"
-#include "scene.h"
+#include "device/device.h"
+#include "render/particles.h"
+#include "render/scene.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_progress.h"
-#include "util_vector.h"
+#include "util/util_foreach.h"
+#include "util/util_hash.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_progress.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -52,7 +53,7 @@ ParticleSystemManager::~ParticleSystemManager()
 {
 }
 
-void ParticleSystemManager::device_update_particles(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
+void ParticleSystemManager::device_update_particles(Device *, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
 	/* count particles.
 	 * adds one dummy particle at the beginning to avoid invalid lookups,
@@ -61,14 +62,10 @@ void ParticleSystemManager::device_update_particles(Device *device, DeviceScene
 	for(size_t j = 0; j < scene->particle_systems.size(); j++)
 		num_particles += scene->particle_systems[j]->particles.size();
 	
-	float4 *particles = dscene->particles.resize(PARTICLE_SIZE*num_particles);
+	KernelParticle *kparticles = dscene->particles.alloc(num_particles);
 	
 	/* dummy particle */
-	particles[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-	particles[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-	particles[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-	particles[3] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-	particles[4] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+	memset(kparticles, 0, sizeof(KernelParticle));
 	
 	int i = 1;
 	for(size_t j = 0; j < scene->particle_systems.size(); j++) {
@@ -77,13 +74,15 @@ void ParticleSystemManager::device_update_particles(Device *device, DeviceScene
 		for(size_t k = 0; k < psys->particles.size(); k++) {
 			/* pack in texture */
 			Particle& pa = psys->particles[k];
-			int offset = i*PARTICLE_SIZE;
 			
-			particles[offset] = make_float4(pa.index, pa.age, pa.lifetime, pa.size);
-			particles[offset+1] = pa.rotation;
-			particles[offset+2] = make_float4(pa.location.x, pa.location.y, pa.location.z, pa.velocity.x);
-			particles[offset+3] = make_float4(pa.velocity.y, pa.velocity.z, pa.angular_velocity.x, pa.angular_velocity.y);
-			particles[offset+4] = make_float4(pa.angular_velocity.z, 0.0f, 0.0f, 0.0f);
+			kparticles[i].index = pa.index;
+			kparticles[i].age = pa.age;
+			kparticles[i].lifetime = pa.lifetime;
+			kparticles[i].size = pa.size;
+			kparticles[i].rotation = pa.rotation;
+			kparticles[i].location = float3_to_float4(pa.location);
+			kparticles[i].velocity = float3_to_float4(pa.velocity);
+			kparticles[i].angular_velocity = float3_to_float4(pa.angular_velocity);
 			
 			i++;
 			
@@ -91,7 +90,7 @@ void ParticleSystemManager::device_update_particles(Device *device, DeviceScene
 		}
 	}
 	
-	device->tex_alloc("__particles", dscene->particles);
+	dscene->particles.copy_to_device();
 }
 
 void ParticleSystemManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
@@ -112,10 +111,9 @@ void ParticleSystemManager::device_update(Device *device, DeviceScene *dscene, S
 	need_update = false;
 }
 
-void ParticleSystemManager::device_free(Device *device, DeviceScene *dscene)
+void ParticleSystemManager::device_free(Device *, DeviceScene *dscene)
 {
-	device->tex_free(dscene->particles);
-	dscene->particles.clear();
+	dscene->particles.free();
 }
 
 void ParticleSystemManager::tag_update(Scene * /*scene*/)
diff --git a/intern/cycles/render/particles.h b/intern/cycles/render/particles.h
index 2509e27b44b..66d46114b3e 100644
--- a/intern/cycles/render/particles.h
+++ b/intern/cycles/render/particles.h
@@ -17,8 +17,8 @@
 #ifndef __PARTICLES_H__
 #define __PARTICLES_H__
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index 68124e78cb5..ba47e3ab6f8 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -16,37 +16,78 @@
 
 #include <stdlib.h>
 
-#include "background.h"
-#include "bake.h"
-#include "camera.h"
-#include "curves.h"
-#include "device.h"
-#include "film.h"
-#include "integrator.h"
-#include "light.h"
-#include "mesh.h"
-#include "object.h"
-#include "osl.h"
-#include "particles.h"
-#include "scene.h"
-#include "shader.h"
-#include "svm.h"
-#include "tables.h"
-
-#include "util_foreach.h"
-#include "util_guarded_allocator.h"
-#include "util_logging.h"
-#include "util_progress.h"
+#include "render/background.h"
+#include "render/bake.h"
+#include "render/camera.h"
+#include "render/curves.h"
+#include "device/device.h"
+#include "render/film.h"
+#include "render/integrator.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/osl.h"
+#include "render/particles.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/svm.h"
+#include "render/tables.h"
+
+#include "util/util_foreach.h"
+#include "util/util_guarded_allocator.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
 
 CCL_NAMESPACE_BEGIN
 
-Scene::Scene(const SceneParams& params_, const DeviceInfo& device_info_)
-: params(params_)
+DeviceScene::DeviceScene(Device *device)
+: bvh_nodes(device, "__bvh_nodes", MEM_TEXTURE),
+  bvh_leaf_nodes(device, "__bvh_leaf_nodes", MEM_TEXTURE),
+  object_node(device, "__object_node", MEM_TEXTURE),
+  prim_tri_index(device, "__prim_tri_index", MEM_TEXTURE),
+  prim_tri_verts(device, "__prim_tri_verts", MEM_TEXTURE),
+  prim_type(device, "__prim_type", MEM_TEXTURE),
+  prim_visibility(device, "__prim_visibility", MEM_TEXTURE),
+  prim_index(device, "__prim_index", MEM_TEXTURE),
+  prim_object(device, "__prim_object", MEM_TEXTURE),
+  prim_time(device, "__prim_time", MEM_TEXTURE),
+  tri_shader(device, "__tri_shader", MEM_TEXTURE),
+  tri_vnormal(device, "__tri_vnormal", MEM_TEXTURE),
+  tri_vindex(device, "__tri_vindex", MEM_TEXTURE),
+  tri_patch(device, "__tri_patch", MEM_TEXTURE),
+  tri_patch_uv(device, "__tri_patch_uv", MEM_TEXTURE),
+  curves(device, "__curves", MEM_TEXTURE),
+  curve_keys(device, "__curve_keys", MEM_TEXTURE),
+  patches(device, "__patches", MEM_TEXTURE),
+  objects(device, "__objects", MEM_TEXTURE),
+  object_motion_pass(device, "__object_motion_pass", MEM_TEXTURE),
+  object_motion(device, "__object_motion", MEM_TEXTURE),
+  object_flag(device, "__object_flag", MEM_TEXTURE),
+  camera_motion(device, "__camera_motion", MEM_TEXTURE),
+  attributes_map(device, "__attributes_map", MEM_TEXTURE),
+  attributes_float(device, "__attributes_float", MEM_TEXTURE),
+  attributes_float3(device, "__attributes_float3", MEM_TEXTURE),
+  attributes_uchar4(device, "__attributes_uchar4", MEM_TEXTURE),
+  light_distribution(device, "__light_distribution", MEM_TEXTURE),
+  lights(device, "__lights", MEM_TEXTURE),
+  light_background_marginal_cdf(device, "__light_background_marginal_cdf", MEM_TEXTURE),
+  light_background_conditional_cdf(device, "__light_background_conditional_cdf", MEM_TEXTURE),
+  particles(device, "__particles", MEM_TEXTURE),
+  svm_nodes(device, "__svm_nodes", MEM_TEXTURE),
+  shaders(device, "__shaders", MEM_TEXTURE),
+  lookup_table(device, "__lookup_table", MEM_TEXTURE),
+  sobol_directions(device, "__sobol_directions", MEM_TEXTURE)
+{
+	memset(&data, 0, sizeof(data));
+}
+
+Scene::Scene(const SceneParams& params_, Device *device)
+: device(device), dscene(device), params(params_)
 {
-	device = NULL;
 	memset(&dscene.data, 0, sizeof(dscene.data));
 
 	camera = new Camera();
+	dicing_camera = new Camera();
 	lookup_tables = new LookupTables();
 	film = new Film();
 	background = new Background();
@@ -54,13 +95,13 @@ Scene::Scene(const SceneParams& params_, const DeviceInfo& device_info_)
 	mesh_manager = new MeshManager();
 	object_manager = new ObjectManager();
 	integrator = new Integrator();
-	image_manager = new ImageManager(device_info_);
+	image_manager = new ImageManager(device->info);
 	particle_system_manager = new ParticleSystemManager();
 	curve_system_manager = new CurveSystemManager();
 	bake_manager = new BakeManager();
 
 	/* OSL only works on the CPU */
-	if(device_info_.type == DEVICE_CPU)
+	if(device->info.has_osl)
 		shader_manager = ShaderManager::create(this, params.shadingsystem);
 	else
 		shader_manager = ShaderManager::create(this, SHADINGSYSTEM_SVM);
@@ -107,9 +148,9 @@ void Scene::free_memory(bool final)
 		bake_manager->device_free(device, &dscene);
 
 		if(!params.persistent_data || final)
-			image_manager->device_free(device, &dscene);
+			image_manager->device_free(device);
 		else
-			image_manager->device_free_builtin(device, &dscene);
+			image_manager->device_free_builtin(device);
 
 		lookup_tables->device_free(device, &dscene);
 	}
@@ -117,6 +158,7 @@ void Scene::free_memory(bool final)
 	if(final) {
 		delete lookup_tables;
 		delete camera;
+		delete dicing_camera;
 		delete film;
 		delete background;
 		delete integrator;
@@ -148,8 +190,6 @@ void Scene::device_update(Device *device_, Progress& progress)
 	 * - Film needs light manager to run for use_light_visibility
 	 * - Lookup tables are done a second time to handle film tables
 	 */
-	
-	image_manager->set_pack_images(device->info.pack_images);
 
 	progress.set_status("Updating Shaders");
 	shader_manager->device_update(device, &dscene, this, progress);
@@ -166,8 +206,7 @@ void Scene::device_update(Device *device_, Progress& progress)
 
 	if(progress.get_cancel() || device->have_error()) return;
 
-	progress.set_status("Updating Meshes Flags");
-	mesh_manager->device_update_flags(device, &dscene, this, progress);
+	mesh_manager->device_update_preprocess(device, this, progress);
 
 	if(progress.get_cancel() || device->have_error()) return;
 
@@ -176,6 +215,11 @@ void Scene::device_update(Device *device_, Progress& progress)
 
 	if(progress.get_cancel() || device->have_error()) return;
 
+	progress.set_status("Updating Particle Systems");
+	particle_system_manager->device_update(device, &dscene, this, progress);
+
+	if(progress.get_cancel() || device->have_error()) return;
+
 	progress.set_status("Updating Meshes");
 	mesh_manager->device_update(device, &dscene, this, progress);
 
@@ -187,7 +231,7 @@ void Scene::device_update(Device *device_, Progress& progress)
 	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Images");
-	image_manager->device_update(device, &dscene, this, progress);
+	image_manager->device_update(device, this, progress);
 
 	if(progress.get_cancel() || device->have_error()) return;
 
@@ -211,11 +255,6 @@ void Scene::device_update(Device *device_, Progress& progress)
 
 	if(progress.get_cancel() || device->have_error()) return;
 
-	progress.set_status("Updating Particle Systems");
-	particle_system_manager->device_update(device, &dscene, this, progress);
-
-	if(progress.get_cancel() || device->have_error()) return;
-
 	progress.set_status("Updating Integrator");
 	integrator->device_update(device, &dscene, this);
 
@@ -253,10 +292,10 @@ void Scene::device_update(Device *device_, Progress& progress)
 	}
 }
 
-Scene::MotionType Scene::need_motion(bool advanced_shading)
+Scene::MotionType Scene::need_motion()
 {
 	if(integrator->motion_blur)
-		return (advanced_shading)? MOTION_BLUR: MOTION_NONE;
+		return MOTION_BLUR;
 	else if(Pass::contains(film->passes, PASS_MOTION))
 		return MOTION_PASS;
 	else
@@ -323,6 +362,7 @@ void Scene::reset()
 
 	/* ensure all objects are updated */
 	camera->tag_update();
+	dicing_camera->tag_update();
 	film->tag_update(this);
 	background->tag_update(this);
 	integrator->tag_update(this);
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index 9f398c444f4..04bd4735a86 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -17,18 +17,20 @@
 #ifndef __SCENE_H__
 #define __SCENE_H__
 
-#include "image.h"
-#include "shader.h"
+#include "bvh/bvh_params.h"
 
-#include "device_memory.h"
+#include "render/image.h"
+#include "render/shader.h"
 
-#include "util_param.h"
-#include "util_string.h"
-#include "util_system.h"
-#include "util_texture.h"
-#include "util_thread.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "device/device_memory.h"
+
+#include "util/util_param.h"
+#include "util/util_string.h"
+#include "util/util_system.h"
+#include "util/util_texture.h"
+#include "util/util_thread.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -60,15 +62,15 @@ class BakeData;
 class DeviceScene {
 public:
 	/* BVH */
-	device_vector<float4> bvh_nodes;
-	device_vector<float4> bvh_leaf_nodes;
-	device_vector<uint> object_node;
+	device_vector<int4> bvh_nodes;
+	device_vector<int4> bvh_leaf_nodes;
+	device_vector<int> object_node;
 	device_vector<uint> prim_tri_index;
 	device_vector<float4> prim_tri_verts;
-	device_vector<uint> prim_type;
+	device_vector<int> prim_type;
 	device_vector<uint> prim_visibility;
-	device_vector<uint> prim_index;
-	device_vector<uint> prim_object;
+	device_vector<int> prim_index;
+	device_vector<int> prim_object;
 	device_vector<float2> prim_time;
 
 	/* mesh */
@@ -84,8 +86,13 @@ public:
 	device_vector<uint> patches;
 
 	/* objects */
-	device_vector<float4> objects;
-	device_vector<float4> objects_vector;
+	device_vector<KernelObject> objects;
+	device_vector<Transform> object_motion_pass;
+	device_vector<DecomposedTransform> object_motion;
+	device_vector<uint> object_flag;
+
+	/* cameras */
+	device_vector<DecomposedTransform> camera_motion;
 
 	/* attributes */
 	device_vector<uint4> attributes_map;
@@ -94,18 +101,17 @@ public:
 	device_vector<uchar4> attributes_uchar4;
 
 	/* lights */
-	device_vector<float4> light_distribution;
-	device_vector<float4> light_data;
+	device_vector<KernelLightDistribution> light_distribution;
+	device_vector<KernelLight> lights;
 	device_vector<float2> light_background_marginal_cdf;
 	device_vector<float2> light_background_conditional_cdf;
 
 	/* particles */
-	device_vector<float4> particles;
+	device_vector<KernelParticle> particles;
 
 	/* shaders */
-	device_vector<uint4> svm_nodes;
-	device_vector<uint> shader_flag;
-	device_vector<uint> object_flag;
+	device_vector<int4> svm_nodes;
+	device_vector<KernelShader> shaders;
 
 	/* lookup tables */
 	device_vector<float> lookup_table;
@@ -113,61 +119,72 @@ public:
 	/* integrator */
 	device_vector<uint> sobol_directions;
 
-	/* cpu images */
-	device_vector<uchar4> tex_byte4_image[TEX_NUM_BYTE4_CPU];
-	device_vector<float4> tex_float4_image[TEX_NUM_FLOAT4_CPU];
-	device_vector<float> tex_float_image[TEX_NUM_FLOAT_CPU];
-	device_vector<uchar> tex_byte_image[TEX_NUM_BYTE_CPU];
-	device_vector<half4> tex_half4_image[TEX_NUM_HALF4_CPU];
-	device_vector<half> tex_half_image[TEX_NUM_HALF_CPU];
-
-	/* opencl images */
-	device_vector<uchar4> tex_image_byte4_packed;
-	device_vector<float4> tex_image_float4_packed;
-	device_vector<uchar> tex_image_byte_packed;
-	device_vector<float> tex_image_float_packed;
-	device_vector<uint4> tex_image_packed_info;
-
 	KernelData data;
+
+	DeviceScene(Device *device);
 };
 
 /* Scene Parameters */
 
 class SceneParams {
 public:
-	ShadingSystem shadingsystem;
+	/* Type of BVH, in terms of whether it supports dynamic updates of meshes
+	 * or whether modifying geometry requires a full BVH rebuild.
+	 */
 	enum BVHType {
+		/* BVH supports dynamic updates of geometry.
+		 *
+		 * Faster for updating BVH tree when doing modifications in viewport,
+		 * but slower for rendering.
+		 */
 		BVH_DYNAMIC = 0,
+		/* BVH tree is calculated for a specific scene; updates in geometry
+		 * require a full tree rebuild.
+		 *
+		 * Slower to update the BVH tree when modifying objects in viewport, and
+		 * slower to build the final BVH tree, but gives best possible render speed.
+		 */
 		BVH_STATIC = 1,
 
 		BVH_NUM_TYPES,
-	} bvh_type;
+	};
+
+	ShadingSystem shadingsystem;
+
+	/* Requested BVH layout.
+	 *
+	 * If it is not supported by the device, the widest supported layout is
+	 * used instead, but a BVH wider than the requested one is never used.
+	 */
+	BVHLayout bvh_layout;
+
+	BVHType bvh_type;
 	bool use_bvh_spatial_split;
 	bool use_bvh_unaligned_nodes;
 	int num_bvh_time_steps;
-	bool use_qbvh;
+
 	bool persistent_data;
 	int texture_limit;
 
 	SceneParams()
 	{
 		shadingsystem = SHADINGSYSTEM_SVM;
+		bvh_layout = BVH_LAYOUT_BVH2;
 		bvh_type = BVH_DYNAMIC;
 		use_bvh_spatial_split = false;
 		use_bvh_unaligned_nodes = true;
 		num_bvh_time_steps = 0;
-		use_qbvh = false;
 		persistent_data = false;
 		texture_limit = 0;
 	}
 
 	bool modified(const SceneParams& params)
 	{ return !(shadingsystem == params.shadingsystem
+		&& bvh_layout == params.bvh_layout
 		&& bvh_type == params.bvh_type
 		&& use_bvh_spatial_split == params.use_bvh_spatial_split
 		&& use_bvh_unaligned_nodes == params.use_bvh_unaligned_nodes
 		&& num_bvh_time_steps == params.num_bvh_time_steps
-		&& use_qbvh == params.use_qbvh
 		&& persistent_data == params.persistent_data
 		&& texture_limit == params.texture_limit); }
 };
@@ -178,6 +195,7 @@ class Scene {
 public:
 	/* data */
 	Camera *camera;
+	Camera *dicing_camera;
 	LookupTables *lookup_tables;
 	Film *film;
 	Background *background;
@@ -216,7 +234,7 @@ public:
 	/* mutex must be locked manually by callers */
 	thread_mutex mutex;
 
-	Scene(const SceneParams& params, const DeviceInfo& device_info);
+	Scene(const SceneParams& params, Device *device);
 	~Scene();
 
 	void device_update(Device *device, Progress& progress);
@@ -225,7 +243,7 @@ public:
 	void need_global_attributes(AttributeRequestSet& attributes);
 
 	enum MotionType { MOTION_NONE = 0, MOTION_PASS, MOTION_BLUR };
-	MotionType need_motion(bool advanced_shading = true);
+	MotionType need_motion();
 	float motion_shutter_time();
 
 	bool need_update();
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 420866c9436..bb636dd962a 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -17,24 +17,24 @@
 #include <string.h>
 #include <limits.h>
 
-#include "buffers.h"
-#include "camera.h"
-#include "device.h"
-#include "graph.h"
-#include "integrator.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "session.h"
-#include "bake.h"
-
-#include "util_foreach.h"
-#include "util_function.h"
-#include "util_logging.h"
-#include "util_math.h"
-#include "util_opengl.h"
-#include "util_task.h"
-#include "util_time.h"
+#include "render/buffers.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/graph.h"
+#include "render/integrator.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/session.h"
+#include "render/bake.h"
+
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_logging.h"
+#include "util/util_math.h"
+#include "util/util_opengl.h"
+#include "util/util_task.h"
+#include "util/util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -46,7 +46,7 @@ Session::Session(const SessionParams& params_)
 : params(params_),
   tile_manager(params.progressive, params.samples, params.tile_size, params.start_resolution,
        params.background == false || params.progressive_refine, params.background, params.tile_order,
-       max(params.device.multi_devices.size(), 1)),
+       max(params.device.multi_devices.size(), 1), params.pixel_size),
   stats()
 {
 	device_use_gl = ((params.device.type != DEVICE_CPU) && !params.background);
@@ -55,7 +55,7 @@ Session::Session(const SessionParams& params_)
 
 	device = Device::create(params.device, stats, params.background);
 
-	if(params.background && params.output_path.empty()) {
+	if(params.background && !params.write_render_cb) {
 		buffers = NULL;
 		display = NULL;
 	}
@@ -101,21 +101,22 @@ Session::~Session()
 		wait();
 	}
 
-	if(!params.output_path.empty()) {
+	if(params.write_render_cb) {
 		/* tonemap and write out image if requested */
 		delete display;
 
 		display = new DisplayBuffer(device, false);
-		display->reset(device, buffers->params);
+		display->reset(buffers->params);
 		tonemap(params.samples);
 
-		progress.set_status("Writing Image", params.output_path);
-		display->write(device, params.output_path);
+		int w = display->draw_width;
+		int h = display->draw_height;
+		uchar4 *pixels = display->rgba_byte.copy_from_device(0, w, h);
+		params.write_render_cb((uchar*)pixels, w, h, 4);
 	}
 
 	/* clean up */
-	foreach(RenderBuffers *buffers, tile_buffers)
-		delete buffers;
+	tile_manager.device_free();
 
 	delete buffers;
 	delete display;
@@ -268,8 +269,8 @@ void Session::run_gpu()
 			/* update status and timing */
 			update_status_time();
 
-			/* path trace */
-			path_trace();
+			/* render */
+			render();
 
 			device->task_wait();
 
@@ -358,30 +359,31 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile)
 	thread_scoped_lock tile_lock(tile_mutex);
 
 	/* get next tile from manager */
-	Tile tile;
+	Tile *tile;
 	int device_num = device->device_number(tile_device);
 
 	if(!tile_manager.next_tile(tile, device_num))
 		return false;
 	
 	/* fill render tile */
-	rtile.x = tile_manager.state.buffer.full_x + tile.x;
-	rtile.y = tile_manager.state.buffer.full_y + tile.y;
-	rtile.w = tile.w;
-	rtile.h = tile.h;
+	rtile.x = tile_manager.state.buffer.full_x + tile->x;
+	rtile.y = tile_manager.state.buffer.full_y + tile->y;
+	rtile.w = tile->w;
+	rtile.h = tile->h;
 	rtile.start_sample = tile_manager.state.sample;
 	rtile.num_samples = tile_manager.state.num_samples;
 	rtile.resolution = tile_manager.state.resolution_divider;
+	rtile.tile_index = tile->index;
+	rtile.task = (tile->state == Tile::DENOISE)? RenderTile::DENOISE: RenderTile::PATH_TRACE;
 
 	tile_lock.unlock();
 
 	/* in case of a permanent buffer, return it, otherwise we will allocate
 	 * a new temporary buffer */
-	if(!(params.background && params.output_path.empty())) {
+	if(buffers) {
 		tile_manager.state.buffer.get_offset_stride(rtile.offset, rtile.stride);
 
 		rtile.buffer = buffers->buffer.device_pointer;
-		rtile.rng_state = buffers->rng_state.device_pointer;
 		rtile.buffers = buffers;
 
 		device->map_tile(tile_device, rtile);
@@ -389,48 +391,24 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile)
 		return true;
 	}
 
-	/* fill buffer parameters */
-	BufferParams buffer_params = tile_manager.params;
-	buffer_params.full_x = rtile.x;
-	buffer_params.full_y = rtile.y;
-	buffer_params.width = rtile.w;
-	buffer_params.height = rtile.h;
-
-	buffer_params.get_offset_stride(rtile.offset, rtile.stride);
-
-	RenderBuffers *tilebuffers;
-
-	/* allocate buffers */
-	if(params.progressive_refine) {
-		tile_lock.lock();
-
-		if(tile_buffers.size() == 0)
-			tile_buffers.resize(tile_manager.state.num_tiles, NULL);
-
-		/* In certain circumstances number of tiles in the tile manager could
-		 * be changed. This is not supported by the progressive refine feature.
-		 */
-		assert(tile_buffers.size() == tile_manager.state.num_tiles);
-
-		tilebuffers = tile_buffers[tile.index];
-		if(tilebuffers == NULL) {
-			tilebuffers = new RenderBuffers(tile_device);
-			tile_buffers[tile.index] = tilebuffers;
-
-			tilebuffers->reset(tile_device, buffer_params);
-		}
-
-		tile_lock.unlock();
+	if(tile->buffers == NULL) {
+		/* fill buffer parameters */
+		BufferParams buffer_params = tile_manager.params;
+		buffer_params.full_x = rtile.x;
+		buffer_params.full_y = rtile.y;
+		buffer_params.width = rtile.w;
+		buffer_params.height = rtile.h;
+
+		/* allocate buffers */
+		tile->buffers = new RenderBuffers(tile_device);
+		tile->buffers->reset(buffer_params);
 	}
-	else {
-		tilebuffers = new RenderBuffers(tile_device);
 
-		tilebuffers->reset(tile_device, buffer_params);
-	}
+	tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride);
 
-	rtile.buffer = tilebuffers->buffer.device_pointer;
-	rtile.rng_state = tilebuffers->rng_state.device_pointer;
-	rtile.buffers = tilebuffers;
+	rtile.buffer = tile->buffers->buffer.device_pointer;
+	rtile.buffers = tile->buffers;
+	rtile.sample = tile_manager.state.sample;
 
 	/* this will tag tile as IN PROGRESS in blender-side render pipeline,
 	 * which is needed to highlight currently rendering tile before first
@@ -449,7 +427,7 @@ void Session::update_tile_sample(RenderTile& rtile)
 		if(params.progressive_refine == false) {
 			/* todo: optimize this by making it thread safe and removing lock */
 
-			update_render_tile_cb(rtile);
+			update_render_tile_cb(rtile, true);
 		}
 	}
 
@@ -460,20 +438,78 @@ void Session::release_tile(RenderTile& rtile)
 {
 	thread_scoped_lock tile_lock(tile_mutex);
 
-	progress.add_finished_tile();
+	progress.add_finished_tile(rtile.task == RenderTile::DENOISE);
 
-	if(write_render_tile_cb) {
-		if(params.progressive_refine == false) {
-			/* todo: optimize this by making it thread safe and removing lock */
+	bool delete_tile;
+
+	if(tile_manager.finish_tile(rtile.tile_index, delete_tile)) {
+		if(write_render_tile_cb && params.progressive_refine == false) {
 			write_render_tile_cb(rtile);
+		}
 
+		if(delete_tile) {
 			delete rtile.buffers;
+			tile_manager.state.tiles[rtile.tile_index].buffers = NULL;
+		}
+	}
+	else {
+		if(update_render_tile_cb && params.progressive_refine == false) {
+			update_render_tile_cb(rtile, false);
 		}
 	}
 
 	update_status_time();
 }
 
+void Session::map_neighbor_tiles(RenderTile *tiles, Device *tile_device)
+{
+	thread_scoped_lock tile_lock(tile_mutex);
+
+	int center_idx = tiles[4].tile_index;
+	assert(tile_manager.state.tiles[center_idx].state == Tile::DENOISE);
+	BufferParams buffer_params = tile_manager.params;
+	int4 image_region = make_int4(buffer_params.full_x, buffer_params.full_y,
+	                              buffer_params.full_x + buffer_params.width, buffer_params.full_y + buffer_params.height);
+
+	for(int dy = -1, i = 0; dy <= 1; dy++) {
+		for(int dx = -1; dx <= 1; dx++, i++) {
+			int px = tiles[4].x + dx*params.tile_size.x;
+			int py = tiles[4].y + dy*params.tile_size.y;
+			if(px >= image_region.x && py >= image_region.y &&
+			   px <  image_region.z && py <  image_region.w) {
+				int tile_index = center_idx + dy*tile_manager.state.tile_stride + dx;
+				Tile *tile = &tile_manager.state.tiles[tile_index];
+				assert(tile->buffers);
+
+				tiles[i].buffer = tile->buffers->buffer.device_pointer;
+				tiles[i].x = tile_manager.state.buffer.full_x + tile->x;
+				tiles[i].y = tile_manager.state.buffer.full_y + tile->y;
+				tiles[i].w = tile->w;
+				tiles[i].h = tile->h;
+				tiles[i].buffers = tile->buffers;
+
+				tile->buffers->params.get_offset_stride(tiles[i].offset, tiles[i].stride);
+			}
+			else {
+				tiles[i].buffer = (device_ptr)NULL;
+				tiles[i].buffers = NULL;
+				tiles[i].x = clamp(px, image_region.x, image_region.z);
+				tiles[i].y = clamp(py, image_region.y, image_region.w);
+				tiles[i].w = tiles[i].h = 0;
+			}
+		}
+	}
+
+	assert(tiles[4].buffers);
+	device->map_neighbor_tiles(tile_device, tiles);
+}
+
+void Session::unmap_neighbor_tiles(RenderTile *tiles, Device *tile_device)
+{
+	thread_scoped_lock tile_lock(tile_mutex);
+	device->unmap_neighbor_tiles(tile_device, tiles);
+}
+
 void Session::run_cpu()
 {
 	bool tiles_written = false;
@@ -558,8 +594,8 @@ void Session::run_cpu()
 			/* update status and timing */
 			update_status_time();
 
-			/* path trace */
-			path_trace();
+			/* render */
+			render();
 
 			/* update status and timing */
 			update_status_time();
@@ -608,13 +644,11 @@ DeviceRequestedFeatures Session::get_requested_device_features()
 	DeviceRequestedFeatures requested_features;
 	requested_features.experimental = params.experimental;
 
-	requested_features.max_closure = get_max_closure_count();
 	scene->shader_manager->get_requested_features(
 	        scene,
 	        &requested_features);
 	if(!params.background) {
 		/* Avoid too much re-compilations for viewport render. */
-		requested_features.max_closure = 64;
 		requested_features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
 		requested_features.nodes_features = NODE_FEATURE_ALL;
 	}
@@ -624,37 +658,46 @@ DeviceRequestedFeatures Session::get_requested_device_features()
 	 */
 	requested_features.use_hair = false;
 	requested_features.use_object_motion = false;
-	requested_features.use_camera_motion = scene->camera->use_motion;
+	requested_features.use_camera_motion = scene->camera->use_motion();
 	foreach(Object *object, scene->objects) {
 		Mesh *mesh = object->mesh;
 		if(mesh->num_curves()) {
 			requested_features.use_hair = true;
 		}
-		requested_features.use_object_motion |= object->use_motion | mesh->use_motion_blur;
+		requested_features.use_object_motion |= object->use_motion() | mesh->use_motion_blur;
 		requested_features.use_camera_motion |= mesh->use_motion_blur;
 #ifdef WITH_OPENSUBDIV
 		if(mesh->subdivision_type != Mesh::SUBDIVISION_NONE) {
 			requested_features.use_patch_evaluation = true;
 		}
 #endif
+		if(object->is_shadow_catcher) {
+			requested_features.use_shadow_tricks = true;
+		}
 	}
 
 	BakeManager *bake_manager = scene->bake_manager;
 	requested_features.use_baking = bake_manager->get_baking();
 	requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH);
-	requested_features.use_transparent &= scene->integrator->transparent_shadows;
+	requested_features.use_denoising = params.use_denoising;
 
 	return requested_features;
 }
 
-void Session::load_kernels()
+void Session::load_kernels(bool lock_scene)
 {
-	thread_scoped_lock scene_lock(scene->mutex);
+	thread_scoped_lock scene_lock;
+	if(lock_scene) {
+		scene_lock = thread_scoped_lock(scene->mutex);
+	}
 
-	if(!kernels_loaded) {
+	DeviceRequestedFeatures requested_features = get_requested_device_features();
+
+	if(!kernels_loaded || loaded_kernel_features.modified(requested_features)) {
 		progress.set_status("Loading render kernels (may take a few minutes the first time)");
 
-		DeviceRequestedFeatures requested_features = get_requested_device_features();
+		scoped_timer timer;
+
 		VLOG(2) << "Requested features:\n" << requested_features;
 		if(!device->load_kernels(requested_features)) {
 			string message = device->error_message();
@@ -667,7 +710,11 @@ void Session::load_kernels()
 			return;
 		}
 
+		progress.add_skip_time(timer, false);
+		VLOG(1) << "Total time spent loading kernels: " << time_dt() - timer.get_start();
+
 		kernels_loaded = true;
+		loaded_kernel_features = requested_features;
 	}
 }
 
@@ -707,11 +754,11 @@ bool Session::draw(BufferParams& buffer_params, DeviceDrawParams &draw_params)
 
 void Session::reset_(BufferParams& buffer_params, int samples)
 {
-	if(buffers) {
-		if(buffer_params.modified(buffers->params)) {
-			gpu_draw_ready = false;
-			buffers->reset(device, buffer_params);
-			display->reset(device, buffer_params);
+	if(buffers && buffer_params.modified(tile_manager.params)) {
+		gpu_draw_ready = false;
+		buffers->reset(buffer_params);
+		if(display) {
+			display->reset(buffer_params);
 		}
 	}
 
@@ -732,15 +779,6 @@ void Session::reset(BufferParams& buffer_params, int samples)
 		reset_gpu(buffer_params, samples);
 	else
 		reset_cpu(buffer_params, samples);
-
-	if(params.progressive_refine) {
-		thread_scoped_lock buffers_lock(buffers_mutex);
-
-		foreach(RenderBuffers *buffers, tile_buffers)
-			delete buffers;
-
-		tile_buffers.clear();
-	}
 }
 
 void Session::set_samples(int samples)
@@ -818,6 +856,18 @@ void Session::update_scene()
 
 	/* update scene */
 	if(scene->need_update()) {
+		load_kernels(false);
+
+		/* Update max_closures. */
+		KernelIntegrator *kintegrator = &scene->dscene.data.integrator;
+		if(params.background) {
+			kintegrator->max_closures = get_max_closure_count();
+		}
+		else {
+			/* Currently viewport render is faster with higher max_closures, needs investigating. */
+			kintegrator->max_closures = 64;
+		}
+
 		progress.set_status("Updating Scene");
 		MEM_GUARDED_CALL(&progress, scene->device_update, device, progress);
 	}
@@ -828,7 +878,7 @@ void Session::update_status_time(bool show_pause, bool show_done)
 	int progressive_sample = tile_manager.state.sample;
 	int num_samples = tile_manager.get_num_effective_samples();
 
-	int tile = tile_manager.state.num_rendered_tiles;
+	int tile = progress.get_rendered_tiles();
 	int num_tiles = tile_manager.state.num_tiles;
 
 	/* update status */
@@ -836,11 +886,12 @@ void Session::update_status_time(bool show_pause, bool show_done)
 
 	if(!params.progressive) {
 		const bool is_cpu = params.device.type == DEVICE_CPU;
-		const bool is_last_tile = (progress.get_finished_tiles() + 1) == num_tiles;
+		const bool rendering_finished = (tile == num_tiles);
+		const bool is_last_tile = (tile + 1) == num_tiles;
 
-		substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles);
+		substatus = string_printf("Rendered %d/%d Tiles", tile, num_tiles);
 
-		if(device->show_samples() || (is_cpu && is_last_tile)) {
+		if(!rendering_finished && (device->show_samples() || (is_cpu && is_last_tile))) {
 			/* Some devices automatically support showing the sample number:
 			 * - CUDADevice
 			 * - OpenCLDevice when using the megakernel (the split kernel renders multiple
@@ -852,6 +903,9 @@ void Session::update_status_time(bool show_pause, bool show_done)
 			 */
 			substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples);
 		}
+		if(params.use_denoising) {
+			substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles());
+		}
 	}
 	else if(tile_manager.num_samples == INT_MAX)
 		substatus = string_printf("Path Tracing Sample %d", progressive_sample+1);
@@ -865,6 +919,7 @@ void Session::update_status_time(bool show_pause, bool show_done)
 	}
 	else if(show_done) {
 		status = "Done";
+		progress.set_end_time(); /* Save end time so that further calls to get_time are accurate. */
 	}
 	else {
 		status = substatus;
@@ -874,19 +929,39 @@ void Session::update_status_time(bool show_pause, bool show_done)
 	progress.set_status(status, substatus);
 }
 
-void Session::path_trace()
+void Session::render()
 {
-	/* add path trace task */
-	DeviceTask task(DeviceTask::PATH_TRACE);
+	/* Clear buffers. */
+	if(buffers && tile_manager.state.sample == tile_manager.range_start_sample) {
+		buffers->zero();
+	}
+
+	/* Add render task. */
+	DeviceTask task(DeviceTask::RENDER);
 	
 	task.acquire_tile = function_bind(&Session::acquire_tile, this, _1, _2);
 	task.release_tile = function_bind(&Session::release_tile, this, _1);
+	task.map_neighbor_tiles = function_bind(&Session::map_neighbor_tiles, this, _1, _2);
+	task.unmap_neighbor_tiles = function_bind(&Session::unmap_neighbor_tiles, this, _1, _2);
 	task.get_cancel = function_bind(&Progress::get_cancel, &this->progress);
 	task.update_tile_sample = function_bind(&Session::update_tile_sample, this, _1);
 	task.update_progress_sample = function_bind(&Progress::add_samples, &this->progress, _1, _2);
 	task.need_finish_queue = params.progressive_refine;
 	task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH;
 	task.requested_tile_size = params.tile_size;
+	task.passes_size = tile_manager.params.get_passes_size();
+
+	if(params.use_denoising) {
+		task.denoising_radius = params.denoising_radius;
+		task.denoising_strength = params.denoising_strength;
+		task.denoising_feature_strength = params.denoising_feature_strength;
+		task.denoising_relative_pca = params.denoising_relative_pca;
+
+		assert(!scene->film->need_update);
+		task.pass_stride = scene->film->pass_stride;
+		task.pass_denoising_data = scene->film->denoising_data_offset;
+		task.pass_denoising_clean = scene->film->denoising_clean_offset;
+	}
 
 	device->task_add(task);
 }
@@ -931,10 +1006,18 @@ bool Session::update_progressive_refine(bool cancel)
 	}
 
 	if(params.progressive_refine) {
-		foreach(RenderBuffers *buffers, tile_buffers) {
+		foreach(Tile& tile, tile_manager.state.tiles) {
+			if(!tile.buffers) {
+				continue;
+			}
+
 			RenderTile rtile;
-			rtile.buffers = buffers;
+			rtile.x = tile_manager.state.buffer.full_x + tile.x;
+			rtile.y = tile_manager.state.buffer.full_y + tile.y;
+			rtile.w = tile.w;
+			rtile.h = tile.h;
 			rtile.sample = sample;
+			rtile.buffers = tile.buffers;
 
 			if(write) {
 				if(write_render_tile_cb)
@@ -942,7 +1025,7 @@ bool Session::update_progressive_refine(bool cancel)
 			}
 			else {
 				if(update_render_tile_cb)
-					update_render_tile_cb(rtile);
+					update_render_tile_cb(rtile, true);
 			}
 		}
 	}
@@ -956,10 +1039,7 @@ void Session::device_free()
 {
 	scene->device_free();
 
-	foreach(RenderBuffers *buffers, tile_buffers)
-		delete buffers;
-
-	tile_buffers.clear();
+	tile_manager.device_free();
 
 	/* used from background render only, so no need to
 	 * re-create render/display buffers here
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index c7ff1446171..e63cad0d977 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -17,15 +17,15 @@
 #ifndef __SESSION_H__
 #define __SESSION_H__
 
-#include "buffers.h"
-#include "device.h"
-#include "shader.h"
-#include "tile.h"
+#include "render/buffers.h"
+#include "device/device.h"
+#include "render/shader.h"
+#include "render/tile.h"
 
-#include "util_progress.h"
-#include "util_stats.h"
-#include "util_thread.h"
-#include "util_vector.h"
+#include "util/util_progress.h"
+#include "util/util_stats.h"
+#include "util/util_thread.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -45,7 +45,6 @@ public:
 	DeviceInfo device;
 	bool background;
 	bool progressive_refine;
-	string output_path;
 
 	bool progressive;
 	bool experimental;
@@ -53,10 +52,17 @@ public:
 	int2 tile_size;
 	TileOrder tile_order;
 	int start_resolution;
+	int pixel_size;
 	int threads;
 
 	bool display_buffer_linear;
 
+	bool use_denoising;
+	int denoising_radius;
+	float denoising_strength;
+	float denoising_feature_strength;
+	bool denoising_relative_pca;
+
 	double cancel_timeout;
 	double reset_timeout;
 	double text_timeout;
@@ -64,19 +70,30 @@ public:
 
 	ShadingSystem shadingsystem;
 
+	function<bool(const uchar *pixels,
+	              int width,
+	              int height,
+	              int channels)> write_render_cb;
+
 	SessionParams()
 	{
 		background = false;
 		progressive_refine = false;
-		output_path = "";
 
 		progressive = false;
 		experimental = false;
 		samples = INT_MAX;
 		tile_size = make_int2(64, 64);
 		start_resolution = INT_MAX;
+		pixel_size = 1;
 		threads = 0;
 
+		use_denoising = false;
+		denoising_radius = 8;
+		denoising_strength = 0.0f;
+		denoising_feature_strength = 0.0f;
+		denoising_relative_pca = false;
+
 		display_buffer_linear = false;
 
 		cancel_timeout = 0.1;
@@ -92,12 +109,12 @@ public:
 	{ return !(device == params.device
 		&& background == params.background
 		&& progressive_refine == params.progressive_refine
-		&& output_path == params.output_path
 		/* && samples == params.samples */
 		&& progressive == params.progressive
 		&& experimental == params.experimental
 		&& tile_size == params.tile_size
 		&& start_resolution == params.start_resolution
+		&& pixel_size == params.pixel_size
 		&& threads == params.threads
 		&& display_buffer_linear == params.display_buffer_linear
 		&& cancel_timeout == params.cancel_timeout
@@ -126,7 +143,7 @@ public:
 	Stats stats;
 
 	function<void(RenderTile&)> write_render_tile_cb;
-	function<void(RenderTile&)> update_render_tile_cb;
+	function<void(RenderTile&, bool)> update_render_tile_cb;
 
 	explicit Session(const SessionParams& params);
 	~Session();
@@ -141,7 +158,7 @@ public:
 	void set_pause(bool pause);
 
 	void update_scene();
-	void load_kernels();
+	void load_kernels(bool lock_scene=true);
 
 	void device_free();
 
@@ -162,7 +179,7 @@ protected:
 	void update_status_time(bool show_pause = false, bool show_done = false);
 
 	void tonemap(int sample);
-	void path_trace();
+	void render();
 	void reset_(BufferParams& params, int samples);
 
 	void run_cpu();
@@ -177,6 +194,9 @@ protected:
 	void update_tile_sample(RenderTile& tile);
 	void release_tile(RenderTile& tile);
 
+	void map_neighbor_tiles(RenderTile *tiles, Device *tile_device);
+	void unmap_neighbor_tiles(RenderTile *tiles, Device *tile_device);
+
 	bool device_use_gl;
 
 	thread *session_thread;
@@ -195,6 +215,7 @@ protected:
 	thread_mutex display_mutex;
 
 	bool kernels_loaded;
+	DeviceRequestedFeatures loaded_kernel_features;
 
 	double reset_time;
 
@@ -202,8 +223,6 @@ protected:
 	double last_update_time;
 	bool update_progressive_refine(bool cancel);
 
-	vector<RenderBuffers *> tile_buffers;
-
 	DeviceRequestedFeatures get_requested_device_features();
 
 	/* ** Split kernel routines ** */
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index 335edcbe609..ec52c51e337 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -14,26 +14,28 @@
  * limitations under the License.
  */
 
-#include "background.h"
-#include "camera.h"
-#include "device.h"
-#include "graph.h"
-#include "integrator.h"
-#include "light.h"
-#include "mesh.h"
-#include "nodes.h"
-#include "object.h"
-#include "osl.h"
-#include "scene.h"
-#include "shader.h"
-#include "svm.h"
-#include "tables.h"
-
-#include "util_foreach.h"
+#include "render/background.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/graph.h"
+#include "render/integrator.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/nodes.h"
+#include "render/object.h"
+#include "render/osl.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/svm.h"
+#include "render/tables.h"
+
+#include "util/util_foreach.h"
 
 CCL_NAMESPACE_BEGIN
 
+thread_mutex ShaderManager::lookup_table_mutex;
 vector<float> ShaderManager::beckmann_table;
+bool ShaderManager::beckmann_table_ready = false;
 
 /* Beckmann sampling precomputed table, see bsdf_microfacet.h */
 
@@ -49,6 +51,16 @@ static float beckmann_table_slope_max()
 	return 6.0;
 }
 
+
+/* MSVC 2015 needs this ugly hack to prevent a codegen bug on x86
+ * see T50176 for details
+ */
+#if defined(_MSC_VER) && (_MSC_VER == 1900)
+#  define MSVC_VOLATILE volatile
+#else
+#  define MSVC_VOLATILE
+#endif
+
 /* Paper used: Importance Sampling Microfacet-Based BSDFs with the
  * Distribution of Visible Normals. Supplemental Material 2/2.
  *
@@ -72,7 +84,7 @@ static void beckmann_table_rows(float *table, int row_from, int row_to)
 		slope_x[0] = (double)-beckmann_table_slope_max();
 		CDF_P22_omega_i[0] = 0;
 
-		for(int index_slope_x = 1; index_slope_x < DATA_TMP_SIZE; ++index_slope_x) {
+		for(MSVC_VOLATILE int index_slope_x = 1; index_slope_x < DATA_TMP_SIZE; ++index_slope_x) {
 			/* slope_x */
 			slope_x[index_slope_x] = (double)(-beckmann_table_slope_max() + 2.0f * beckmann_table_slope_max() * index_slope_x/(DATA_TMP_SIZE - 1.0f));
 
@@ -116,6 +128,8 @@ static void beckmann_table_rows(float *table, int row_from, int row_to)
 	}
 }
 
+#undef MSVC_VOLATILE
+
 static void beckmann_table_build(vector<float>& table)
 {
 	table.resize(BECKMANN_TABLE_SIZE*BECKMANN_TABLE_SIZE);
@@ -165,7 +179,6 @@ Shader::Shader()
 	pass_id = 0;
 
 	graph = NULL;
-	graph_bump = NULL;
 
 	has_surface = false;
 	has_surface_transparent = false;
@@ -173,11 +186,14 @@ Shader::Shader()
 	has_surface_bssrdf = false;
 	has_volume = false;
 	has_displacement = false;
+	has_bump = false;
 	has_bssrdf_bump = false;
 	has_surface_spatial_varying = false;
 	has_volume_spatial_varying = false;
 	has_object_dependency = false;
+	has_attribute_dependency = false;
 	has_integrator_dependency = false;
+	has_volume_connected = false;
 
 	displacement_method = DISPLACE_BUMP;
 
@@ -185,13 +201,12 @@ Shader::Shader()
 	used = false;
 
 	need_update = true;
-	need_update_attributes = true;
+	need_update_mesh = true;
 }
 
 Shader::~Shader()
 {
 	delete graph;
-	delete graph_bump;
 }
 
 bool Shader::is_constant_emission(float3 *emission)
@@ -221,14 +236,31 @@ void Shader::set_graph(ShaderGraph *graph_)
 	/* do this here already so that we can detect if mesh or object attributes
 	 * are needed, since the node attribute callbacks check if their sockets
 	 * are connected but proxy nodes should not count */
-	if(graph_)
+	if(graph_) {
 		graph_->remove_proxy_nodes();
 
+		if(displacement_method != DISPLACE_BUMP) {
+			graph_->compute_displacement_hash();
+		}
+	}
+
+	/* update geometry if displacement changed */
+	if(displacement_method != DISPLACE_BUMP) {
+		const char *old_hash = (graph)? graph->displacement_hash.c_str() : "";
+		const char *new_hash = (graph_)? graph_->displacement_hash.c_str() : "";
+
+		if(strcmp(old_hash, new_hash) != 0) {
+			need_update_mesh = true;
+		}
+	}
+
 	/* assign graph */
 	delete graph;
-	delete graph_bump;
 	graph = graph_;
-	graph_bump = NULL;
+
+	/* Store info here before graph optimization to make sure that nodes that
+	 * get optimized away still count. A NULL graph has no volume connected. */
+	has_volume_connected = (graph != NULL && graph->output()->input("Volume")->link != NULL);
 }
 
 void Shader::tag_update(Scene *scene)
@@ -278,9 +310,9 @@ void Shader::tag_update(Scene *scene)
 	}
 	
 	/* compare if the attributes changed, mesh manager will check
-	 * need_update_attributes, update the relevant meshes and clear it. */
+	 * need_update_mesh, update the relevant meshes and clear it. */
 	if(attributes.modified(prev_attributes)) {
-		need_update_attributes = true;
+		need_update_mesh = true;
 		scene->mesh_manager->need_update = true;
 	}
 
@@ -319,11 +351,14 @@ ShaderManager *ShaderManager::create(Scene *scene, int shadingsystem)
 	(void)shadingsystem;  /* Ignored when built without OSL. */
 
 #ifdef WITH_OSL
-	if(shadingsystem == SHADINGSYSTEM_OSL)
+	if(shadingsystem == SHADINGSYSTEM_OSL) {
 		manager = new OSLShaderManager();
+	}
 	else
 #endif
+	{
 		manager = new SVMShaderManager();
+	}
 	
 	add_default(scene);
 
@@ -332,6 +367,8 @@ ShaderManager *ShaderManager::create(Scene *scene, int shadingsystem)
 
 uint ShaderManager::get_attribute_id(ustring name)
 {
+	thread_scoped_spin_lock lock(attribute_lock_);
+
 	/* get a unique id for each name, for SVM attribute lookup */
 	AttributeIDMap::iterator it = unique_attribute_id.find(name);
 
@@ -395,15 +432,12 @@ void ShaderManager::device_update_common(Device *device,
                                          Scene *scene,
                                          Progress& /*progress*/)
 {
-	device->tex_free(dscene->shader_flag);
-	dscene->shader_flag.clear();
+	dscene->shaders.free();
 
 	if(scene->shaders.size() == 0)
 		return;
 
-	uint shader_flag_size = scene->shaders.size()*SHADER_SIZE;
-	uint *shader_flag = dscene->shader_flag.resize(shader_flag_size);
-	uint i = 0;
+	KernelShader *kshader = dscene->shaders.alloc(scene->shaders.size());
 	bool has_volumes = false;
 	bool has_transparent_shadow = false;
 
@@ -418,60 +452,61 @@ void ShaderManager::device_update_common(Device *device,
 			flag |= SD_HAS_VOLUME;
 			has_volumes = true;
 
-			/* in this case we can assume transparent surface */
-			if(!shader->has_surface)
-				flag |= SD_HAS_ONLY_VOLUME;
-
 			/* todo: this could check more fine grained, to skip useless volumes
 			 * enclosed inside an opaque bsdf.
 			 */
 			flag |= SD_HAS_TRANSPARENT_SHADOW;
 		}
+		/* in this case we can assume transparent surface */
+		if(shader->has_volume_connected && !shader->has_surface)
+			flag |= SD_HAS_ONLY_VOLUME;
 		if(shader->heterogeneous_volume && shader->has_volume_spatial_varying)
 			flag |= SD_HETEROGENEOUS_VOLUME;
+		if(shader->has_attribute_dependency)
+			flag |= SD_NEED_ATTRIBUTES;
 		if(shader->has_bssrdf_bump)
 			flag |= SD_HAS_BSSRDF_BUMP;
-		if(shader->volume_sampling_method == VOLUME_SAMPLING_EQUIANGULAR)
-			flag |= SD_VOLUME_EQUIANGULAR;
-		if(shader->volume_sampling_method == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE)
-			flag |= SD_VOLUME_MIS;
+		if(device->info.has_volume_decoupled) {
+			if(shader->volume_sampling_method == VOLUME_SAMPLING_EQUIANGULAR)
+				flag |= SD_VOLUME_EQUIANGULAR;
+			if(shader->volume_sampling_method == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE)
+				flag |= SD_VOLUME_MIS;
+		}
 		if(shader->volume_interpolation_method == VOLUME_INTERPOLATION_CUBIC)
 			flag |= SD_VOLUME_CUBIC;
-		if(shader->graph_bump)
+		if(shader->has_bump)
 			flag |= SD_HAS_BUMP;
 		if(shader->displacement_method != DISPLACE_BUMP)
 			flag |= SD_HAS_DISPLACEMENT;
 
-		/* shader with bump mapping */
-		if(shader->displacement_method != DISPLACE_TRUE && shader->graph_bump)
-			flag |= SD_HAS_BSSRDF_BUMP;
-
 		/* constant emission check */
 		float3 constant_emission = make_float3(0.0f, 0.0f, 0.0f);
 		if(shader->is_constant_emission(&constant_emission))
 			flag |= SD_HAS_CONSTANT_EMISSION;
 
 		/* regular shader */
-		shader_flag[i++] = flag;
-		shader_flag[i++] = shader->pass_id;
-		shader_flag[i++] = __float_as_int(constant_emission.x);
-		shader_flag[i++] = __float_as_int(constant_emission.y);
-		shader_flag[i++] = __float_as_int(constant_emission.z);
+		kshader->flags = flag;
+		kshader->pass_id = shader->pass_id;
+		kshader->constant_emission[0] = constant_emission.x;
+		kshader->constant_emission[1] = constant_emission.y;
+		kshader->constant_emission[2] = constant_emission.z;
+		kshader++;
 
 		has_transparent_shadow |= (flag & SD_HAS_TRANSPARENT_SHADOW) != 0;
 	}
 
-	device->tex_alloc("__shader_flag", dscene->shader_flag);
+	dscene->shaders.copy_to_device();
 
 	/* lookup tables */
 	KernelTables *ktables = &dscene->data.tables;
 
 	/* beckmann lookup table */
 	if(beckmann_table_offset == TABLE_OFFSET_INVALID) {
-		if(beckmann_table.size() == 0) {
+		if(!beckmann_table_ready) {
 			thread_scoped_lock lock(lookup_table_mutex);
-			if(beckmann_table.size() == 0) {
+			if(!beckmann_table_ready) {
 				beckmann_table_build(beckmann_table);
+				beckmann_table_ready = true;
 			}
 		}
 		beckmann_table_offset = scene->lookup_tables->add_table(dscene, beckmann_table);
@@ -482,17 +517,14 @@ void ShaderManager::device_update_common(Device *device,
 	KernelIntegrator *kintegrator = &dscene->data.integrator;
 	kintegrator->use_volumes = has_volumes;
 	/* TODO(sergey): De-duplicate with flags set in integrator.cpp. */
-	if(scene->integrator->transparent_shadows) {
-		kintegrator->transparent_shadows = has_transparent_shadow;
-	}
+	kintegrator->transparent_shadows = has_transparent_shadow;
 }
 
-void ShaderManager::device_free_common(Device *device, DeviceScene *dscene, Scene *scene)
+void ShaderManager::device_free_common(Device *, DeviceScene *dscene, Scene *scene)
 {
 	scene->lookup_tables->remove_table(&beckmann_table_offset);
 
-	device->tex_free(dscene->shader_flag);
-	dscene->shader_flag.clear();
+	dscene->shaders.free();
 }
 
 void ShaderManager::add_default(Scene *scene)
@@ -567,6 +599,9 @@ void ShaderManager::get_requested_graph_features(ShaderGraph *graph,
 			if(CLOSURE_IS_VOLUME(bsdf_node->closure)) {
 				requested_features->nodes_features |= NODE_FEATURE_VOLUME;
 			}
+			else if(CLOSURE_IS_PRINCIPLED(bsdf_node->closure)) {
+				requested_features->use_principled = true;
+			}
 		}
 		if(node->has_surface_bssrdf()) {
 			requested_features->use_subsurface = true;
@@ -574,6 +609,9 @@ void ShaderManager::get_requested_graph_features(ShaderGraph *graph,
 		if(node->has_surface_transparent()) {
 			requested_features->use_transparent = true;
 		}
+		if(node->has_raytrace()) {
+			requested_features->use_shader_raytrace = true;
+		}
 	}
 }
 
@@ -586,15 +624,10 @@ void ShaderManager::get_requested_features(Scene *scene,
 		Shader *shader = scene->shaders[i];
 		/* Gather requested features from all the nodes from the graph nodes. */
 		get_requested_graph_features(shader->graph, requested_features);
-		/* Gather requested features from the graph itself. */
-		if(shader->graph_bump) {
-			get_requested_graph_features(shader->graph_bump,
-			                             requested_features);
-		}
 		ShaderNode *output_node = shader->graph->output();
 		if(output_node->input("Displacement")->link != NULL) {
 			requested_features->nodes_features |= NODE_FEATURE_BUMP;
-			if(shader->displacement_method == DISPLACE_BOTH && requested_features->experimental) {
+			if(shader->displacement_method == DISPLACE_BOTH) {
 				requested_features->nodes_features |= NODE_FEATURE_BUMP_STATE;
 			}
 		}
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index 7d896652196..abd483caabc 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -19,20 +19,20 @@
 
 #ifdef WITH_OSL
 /* So no context pollution happens from indirectly included windows.h */
-#  include "util_windows.h"
+#  include "util/util_windows.h"
 #  include <OSL/oslexec.h>
 #endif
 
-#include "attribute.h"
-#include "kernel_types.h"
+#include "render/attribute.h"
+#include "kernel/kernel_types.h"
 
-#include "node.h"
+#include "graph/node.h"
 
-#include "util_map.h"
-#include "util_param.h"
-#include "util_string.h"
-#include "util_thread.h"
-#include "util_types.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -82,18 +82,13 @@ enum DisplacementMethod {
 
 class Shader : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	int pass_id;
 
 	/* shader graph */
 	ShaderGraph *graph;
 
-	/* shader graph with auto bump mapping included, we compile two shaders,
-	 * with and without bump,  because the displacement method is a mesh
-	 * level setting, so we need to handle both */
-	ShaderGraph *graph_bump;
-
 	/* sampling */
 	bool use_mis;
 	bool use_transparent_shadow;
@@ -103,7 +98,16 @@ public:
 
 	/* synchronization */
 	bool need_update;
-	bool need_update_attributes;
+	bool need_update_mesh;
+
+	/* If the shader has only volume components, the surface is assumed to
+	 * be transparent.
+	 * However, graph optimization might remove the volume subgraph, but
+	 * since the user connected something to the volume output the surface
+	 * should still be transparent.
+	 * Therefore, has_volume_connected stores whether some volume subtree
+	 * was connected before optimization. */
+	bool has_volume_connected;
 
 	/* information about shader after compiling */
 	bool has_surface;
@@ -112,10 +116,12 @@ public:
 	bool has_volume;
 	bool has_displacement;
 	bool has_surface_bssrdf;
+	bool has_bump;
 	bool has_bssrdf_bump;
 	bool has_surface_spatial_varying;
 	bool has_volume_spatial_varying;
 	bool has_object_dependency;
+	bool has_attribute_dependency;
 	bool has_integrator_dependency;
 
 	/* displacement */
@@ -195,13 +201,16 @@ protected:
 	typedef unordered_map<ustring, uint, ustringHash> AttributeIDMap;
 	AttributeIDMap unique_attribute_id;
 
-	thread_mutex lookup_table_mutex;
+	static thread_mutex lookup_table_mutex;
 	static vector<float> beckmann_table;
+	static bool beckmann_table_ready;
 
 	size_t beckmann_table_offset;
 
 	void get_requested_graph_features(ShaderGraph *graph,
 	                                  DeviceRequestedFeatures *requested_features);
+
+	thread_spin_lock attribute_lock_;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/sobol.cpp b/intern/cycles/render/sobol.cpp
index e3c2e802067..6906667ac19 100644
--- a/intern/cycles/render/sobol.cpp
+++ b/intern/cycles/render/sobol.cpp
@@ -46,10 +46,9 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "util_debug.h"
-#include "util_types.h"
+#include "util/util_types.h"
 
-#include "sobol.h"
+#include "render/sobol.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/sobol.h b/intern/cycles/render/sobol.h
index 574f148b9a2..9fbce4e14a5 100644
--- a/intern/cycles/render/sobol.h
+++ b/intern/cycles/render/sobol.h
@@ -17,7 +17,7 @@
 #ifndef __SOBOL_H__
 #define __SOBOL_H__
 
-#include "util_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp
index 955b892f4c3..c5b4060d5c3 100644
--- a/intern/cycles/render/svm.cpp
+++ b/intern/cycles/render/svm.cpp
@@ -14,20 +14,19 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "graph.h"
-#include "light.h"
-#include "mesh.h"
-#include "nodes.h"
-#include "scene.h"
-#include "shader.h"
-#include "svm.h"
-
-#include "util_debug.h"
-#include "util_logging.h"
-#include "util_foreach.h"
-#include "util_progress.h"
-#include "util_task.h"
+#include "device/device.h"
+#include "render/graph.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/nodes.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/svm.h"
+
+#include "util/util_logging.h"
+#include "util/util_foreach.h"
+#include "util/util_progress.h"
+#include "util/util_task.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -48,15 +47,15 @@ void SVMShaderManager::reset(Scene * /*scene*/)
 void SVMShaderManager::device_update_shader(Scene *scene,
                                             Shader *shader,
                                             Progress *progress,
-                                            vector<int4> *global_svm_nodes)
+                                            array<int4> *global_svm_nodes)
 {
 	if(progress->get_cancel()) {
 		return;
 	}
 	assert(shader->graph);
 
-	vector<int4> svm_nodes;
-	svm_nodes.push_back(make_int4(NODE_SHADER_JUMP, 0, 0, 0));
+	array<int4> svm_nodes;
+	svm_nodes.push_back_slow(make_int4(NODE_SHADER_JUMP, 0, 0, 0));
 
 	SVMCompiler::Summary summary;
 	SVMCompiler compiler(scene->shader_manager, scene->image_manager);
@@ -67,6 +66,7 @@ void SVMShaderManager::device_update_shader(Scene *scene,
 	        << "Shader name: " << shader->name << "\n"
 	        << summary.full_report();
 
+	nodes_lock_.lock();
 	if(shader->use_mis && shader->has_surface_emission) {
 		scene->light_manager->need_update = true;
 	}
@@ -74,17 +74,16 @@ void SVMShaderManager::device_update_shader(Scene *scene,
 	/* The copy needs to be done inside the lock, if another thread resizes the array 
 	 * while memcpy is running, it'll be copying into possibly invalid/freed ram. 
 	 */
-	nodes_lock_.lock();
 	size_t global_nodes_size = global_svm_nodes->size();
 	global_svm_nodes->resize(global_nodes_size + svm_nodes.size());
 	
 	/* Offset local SVM nodes to a global address space. */
-	int4& jump_node = global_svm_nodes->at(shader->id);
+	int4& jump_node = (*global_svm_nodes)[shader->id];
 	jump_node.y = svm_nodes[0].y + global_nodes_size - 1;
 	jump_node.z = svm_nodes[0].z + global_nodes_size - 1;
 	jump_node.w = svm_nodes[0].w + global_nodes_size - 1;
 	/* Copy new nodes to global storage. */
-	memcpy(&global_svm_nodes->at(global_nodes_size),
+	memcpy(&(*global_svm_nodes)[global_nodes_size],
 	       &svm_nodes[1],
 	       sizeof(int4) * (svm_nodes.size() - 1));
 	nodes_lock_.unlock();
@@ -106,11 +105,11 @@ void SVMShaderManager::device_update(Device *device, DeviceScene *dscene, Scene
 	device_update_shaders_used(scene);
 
 	/* svm_nodes */
-	vector<int4> svm_nodes;
+	array<int4> svm_nodes;
 	size_t i;
 
 	for(i = 0; i < scene->shaders.size(); i++) {
-		svm_nodes.push_back(make_int4(NODE_SHADER_JUMP, 0, 0, 0));
+		svm_nodes.push_back_slow(make_int4(NODE_SHADER_JUMP, 0, 0, 0));
 	}
 
 	TaskPool task_pool;
@@ -129,8 +128,8 @@ void SVMShaderManager::device_update(Device *device, DeviceScene *dscene, Scene
 		return;
 	}
 
-	dscene->svm_nodes.copy((uint4*)&svm_nodes[0], svm_nodes.size());
-	device->tex_alloc("__svm_nodes", dscene->svm_nodes);
+	dscene->svm_nodes.steal_data(svm_nodes);
+	dscene->svm_nodes.copy_to_device();
 
 	for(i = 0; i < scene->shaders.size(); i++) {
 		Shader *shader = scene->shaders[i];
@@ -150,8 +149,7 @@ void SVMShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *s
 {
 	device_free_common(device, dscene, scene);
 
-	device->tex_free(dscene->svm_nodes);
-	dscene->svm_nodes.clear();
+	dscene->svm_nodes.free();
 }
 
 /* Graph Compiler */
@@ -366,17 +364,17 @@ uint SVMCompiler::encode_uchar4(uint x, uint y, uint z, uint w)
 
 void SVMCompiler::add_node(int a, int b, int c, int d)
 {
-	current_svm_nodes.push_back(make_int4(a, b, c, d));
+	current_svm_nodes.push_back_slow(make_int4(a, b, c, d));
 }
 
 void SVMCompiler::add_node(ShaderNodeType type, int a, int b, int c)
 {
-	current_svm_nodes.push_back(make_int4(type, a, b, c));
+	current_svm_nodes.push_back_slow(make_int4(type, a, b, c));
 }
 
 void SVMCompiler::add_node(ShaderNodeType type, const float3& f)
 {
-	current_svm_nodes.push_back(make_int4(type,
+	current_svm_nodes.push_back_slow(make_int4(type,
 		__float_as_int(f.x),
 		__float_as_int(f.y),
 		__float_as_int(f.z)));
@@ -384,7 +382,7 @@ void SVMCompiler::add_node(ShaderNodeType type, const float3& f)
 
 void SVMCompiler::add_node(const float4& f)
 {
-	current_svm_nodes.push_back(make_int4(
+	current_svm_nodes.push_back_slow(make_int4(
 		__float_as_int(f.x),
 		__float_as_int(f.y),
 		__float_as_int(f.z),
@@ -401,6 +399,12 @@ uint SVMCompiler::attribute(AttributeStandard std)
 	return shader_manager->get_attribute_id(std);
 }
 
+uint SVMCompiler::attribute_standard(ustring name)
+{
+	const AttributeStandard standard_id = Attribute::name_standard(name.c_str());
+	return (!standard_id)? attribute(name): attribute(standard_id);  /* zero result falls back to the lookup by name */
+}
+
 bool SVMCompiler::node_skip_input(ShaderNode * /*node*/, ShaderInput *input)
 {
 	/* nasty exception .. */
@@ -449,6 +453,10 @@ void SVMCompiler::generate_node(ShaderNode *node, ShaderNodeSet& done)
 		current_shader->has_object_dependency = true;
 	}
 
+	if(node->has_attribute_dependency()) {
+		current_shader->has_attribute_dependency = true;
+	}
+
 	if(node->has_integrator_dependency()) {
 		current_shader->has_integrator_dependency = true;
 	}
@@ -521,6 +529,9 @@ void SVMCompiler::generate_closure_node(ShaderNode *node,
 			if(node->has_bssrdf_bump())
 				current_shader->has_bssrdf_bump = true;
 		}
+		if(node->has_bump()) {
+			current_shader->has_bump = true;
+		}
 	}
 }
 
@@ -624,7 +635,7 @@ void SVMCompiler::generate_multi_closure(ShaderNode *root_node,
 				/* Add instruction to skip closure and its dependencies if mix
 				 * weight is zero.
 				 */
-				current_svm_nodes.push_back(make_int4(NODE_JUMP_IF_ONE,
+				current_svm_nodes.push_back_slow(make_int4(NODE_JUMP_IF_ONE,
 				                                      0,
 				                                      stack_assign(facin),
 				                                      0));
@@ -642,7 +653,7 @@ void SVMCompiler::generate_multi_closure(ShaderNode *root_node,
 				/* Add instruction to skip closure and its dependencies if mix
 				 * weight is zero.
 				 */
-				current_svm_nodes.push_back(make_int4(NODE_JUMP_IF_ZERO,
+				current_svm_nodes.push_back_slow(make_int4(NODE_JUMP_IF_ZERO,
 				                                      0,
 				                                      stack_assign(facin),
 				                                      0));
@@ -794,36 +805,26 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty
 
 void SVMCompiler::compile(Scene *scene,
                           Shader *shader,
-                          vector<int4>& svm_nodes,
+                          array<int4>& svm_nodes,
                           int index,
                           Summary *summary)
 {
 	/* copy graph for shader with bump mapping */
-	ShaderNode *node = shader->graph->output();
+	ShaderNode *output = shader->graph->output();
 	int start_num_svm_nodes = svm_nodes.size();
 
 	const double time_start = time_dt();
 
-	if(node->input("Surface")->link && node->input("Displacement")->link)
-		if(!shader->graph_bump)
-			shader->graph_bump = shader->graph->copy();
+	bool has_bump = (shader->displacement_method != DISPLACE_TRUE) &&
+	                output->input("Surface")->link && output->input("Displacement")->link;
 
 	/* finalize */
 	{
 		scoped_timer timer((summary != NULL)? &summary->time_finalize: NULL);
 		shader->graph->finalize(scene,
-		                        false,
-		                        false,
-		                        shader->has_integrator_dependency);
-	}
-
-	if(shader->graph_bump) {
-		scoped_timer timer((summary != NULL)? &summary->time_finalize_bump: NULL);
-		shader->graph_bump->finalize(scene,
-		                             true,
-		                             false,
-		                             shader->has_integrator_dependency,
-		                             shader->displacement_method == DISPLACE_BOTH);
+		                        has_bump,
+		                        shader->has_integrator_dependency,
+		                        shader->displacement_method == DISPLACE_BOTH);
 	}
 
 	current_shader = shader;
@@ -832,22 +833,22 @@ void SVMCompiler::compile(Scene *scene,
 	shader->has_surface_emission = false;
 	shader->has_surface_transparent = false;
 	shader->has_surface_bssrdf = false;
-	shader->has_bssrdf_bump = false;
+	shader->has_bump = has_bump;
+	shader->has_bssrdf_bump = has_bump;
 	shader->has_volume = false;
 	shader->has_displacement = false;
 	shader->has_surface_spatial_varying = false;
 	shader->has_volume_spatial_varying = false;
 	shader->has_object_dependency = false;
+	shader->has_attribute_dependency = false;
 	shader->has_integrator_dependency = false;
 
 	/* generate bump shader */
-	if(shader->displacement_method != DISPLACE_TRUE && shader->graph_bump) {
+	if(has_bump) {
 		scoped_timer timer((summary != NULL)? &summary->time_generate_bump: NULL);
-		compile_type(shader, shader->graph_bump, SHADER_TYPE_BUMP);
+		compile_type(shader, shader->graph, SHADER_TYPE_BUMP);
 		svm_nodes[index].y = svm_nodes.size();
-		svm_nodes.insert(svm_nodes.end(),
-		                 current_svm_nodes.begin(),
-		                 current_svm_nodes.end());
+		svm_nodes.append(current_svm_nodes);
 	}
 
 	/* generate surface shader */
@@ -855,12 +856,10 @@ void SVMCompiler::compile(Scene *scene,
 		scoped_timer timer((summary != NULL)? &summary->time_generate_surface: NULL);
 		compile_type(shader, shader->graph, SHADER_TYPE_SURFACE);
 		/* only set jump offset if there's no bump shader, as the bump shader will fall thru to this one if it exists */
-		if(shader->displacement_method == DISPLACE_TRUE || !shader->graph_bump) {
+		if(!has_bump) {
 			svm_nodes[index].y = svm_nodes.size();
 		}
-		svm_nodes.insert(svm_nodes.end(),
-		                 current_svm_nodes.begin(),
-		                 current_svm_nodes.end());
+		svm_nodes.append(current_svm_nodes);
 	}
 
 	/* generate volume shader */
@@ -868,9 +867,7 @@ void SVMCompiler::compile(Scene *scene,
 		scoped_timer timer((summary != NULL)? &summary->time_generate_volume: NULL);
 		compile_type(shader, shader->graph, SHADER_TYPE_VOLUME);
 		svm_nodes[index].z = svm_nodes.size();
-		svm_nodes.insert(svm_nodes.end(),
-		                 current_svm_nodes.begin(),
-		                 current_svm_nodes.end());
+		svm_nodes.append(current_svm_nodes);
 	}
 
 	/* generate displacement shader */
@@ -878,9 +875,7 @@ void SVMCompiler::compile(Scene *scene,
 		scoped_timer timer((summary != NULL)? &summary->time_generate_displacement: NULL);
 		compile_type(shader, shader->graph, SHADER_TYPE_DISPLACEMENT);
 		svm_nodes[index].w = svm_nodes.size();
-		svm_nodes.insert(svm_nodes.end(),
-		                 current_svm_nodes.begin(),
-		                 current_svm_nodes.end());
+		svm_nodes.append(current_svm_nodes);
 	}
 
 	/* Fill in summary information. */
@@ -897,7 +892,6 @@ SVMCompiler::Summary::Summary()
 	: num_svm_nodes(0),
 	  peak_stack_usage(0),
 	  time_finalize(0.0),
-	  time_finalize_bump(0.0),
 	  time_generate_surface(0.0),
 	  time_generate_bump(0.0),
 	  time_generate_volume(0.0),
@@ -913,10 +907,7 @@ string SVMCompiler::Summary::full_report() const
 	report += string_printf("Peak stack usage:    %d\n", peak_stack_usage);
 
 	report += string_printf("Time (in seconds):\n");
-	report += string_printf("  Finalize:          %f\n", time_finalize);
-	report += string_printf("  Bump finalize:     %f\n", time_finalize_bump);
-	report += string_printf("Finalize:            %f\n", time_finalize +
-	                                                     time_finalize_bump);
+	report += string_printf("Finalize:            %f\n", time_finalize);
 	report += string_printf("  Surface:           %f\n", time_generate_surface);
 	report += string_printf("  Bump:              %f\n", time_generate_bump);
 	report += string_printf("  Volume:            %f\n", time_generate_volume);
diff --git a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h
index a501b6bc8b1..18be0fa9a22 100644
--- a/intern/cycles/render/svm.h
+++ b/intern/cycles/render/svm.h
@@ -17,13 +17,13 @@
 #ifndef __SVM_H__
 #define __SVM_H__
 
-#include "attribute.h"
-#include "graph.h"
-#include "shader.h"
+#include "render/attribute.h"
+#include "render/graph.h"
+#include "render/shader.h"
 
-#include "util_set.h"
-#include "util_string.h"
-#include "util_thread.h"
+#include "util/util_set.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -55,7 +55,7 @@ protected:
 	void device_update_shader(Scene *scene,
 	                          Shader *shader,
 	                          Progress *progress,
-	                          vector<int4> *global_svm_nodes);
+	                          array<int4> *global_svm_nodes);
 };
 
 /* Graph Compiler */
@@ -74,9 +74,6 @@ public:
 		/* Time spent on surface graph finalization. */
 		double time_finalize;
 
-		/* Time spent on bump graph finalization. */
-		double time_finalize_bump;
-
 		/* Time spent on generating SVM nodes for surface shader. */
 		double time_generate_surface;
 
@@ -101,7 +98,7 @@ public:
 	SVMCompiler(ShaderManager *shader_manager, ImageManager *image_manager);
 	void compile(Scene *scene,
 	             Shader *shader,
-	             vector<int4>& svm_nodes,
+	             array<int4>& svm_nodes,
 	             int index,
 	             Summary *summary = NULL);
 
@@ -120,6 +117,7 @@ public:
 	void add_node(const float4& f);
 	uint attribute(ustring name);
 	uint attribute(AttributeStandard std);
+	uint attribute_standard(ustring name);
 	uint encode_uchar4(uint x, uint y = 0, uint z = 0, uint w = 0);
 	uint closure_mix_weight_offset() { return mix_weight_offset; }
 
@@ -210,7 +208,7 @@ protected:
 	/* compile */
 	void compile_type(Shader *shader, ShaderGraph *graph, ShaderType type);
 
-	vector<int4> current_svm_nodes;
+	array<int4> current_svm_nodes;
 	ShaderType current_type;
 	Shader *current_shader;
 	ShaderGraph *current_graph;
diff --git a/intern/cycles/render/tables.cpp b/intern/cycles/render/tables.cpp
index dfafd99961b..536fdd0775e 100644
--- a/intern/cycles/render/tables.cpp
+++ b/intern/cycles/render/tables.cpp
@@ -14,12 +14,11 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "scene.h"
-#include "tables.h"
+#include "device/device.h"
+#include "render/scene.h"
+#include "render/tables.h"
 
-#include "util_debug.h"
-#include "util_logging.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -35,25 +34,22 @@ LookupTables::~LookupTables()
 	assert(lookup_tables.size() == 0);
 }
 
-void LookupTables::device_update(Device *device, DeviceScene *dscene)
+void LookupTables::device_update(Device *, DeviceScene *dscene)
 {
 	if(!need_update)
 		return;
 
 	VLOG(1) << "Total " << lookup_tables.size() << " lookup tables.";
 
-	device->tex_free(dscene->lookup_table);
-
 	if(lookup_tables.size() > 0)
-		device->tex_alloc("__lookup_table", dscene->lookup_table);
+		dscene->lookup_table.copy_to_device();
 
 	need_update = false;
 }
 
-void LookupTables::device_free(Device *device, DeviceScene *dscene)
+void LookupTables::device_free(Device *, DeviceScene *dscene)
 {
-	device->tex_free(dscene->lookup_table);
-	dscene->lookup_table.clear();
+	dscene->lookup_table.free();
 }
 
 static size_t round_up_to_multiple(size_t size, size_t chunk)
@@ -90,7 +86,9 @@ size_t LookupTables::add_table(DeviceScene *dscene, vector<float>& data)
 	}
 
 	/* copy table data and return offset */
-	dscene->lookup_table.copy_at(&data[0], new_table.offset, data.size());
+	float *dtable = dscene->lookup_table.data();
+	memcpy(dtable + new_table.offset, &data[0], sizeof(float) * data.size());
+
 	return new_table.offset;
 }
 
diff --git a/intern/cycles/render/tables.h b/intern/cycles/render/tables.h
index 1bb70b22762..bc261c2a74d 100644
--- a/intern/cycles/render/tables.h
+++ b/intern/cycles/render/tables.h
@@ -17,7 +17,7 @@
 #ifndef __TABLES_H__
 #define __TABLES_H__
 
-#include <util_list.h>
+#include "util/util_list.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp
index a493c3fa1cd..a388f5dfc8b 100644
--- a/intern/cycles/render/tile.cpp
+++ b/intern/cycles/render/tile.cpp
@@ -14,10 +14,11 @@
  * limitations under the License.
  */
 
-#include "tile.h"
+#include "render/tile.h"
 
-#include "util_algorithm.h"
-#include "util_types.h"
+#include "util/util_algorithm.h"
+#include "util/util_foreach.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -25,37 +26,39 @@ namespace {
 
 class TileComparator {
 public:
-	TileComparator(TileOrder order, int2 center)
-	 :  order_(order),
-	    center_(center)
+	TileComparator(TileOrder order_, int2 center_, Tile *tiles_)
+	 :  order(order_),
+	    center(center_),
+	    tiles(tiles_)
 	{}
 
-	bool operator()(Tile &a, Tile &b)
+	bool operator()(int a, int b)
 	{
-		switch(order_) {
+		switch(order) {
 			case TILE_CENTER:
 			{
-				float2 dist_a = make_float2(center_.x - (a.x + a.w/2),
-				                            center_.y - (a.y + a.h/2));
-				float2 dist_b = make_float2(center_.x - (b.x + b.w/2),
-				                            center_.y - (b.y + b.h/2));
+				float2 dist_a = make_float2(center.x - (tiles[a].x + tiles[a].w/2),
+				                            center.y - (tiles[a].y + tiles[a].h/2));
+				float2 dist_b = make_float2(center.x - (tiles[b].x + tiles[b].w/2),
+				                            center.y - (tiles[b].y + tiles[b].h/2));
 				return dot(dist_a, dist_a) < dot(dist_b, dist_b);
 			}
 			case TILE_LEFT_TO_RIGHT:
-				return (a.x == b.x)? (a.y < b.y): (a.x < b.x);
+				return (tiles[a].x == tiles[b].x)? (tiles[a].y < tiles[b].y): (tiles[a].x < tiles[b].x);
 			case TILE_RIGHT_TO_LEFT:
-				return (a.x == b.x)? (a.y < b.y): (a.x > b.x);
+				return (tiles[a].x == tiles[b].x)? (tiles[a].y < tiles[b].y): (tiles[a].x > tiles[b].x);
 			case TILE_TOP_TO_BOTTOM:
-				return (a.y == b.y)? (a.x < b.x): (a.y > b.y);
+				return (tiles[a].y == tiles[b].y)? (tiles[a].x < tiles[b].x): (tiles[a].y > tiles[b].y);
 			case TILE_BOTTOM_TO_TOP:
 			default:
-				return (a.y == b.y)? (a.x < b.x): (a.y < b.y);
+				return (tiles[a].y == tiles[b].y)? (tiles[a].x < tiles[b].x): (tiles[a].y < tiles[b].y);
 		}
 	}
 
 protected:
-	TileOrder order_;
-	int2 center_;
+	TileOrder order;
+	int2 center;
+	Tile *tiles;
 };
 
 inline int2 hilbert_index_to_pos(int n, int d)
@@ -86,16 +89,19 @@ enum SpiralDirection {
 }  /* namespace */
 
 TileManager::TileManager(bool progressive_, int num_samples_, int2 tile_size_, int start_resolution_,
-                         bool preserve_tile_device_, bool background_, TileOrder tile_order_, int num_devices_)
+                         bool preserve_tile_device_, bool background_, TileOrder tile_order_,
+                         int num_devices_, int pixel_size_)
 {
 	progressive = progressive_;
 	tile_size = tile_size_;
 	tile_order = tile_order_;
 	start_resolution = start_resolution_;
+	pixel_size = pixel_size_;
 	num_samples = num_samples_;
 	num_devices = num_devices_;
 	preserve_tile_device = preserve_tile_device_;
 	background = background_;
+	schedule_denoising = false;
 
 	range_start_sample = 0;
 	range_num_samples = -1;
@@ -108,6 +114,18 @@ TileManager::~TileManager()
 {
 }
 
+void TileManager::device_free()
+{
+	if(schedule_denoising || progressive) {
+		foreach(Tile& tile, state.tiles) {
+			delete tile.buffers;
+			tile.buffers = NULL;
+		}
+	}
+	/* Tile buffers are only deleted in the two modes above; the tile list is always emptied. */
+	state.tiles.clear();
+}
+
 static int get_divider(int w, int h, int start_resolution)
 {
 	int divider = 1;
@@ -131,10 +149,11 @@ void TileManager::reset(BufferParams& params_, int num_samples_)
 	state.buffer = BufferParams();
 	state.sample = range_start_sample - 1;
 	state.num_tiles = 0;
-	state.num_rendered_tiles = 0;
 	state.num_samples = 0;
 	state.resolution_divider = get_divider(params.width, params.height, start_resolution);
-	state.tiles.clear();
+	state.render_tiles.clear();
+	state.denoising_tiles.clear();
+	device_free();
 }
 
 void TileManager::set_samples(int num_samples_)
@@ -149,15 +168,20 @@ void TileManager::set_samples(int num_samples_)
 		uint64_t pixel_samples = 0;
 		/* While rendering in the viewport, the initial preview resolution is increased to the native resolution
 		 * before the actual rendering begins. Therefore, additional pixel samples will be rendered. */
-		int divider = get_divider(params.width, params.height, start_resolution) / 2;
-		while(divider > 1) {
+		int divider = max(get_divider(params.width, params.height, start_resolution) / 2, pixel_size);
+		while(divider > pixel_size) {
 			int image_w = max(1, params.width/divider);
 			int image_h = max(1, params.height/divider);
 			pixel_samples += image_w * image_h;
 			divider >>= 1;
 		}
 
-		state.total_pixel_samples = pixel_samples + (uint64_t)get_num_effective_samples() * params.width*params.height;
+		int image_w = max(1, params.width/divider);
+		int image_h = max(1, params.height/divider);
+		state.total_pixel_samples = pixel_samples + (uint64_t)get_num_effective_samples() * image_w*image_h;
+		if(schedule_denoising) {  /* denoising adds one more unit of work per full-resolution pixel */
+			state.total_pixel_samples += (uint64_t)params.width*params.height;  /* widen before multiplying, as above */
+		}
 	}
 }
 
@@ -170,32 +194,36 @@ int TileManager::gen_tiles(bool sliced)
 	int image_h = max(1, params.height/resolution);
 	int2 center = make_int2(image_w/2, image_h/2);
 
-	state.tiles.clear();
-
 	int num_logical_devices = preserve_tile_device? num_devices: 1;
 	int num = min(image_h, num_logical_devices);
 	int slice_num = sliced? num: 1;
-	int tile_index = 0;
+	int tile_w = (tile_size.x >= image_w) ? 1 : divide_up(image_w, tile_size.x);
 
-	state.tiles.clear();
-	state.tiles.resize(num);
-	vector<list<Tile> >::iterator tile_list = state.tiles.begin();
+	device_free();
+	state.render_tiles.clear();
+	state.denoising_tiles.clear();
+	state.render_tiles.resize(num);
+	state.denoising_tiles.resize(num);
+	state.tile_stride = tile_w;
+	vector<list<int> >::iterator tile_list;
+	tile_list = state.render_tiles.begin();
 
 	if(tile_order == TILE_HILBERT_SPIRAL) {
 		assert(!sliced);
 
+		int tile_h = (tile_size.y >= image_h) ? 1 : divide_up(image_h, tile_size.y);
+		state.tiles.resize(tile_w*tile_h);
+
 		/* Size of blocks in tiles, must be a power of 2 */
 		const int hilbert_size = (max(tile_size.x, tile_size.y) <= 12)? 8: 4;
 
-		int tile_w = (tile_size.x >= image_w)? 1: (image_w + tile_size.x - 1)/tile_size.x;
-		int tile_h = (tile_size.y >= image_h)? 1: (image_h + tile_size.y - 1)/tile_size.y;
-		int tiles_per_device = (tile_w * tile_h + num - 1) / num;
+		int tiles_per_device = divide_up(tile_w * tile_h, num);
 		int cur_device = 0, cur_tiles = 0;
 
 		int2 block_size = tile_size * make_int2(hilbert_size, hilbert_size);
 		/* Number of blocks to fill the image */
-		int blocks_x = (block_size.x >= image_w)? 1: (image_w + block_size.x - 1)/block_size.x;
-		int blocks_y = (block_size.y >= image_h)? 1: (image_h + block_size.y - 1)/block_size.y;
+		int blocks_x = (block_size.x >= image_w)? 1: divide_up(image_w, block_size.x);
+		int blocks_y = (block_size.y >= image_h)? 1: divide_up(image_h, block_size.y);
 		int n = max(blocks_x, blocks_y) | 0x1; /* Side length of the spiral (must be odd) */
 		/* Offset of spiral (to keep it centered) */
 		int2 offset = make_int2((image_w - n*block_size.x)/2, (image_h - n*block_size.y)/2);
@@ -226,9 +254,11 @@ int TileManager::gen_tiles(bool sliced)
 				if(pos.x >= 0 && pos.y >= 0 && pos.x < image_w && pos.y < image_h) {
 					int w = min(tile_size.x, image_w - pos.x);
 					int h = min(tile_size.y, image_h - pos.y);
-					tile_list->push_front(Tile(tile_index, pos.x, pos.y, w, h, cur_device));
+					int2 ipos = pos / tile_size;
+					int idx = ipos.y*tile_w + ipos.x;
+					state.tiles[idx] = Tile(idx, pos.x, pos.y, w, h, cur_device, Tile::RENDER);
+					tile_list->push_front(idx);
 					cur_tiles++;
-					tile_index++;
 
 					if(cur_tiles == tiles_per_device) {
 						tile_list++;
@@ -272,27 +302,28 @@ int TileManager::gen_tiles(bool sliced)
 					break;
 			}
 		}
-		return tile_index;
+		return tile_w*tile_h;
 	}
 
+	int idx = 0;
 	for(int slice = 0; slice < slice_num; slice++) {
 		int slice_y = (image_h/slice_num)*slice;
 		int slice_h = (slice == slice_num-1)? image_h - slice*(image_h/slice_num): image_h/slice_num;
 
-		int tile_w = (tile_size.x >= image_w)? 1: (image_w + tile_size.x - 1)/tile_size.x;
-		int tile_h = (tile_size.y >= slice_h)? 1: (slice_h + tile_size.y - 1)/tile_size.y;
+		int tile_h = (tile_size.y >= slice_h)? 1: divide_up(slice_h, tile_size.y);
 
-		int tiles_per_device = (tile_w * tile_h + num - 1) / num;
+		int tiles_per_device = divide_up(tile_w * tile_h, num);
 		int cur_device = 0, cur_tiles = 0;
 
 		for(int tile_y = 0; tile_y < tile_h; tile_y++) {
-			for(int tile_x = 0; tile_x < tile_w; tile_x++, tile_index++) {
+			for(int tile_x = 0; tile_x < tile_w; tile_x++, idx++) {
 				int x = tile_x * tile_size.x;
 				int y = tile_y * tile_size.y;
 				int w = (tile_x == tile_w-1)? image_w - x: tile_size.x;
 				int h = (tile_y == tile_h-1)? slice_h - y: tile_size.y;
 
-				tile_list->push_back(Tile(tile_index, x, y + slice_y, w, h, sliced? slice: cur_device));
+				state.tiles.push_back(Tile(idx, x, y + slice_y, w, h, sliced? slice: cur_device, Tile::RENDER));
+				tile_list->push_back(idx);
 
 				if(!sliced) {
 					cur_tiles++;
@@ -300,7 +331,7 @@ int TileManager::gen_tiles(bool sliced)
 					if(cur_tiles == tiles_per_device) {
 						/* Tiles are already generated in Bottom-to-Top order, so no sort is necessary in that case. */
 						if(tile_order != TILE_BOTTOM_TO_TOP) {
-							tile_list->sort(TileComparator(tile_order, center));
+							tile_list->sort(TileComparator(tile_order, center, &state.tiles[0]));
 						}
 						tile_list++;
 						cur_tiles = 0;
@@ -314,7 +345,15 @@ int TileManager::gen_tiles(bool sliced)
 		}
 	}
 
-	return tile_index;
+	return idx;
+}
+
+void TileManager::gen_render_tiles()
+{
+	/* Regenerate just the render tiles for progressive render: the index of every tile already in state.tiles is appended to the render list of that tile's device. Nothing is cleared here and state.tiles is left untouched. */
+	foreach(Tile& tile, state.tiles) {
+		state.render_tiles[tile.device].push_back(tile.index);
+	}
 }
 
 void TileManager::set_tiles()
@@ -334,16 +373,115 @@ void TileManager::set_tiles()
 	state.buffer.full_height = max(1, params.full_height/resolution);
 }
 
-bool TileManager::next_tile(Tile& tile, int device)
+int TileManager::get_neighbor_index(int index, int neighbor)
+{
+	static const int dx[] = {-1, 0, 1, -1, 1, -1, 0, 1, 0}, dy[] = {-1, -1, -1, 0, 0, 1, 1, 1, 0};
+	/* Returns the index of the tile at grid offset (dx[neighbor], dy[neighbor]) from tile `index`, or -1 if that position lies outside the tile grid. Entries 0..7 are the eight surrounding tiles, entry 8 is (0, 0), i.e. the tile itself. neighbor is not range-checked. */
+	int resolution = state.resolution_divider;
+	int image_w = max(1, params.width/resolution);
+	int image_h = max(1, params.height/resolution);
+	int tile_w = (tile_size.x >= image_w)? 1: divide_up(image_w, tile_size.x);
+	int tile_h = (tile_size.y >= image_h)? 1: divide_up(image_h, tile_size.y); /* tile_w x tile_h: size of the tile grid at the current resolution divider. */
+	/* Grid coordinates of the requested neighbor, in units of whole tiles. */
+	int nx = state.tiles[index].x/tile_size.x + dx[neighbor], ny = state.tiles[index].y/tile_size.y + dy[neighbor];
+	if(nx < 0 || ny < 0 || nx >= tile_w || ny >= tile_h)
+		return -1;
+
+	return ny*state.tile_stride + nx; /* Tile index from grid position; tile_stride is the per-row multiplier. */
+}
+
+/* Returns true only if tile `index` and every neighbor of it that lies inside the tile grid are at least at state min_state (states compare by their enum order). A negative index, i.e. no such tile, yields false. */
+bool TileManager::check_neighbor_state(int index, Tile::State min_state)
+{
+	if(index < 0 || state.tiles[index].state < min_state) {
+		return false;
+	}
+	for(int neighbor = 0; neighbor < 9; neighbor++) {
+		int nindex = get_neighbor_index(index, neighbor);
+		/* Out-of-bounds tiles (reported as -1) don't matter. */
+		if(nindex >= 0 && state.tiles[nindex].state < min_state) {
+			return false;
+		}
+	}
+
+	return true;
+}
+
+/* Moves tile `index` on from its current state (RENDER or DENOISE). Returns whether the tile should be written (and freed if no denoising is used) instead of updating. delete_tile is set to true only when this tile itself reaches Tile::DONE. */
+bool TileManager::finish_tile(int index, bool &delete_tile)
+{
+	delete_tile = false;
+	/* Progressive rendering changes no tile state here and always returns true. */
+	if(progressive) {
+		return true;
+	}
+	/* The state the tile is still in selects which transition to apply. */
+	switch(state.tiles[index].state) {
+		case Tile::RENDER:
+		{
+			if(!schedule_denoising) {
+				state.tiles[index].state = Tile::DONE;
+				delete_tile = true;
+				return true;
+			}
+			state.tiles[index].state = Tile::RENDERED;
+			/* For each neighbor and the tile itself, check whether all of its neighbors have been rendered. If yes, it can be denoised. */
+			for(int neighbor = 0; neighbor < 9; neighbor++) {
+				int nindex = get_neighbor_index(index, neighbor);
+				if(check_neighbor_state(nindex, Tile::RENDERED)) {
+					state.tiles[nindex].state = Tile::DENOISE;
+					state.denoising_tiles[state.tiles[nindex].device].push_back(nindex);
+				}
+			}
+			return false; /* Rendered, but denoising is still outstanding. */
+		}
+		case Tile::DENOISE:
+		{
+			state.tiles[index].state = Tile::DENOISED;
+			/* For each neighbor and the tile itself, check whether all of its neighbors have been denoised. If yes, it can be freed. */
+			for(int neighbor = 0; neighbor < 9; neighbor++) {
+				int nindex = get_neighbor_index(index, neighbor);
+				if(check_neighbor_state(nindex, Tile::DENOISED)) {
+					state.tiles[nindex].state = Tile::DONE;
+					/* It can happen that the tile just finished denoising and already can be freed here.
+					 * However, in that case it still has to be written before deleting, so we can't delete it yet. */
+					if(neighbor == 8) {
+						delete_tile = true;
+					}
+					else {
+						delete state.tiles[nindex].buffers;
+						state.tiles[nindex].buffers = NULL;
+					}
+				}
+			}
+			return true;
+		}
+		default:
+			assert(false); /* Only RENDER and DENOISE are handled by this function. */
+			return true;
+	}
+}
+
+bool TileManager::next_tile(Tile* &tile, int device)
 {
 	int logical_device = preserve_tile_device? device: 0;
 
-	if((logical_device >= state.tiles.size()) || state.tiles[logical_device].empty())
+	if(logical_device >= state.render_tiles.size())
+		return false;
+	/* Tiles waiting for denoising are handed out first. tile is pointed at the entry in state.tiles, not at a copy. */
+	if(!state.denoising_tiles[logical_device].empty()) {
+		int idx = state.denoising_tiles[logical_device].front();
+		state.denoising_tiles[logical_device].pop_front();
+		tile = &state.tiles[idx];
+		return true;
+	}
+	/* No denoising work pending: take the next tile from this device's render list, if there is one. */
+	if(state.render_tiles[logical_device].empty())
 		return false;
 
-	tile = Tile(state.tiles[logical_device].front());
-	state.tiles[logical_device].pop_front();
-	state.num_rendered_tiles++;
+	int idx = state.render_tiles[logical_device].front();
+	state.render_tiles[logical_device].pop_front();
+	tile = &state.tiles[idx];
 	return true;
 }
 
@@ -352,7 +490,7 @@ bool TileManager::done()
 	int end_sample = (range_num_samples == -1)
 	                     ? num_samples
 	                     : range_start_sample + range_num_samples;
-	return (state.resolution_divider == 1) &&
+	return (state.resolution_divider == pixel_size) &&
 	       (state.sample+state.num_samples >= end_sample);
 }
 
@@ -361,9 +499,9 @@ bool TileManager::next()
 	if(done())
 		return false;
 
-	if(progressive && state.resolution_divider > 1) {
+	if(progressive && state.resolution_divider > pixel_size) {
 		state.sample = 0;
-		state.resolution_divider /= 2;
+		state.resolution_divider = max(state.resolution_divider/2, pixel_size);
 		state.num_samples = 1;
 		set_tiles();
 	}
@@ -377,8 +515,14 @@ bool TileManager::next()
 		else
 			state.num_samples = range_num_samples;
 
-		state.resolution_divider = 1;
-		set_tiles();
+		state.resolution_divider = pixel_size;
+
+		if(state.sample == range_start_sample) {
+			set_tiles();
+		}
+		else {
+			gen_render_tiles();
+		}
 	}
 
 	return true;
diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h
index 5d92ebac355..2692c7cf9f0 100644
--- a/intern/cycles/render/tile.h
+++ b/intern/cycles/render/tile.h
@@ -19,8 +19,8 @@
 
 #include <limits.h>
 
-#include "buffers.h"
-#include "util_list.h"
+#include "render/buffers.h"
+#include "util/util_list.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -31,12 +31,20 @@ public:
 	int index;
 	int x, y, w, h;
 	int device;
+	/* RENDER: The tile has to be rendered.
+	 * RENDERED: The tile has been rendered, but can't be denoised yet (waiting for neighbors).
+	 * DENOISE: The tile can be denoised now.
+	 * DENOISED: The tile has been denoised, but can't be freed yet (waiting for neighbors).
+	 * DONE: The tile is finished and has been freed. */
+	typedef enum { RENDER = 0, RENDERED, DENOISE, DENOISED, DONE } State; /* Order matters: TileManager compares states with <. */
+	State state;
+	RenderBuffers *buffers; /* NULL after the parameterized constructor; left uninitialized by Tile(). */
 
 	Tile()
 	{}
 
-	Tile(int index_, int x_, int y_, int w_, int h_, int device_)
-	: index(index_), x(x_), y(y_), w(w_), h(h_), device(device_) {}
+	Tile(int index_, int x_, int y_, int w_, int h_, int device_, State state_ = RENDER)
+	: index(index_), x(x_), y(y_), w(w_), h(h_), device(device_), state(state_), buffers(NULL) {}
 };
 
 /* Tile order */
@@ -58,31 +66,37 @@ public:
 	BufferParams params;
 
 	struct State {
+		vector<Tile> tiles; /* Tile storage; the index lists below refer to entries of this vector. */
+		int tile_stride; /* Multiplier applied to a tile's grid row when computing its index (see get_neighbor_index()). */
 		BufferParams buffer;
 		int sample;
 		int num_samples;
 		int resolution_divider;
 		int num_tiles;
-		int num_rendered_tiles;
 
 		/* Total samples over all pixels: Generally num_samples*num_pixels,
 		 * but can be higher due to the initial resolution division for previews. */
 		uint64_t total_pixel_samples;
-		/* This vector contains a list of tiles for every logical device in the session.
-		 * In each list, the tiles are sorted according to the tile order setting. */
-		vector<list<Tile> > tiles;
+
+		/* These lists contain the indices of the tiles to be rendered/denoised and are used
+		 * when acquiring a new tile for the device.
+		 * Each list in each vector is for one logical device. */
+		vector<list<int> > render_tiles;
+		vector<list<int> > denoising_tiles;
 	} state;
 
 	int num_samples;
 
 	TileManager(bool progressive, int num_samples, int2 tile_size, int start_resolution,
-	            bool preserve_tile_device, bool background, TileOrder tile_order, int num_devices = 1);
+	            bool preserve_tile_device, bool background, TileOrder tile_order, int num_devices = 1, int pixel_size = 1);
 	~TileManager();
 
+	void device_free();
 	void reset(BufferParams& params, int num_samples);
 	void set_samples(int num_samples);
 	bool next();
-	bool next_tile(Tile& tile, int device = 0);
+	bool next_tile(Tile* &tile, int device = 0);
+	bool finish_tile(int index, bool& delete_tile);
 	bool done();
 
 	void set_tile_order(TileOrder tile_order_) { tile_order = tile_order_; }
@@ -97,6 +111,9 @@ public:
 
 	/* Get number of actual samples to render. */
 	int get_num_effective_samples();
+
+	/* Schedule tiles for denoising after they've been rendered. */
+	bool schedule_denoising;
 protected:
 
 	void set_tiles();
@@ -105,6 +122,7 @@ protected:
 	int2 tile_size;
 	TileOrder tile_order;
 	int start_resolution;
+	int pixel_size;
 	int num_devices;
 
 	/* in some cases it is important that the same tile will be returned for the same
@@ -128,6 +146,10 @@ protected:
 
 	/* Generate tile list, return number of tiles. */
 	int gen_tiles(bool sliced);
+	void gen_render_tiles();
+
+	int get_neighbor_index(int index, int neighbor);
+	bool check_neighbor_state(int index, Tile::State state);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/subd/CMakeLists.txt b/intern/cycles/subd/CMakeLists.txt
index dafb807bdf3..7f952dd43ce 100644
--- a/intern/cycles/subd/CMakeLists.txt
+++ b/intern/cycles/subd/CMakeLists.txt
@@ -1,11 +1,6 @@
 
 set(INC
-	.
-	../graph
-	../kernel
-	../kernel/svm
-	../render
-	../util
+	..
 )
 
 set(INC_SYS
@@ -29,4 +24,4 @@ set(SRC_HEADERS
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
-add_library(cycles_subd ${SRC} ${SRC_HEADERS})
+cycles_add_library(cycles_subd ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/subd/subd_dice.cpp b/intern/cycles/subd/subd_dice.cpp
index a1bd349b167..8c426da4acf 100644
--- a/intern/cycles/subd/subd_dice.cpp
+++ b/intern/cycles/subd/subd_dice.cpp
@@ -14,13 +14,11 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "mesh.h"
+#include "render/camera.h"
+#include "render/mesh.h"
 
-#include "subd_dice.h"
-#include "subd_patch.h"
-
-#include "util_debug.h"
+#include "subd/subd_dice.h"
+#include "subd/subd_patch.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/subd/subd_dice.h b/intern/cycles/subd/subd_dice.h
index 33d13a4ab3a..c0e32be18c4 100644
--- a/intern/cycles/subd/subd_dice.h
+++ b/intern/cycles/subd/subd_dice.h
@@ -22,8 +22,8 @@
  * DiagSplit. For more algorithm details, see the DiagSplit paper or the
  * ARB_tessellation_shader OpenGL extension, Section 2.X.2. */
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/subd/subd_patch.cpp b/intern/cycles/subd/subd_patch.cpp
index d3319c5ccf5..fa2fe2bf113 100644
--- a/intern/cycles/subd/subd_patch.cpp
+++ b/intern/cycles/subd/subd_patch.cpp
@@ -16,12 +16,12 @@
 
 /* Parts adapted from code in the public domain in NVidia Mesh Tools. */
 
-#include "mesh.h"
+#include "render/mesh.h"
 
-#include "subd_patch.h"
+#include "subd/subd_patch.h"
 
-#include "util_math.h"
-#include "util_types.h"
+#include "util/util_math.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/subd/subd_patch.h b/intern/cycles/subd/subd_patch.h
index 360c1abf27b..1bb81588835 100644
--- a/intern/cycles/subd/subd_patch.h
+++ b/intern/cycles/subd/subd_patch.h
@@ -17,8 +17,8 @@
 #ifndef __SUBD_PATCH_H__
 #define __SUBD_PATCH_H__
 
-#include "util_boundbox.h"
-#include "util_types.h"
+#include "util/util_boundbox.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/subd/subd_patch_table.cpp b/intern/cycles/subd/subd_patch_table.cpp
index d437b045c07..63bf673a90b 100644
--- a/intern/cycles/subd/subd_patch_table.cpp
+++ b/intern/cycles/subd/subd_patch_table.cpp
@@ -25,10 +25,10 @@
  *
  */
 
-#include "subd_patch_table.h"
-#include "kernel_types.h"
+#include "subd/subd_patch_table.h"
+#include "kernel/kernel_types.h"
 
-#include "util_math.h"
+#include "util/util_math.h"
 
 #ifdef WITH_OPENSUBDIV
 #include <opensubdiv/far/patchTable.h>
diff --git a/intern/cycles/subd/subd_patch_table.h b/intern/cycles/subd/subd_patch_table.h
index 3166a1691d8..907f2dd6c28 100644
--- a/intern/cycles/subd/subd_patch_table.h
+++ b/intern/cycles/subd/subd_patch_table.h
@@ -17,8 +17,8 @@
 #ifndef __SUBD_PATCH_TABLE_H__
 #define __SUBD_PATCH_TABLE_H__
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 #ifdef WITH_OPENSUBDIV
 #ifdef _MSC_VER
diff --git a/intern/cycles/subd/subd_split.cpp b/intern/cycles/subd/subd_split.cpp
index 3c91ad8ab0d..d3bed6a5c53 100644
--- a/intern/cycles/subd/subd_split.cpp
+++ b/intern/cycles/subd/subd_split.cpp
@@ -14,16 +14,15 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "mesh.h"
+#include "render/camera.h"
+#include "render/mesh.h"
 
-#include "subd_dice.h"
-#include "subd_patch.h"
-#include "subd_split.h"
+#include "subd/subd_dice.h"
+#include "subd/subd_patch.h"
+#include "subd/subd_split.h"
 
-#include "util_debug.h"
-#include "util_math.h"
-#include "util_types.h"
+#include "util/util_math.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/subd/subd_split.h b/intern/cycles/subd/subd_split.h
index a2f76dd2e03..f869cc6a48e 100644
--- a/intern/cycles/subd/subd_split.h
+++ b/intern/cycles/subd/subd_split.h
@@ -22,10 +22,10 @@
  * evaluation at arbitrary points is required for this to work. See the paper
  * for more details. */
 
-#include "subd_dice.h"
+#include "subd/subd_dice.h"
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/test/CMakeLists.txt b/intern/cycles/test/CMakeLists.txt
index 80564c33be6..f3e49dc0c4e 100644
--- a/intern/cycles/test/CMakeLists.txt
+++ b/intern/cycles/test/CMakeLists.txt
@@ -22,12 +22,17 @@ set(INC
 )
 
 set(ALL_CYCLES_LIBRARIES
-	cycles_render
 	cycles_device
+	cycles_kernel
+	cycles_render
 	cycles_bvh
 	cycles_graph
 	cycles_subd
 	cycles_util
+	extern_clew
+	${BLENDER_GL_LIBRARIES}
+	${BLENDER_GLEW_LIBRARIES}
+	${CYCLES_APP_GLEW_LIBRARY}
 	${OPENIMAGEIO_LIBRARIES}
 )
 if(WITH_CYCLES_OSL)
@@ -37,10 +42,12 @@ if(WITH_CYCLES_OSL)
 		${LLVM_LIBRARIES}
 	)
 endif()
-if(WITH_IMAGE_OPENJPEG AND NOT WITH_SYSTEM_OPENJPEG)
-	list(APPEND ALL_CYCLES_LIBRARIES
-		extern_openjpeg
-	)
+if(WITH_IMAGE_OPENJPEG)
+	if(WITH_SYSTEM_OPENJPEG)
+		list(APPEND ALL_CYCLES_LIBRARIES ${OPENJPEG_LIBRARIES})
+	else()
+		list(APPEND ALL_CYCLES_LIBRARIES extern_openjpeg)
+	endif()
 endif()
 if(WITH_CYCLES_OPENSUBDIV)
 	add_definitions(-DWITH_OPENSUBDIV)
@@ -52,6 +59,15 @@ if(WITH_CYCLES_OPENSUBDIV)
 		${OPENSUBDIV_LIBRARIES}
 	)
 endif()
+if(WITH_CUDA_DYNLOAD) # With WITH_CUDA_DYNLOAD set, link extern_cuew instead of ${CUDA_CUDA_LIBRARY}.
+	list(APPEND ALL_CYCLES_LIBRARIES extern_cuew)
+else()
+	list(APPEND ALL_CYCLES_LIBRARIES ${CUDA_CUDA_LIBRARY})
+endif()
+if(NOT CYCLES_STANDALONE_REPOSITORY) # Extra libraries, appended only when CYCLES_STANDALONE_REPOSITORY is not set.
+	list(APPEND ALL_CYCLES_LIBRARIES bf_intern_glew_mx bf_intern_guardedalloc ${GLEW_LIBRARY})
+endif()
+
 list(APPEND ALL_CYCLES_LIBRARIES
 	${BOOST_LIBRARIES}
 	${PNG_LIBRARIES}
diff --git a/intern/cycles/test/render_graph_finalize_test.cpp b/intern/cycles/test/render_graph_finalize_test.cpp
index 3fc086cbc0c..b66a91adbda 100644
--- a/intern/cycles/test/render_graph_finalize_test.cpp
+++ b/intern/cycles/test/render_graph_finalize_test.cpp
@@ -74,7 +74,7 @@ protected:
 
 class ShaderGraphBuilder {
 public:
-	explicit ShaderGraphBuilder(ShaderGraph *graph)
+	ShaderGraphBuilder(ShaderGraph *graph)
 	  : graph_(graph)
 	{
 		node_map_["Output"] = graph->output();
@@ -155,15 +155,39 @@ protected:
 
 }  // namespace
 
-#define DEFINE_COMMON_VARIABLES(builder_name, mock_log_name) \
-	util_logging_start(); \
-	util_logging_verbosity_set(1); \
-	ScopedMockLog mock_log_name; \
-	DeviceInfo device_info; \
-	SceneParams scene_params; \
-	Scene scene(scene_params, device_info); \
-	ShaderGraph graph; \
-	ShaderGraphBuilder builder(&graph); \
+class RenderGraph : public testing::Test /* Fixture behind the TEST_F(RenderGraph, ...) cases: provides log, scene, graph and builder to each test. */
+{
+protected:
+	ScopedMockLog log;
+	Stats stats;
+	DeviceInfo device_info;
+	Device *device_cpu;
+	SceneParams scene_params;
+	Scene *scene;
+	ShaderGraph graph;
+	ShaderGraphBuilder builder;
+	/* builder is bound to graph. graph is declared before builder, so it is already constructed when builder's constructor uses it. */
+	RenderGraph()
+	        : testing::Test(),
+	          builder(&graph)
+	{
+	}
+	/* Enables logging at verbosity 1, then creates the device and a scene that uses it. device_info is left default-constructed (presumably selecting the CPU device, going by the member name - confirm). */
+	virtual void SetUp()
+	{
+		util_logging_start();
+		util_logging_verbosity_set(1);
+
+		device_cpu = Device::create(device_info, stats, true);
+		scene = new Scene(scene_params, device_cpu);
+	}
+	/* Deletes the scene first, then the device the scene was created with. */
+	virtual void TearDown()
+	{
+		delete scene;
+		delete device_cpu;
+	}
+};
 
 #define EXPECT_ANY_MESSAGE(log) \
 	EXPECT_CALL(log, Log(_, _, _)).Times(AnyNumber()); \
@@ -177,10 +201,8 @@ protected:
 /*
  * Test deduplication of nodes that have inputs, some of them folded.
  */
-TEST(render_graph, deduplicate_deep)
+TEST_F(RenderGraph, deduplicate_deep)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Value1::Value to constant (0.8).");
 	CORRECT_INFO_MESSAGE(log, "Folding Value2::Value to constant (0.8).");
@@ -206,7 +228,7 @@ TEST(render_graph, deduplicate_deep)
 		.add_connection("Noise2::Color", "Mix::Color2")
 		.output_color("Mix::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 
 	EXPECT_EQ(graph.nodes.size(), 5);
 }
@@ -214,10 +236,8 @@ TEST(render_graph, deduplicate_deep)
 /*
  * Test RGB to BW node.
  */
-TEST(render_graph, constant_fold_rgb_to_bw)
+TEST_F(RenderGraph, constant_fold_rgb_to_bw)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding RGBToBWNodeNode::Val to constant (0.8).");
 	CORRECT_INFO_MESSAGE(log, "Folding convert_float_to_color::value_color to constant (0.8, 0.8, 0.8).");
@@ -227,17 +247,15 @@ TEST(render_graph, constant_fold_rgb_to_bw)
 		          .set("Color", make_float3(0.8f, 0.8f, 0.8f)))
 		.output_color("RGBToBWNodeNode::Val");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - folding of Emission nodes that don't emit to nothing.
  */
-TEST(render_graph, constant_fold_emission1)
+TEST_F(RenderGraph, constant_fold_emission1)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Discarding closure Emission.");
 
@@ -246,13 +264,11 @@ TEST(render_graph, constant_fold_emission1)
 		          .set("Color", make_float3(0.0f, 0.0f, 0.0f)))
 		.output_closure("Emission::Emission");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
-TEST(render_graph, constant_fold_emission2)
+TEST_F(RenderGraph, constant_fold_emission2)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Discarding closure Emission.");
 
@@ -261,17 +277,15 @@ TEST(render_graph, constant_fold_emission2)
 		          .set("Strength", 0.0f))
 		.output_closure("Emission::Emission");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - folding of Background nodes that don't emit to nothing.
  */
-TEST(render_graph, constant_fold_background1)
+TEST_F(RenderGraph, constant_fold_background1)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Discarding closure Background.");
 
@@ -280,13 +294,11 @@ TEST(render_graph, constant_fold_background1)
 		          .set("Color", make_float3(0.0f, 0.0f, 0.0f)))
 		.output_closure("Background::Background");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
-TEST(render_graph, constant_fold_background2)
+TEST_F(RenderGraph, constant_fold_background2)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Discarding closure Background.");
 
@@ -295,17 +307,15 @@ TEST(render_graph, constant_fold_background2)
 		          .set("Strength", 0.0f))
 		.output_closure("Background::Background");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - Folding of Add Closure with only one input.
  */
-TEST(render_graph, constant_fold_shader_add)
+TEST_F(RenderGraph, constant_fold_shader_add)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding AddClosure1::Closure to socket Diffuse::BSDF.");
 	CORRECT_INFO_MESSAGE(log, "Folding AddClosure2::Closure to socket Diffuse::BSDF.");
@@ -322,7 +332,7 @@ TEST(render_graph, constant_fold_shader_add)
 		.add_connection("AddClosure2::Closure", "AddClosure3::Closure2")
 		.output_closure("AddClosure3::Closure");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
@@ -330,10 +340,8 @@ TEST(render_graph, constant_fold_shader_add)
  *  - Folding of Mix Closure with 0 or 1 fac.
  *  - Folding of Mix Closure with both inputs folded to the same node.
  */
-TEST(render_graph, constant_fold_shader_mix)
+TEST_F(RenderGraph, constant_fold_shader_mix)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding MixClosure1::Closure to socket Diffuse::BSDF.");
 	CORRECT_INFO_MESSAGE(log, "Folding MixClosure2::Closure to socket Diffuse::BSDF.");
@@ -357,17 +365,15 @@ TEST(render_graph, constant_fold_shader_mix)
 		.add_connection("MixClosure2::Closure", "MixClosure3::Closure2")
 		.output_closure("MixClosure3::Closure");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - Folding of Invert with all constant inputs.
  */
-TEST(render_graph, constant_fold_invert)
+TEST_F(RenderGraph, constant_fold_invert)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Invert::Color to constant (0.68, 0.5, 0.32).");
 
@@ -377,17 +383,15 @@ TEST(render_graph, constant_fold_invert)
 		          .set("Color", make_float3(0.2f, 0.5f, 0.8f)))
 		.output_color("Invert::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - Folding of Invert with zero Fac.
  */
-TEST(render_graph, constant_fold_invert_fac_0)
+TEST_F(RenderGraph, constant_fold_invert_fac_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Invert::Color to socket Attribute::Color.");
 
@@ -398,17 +402,15 @@ TEST(render_graph, constant_fold_invert_fac_0)
 		.add_connection("Attribute::Color", "Invert::Color")
 		.output_color("Invert::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - Folding of Invert with zero Fac and constant input.
  */
-TEST(render_graph, constant_fold_invert_fac_0_const)
+TEST_F(RenderGraph, constant_fold_invert_fac_0_const)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Invert::Color to constant (0.2, 0.5, 0.8).");
 
@@ -418,17 +420,15 @@ TEST(render_graph, constant_fold_invert_fac_0_const)
 		          .set("Color", make_float3(0.2f, 0.5f, 0.8f)))
 		.output_color("Invert::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - Folding of MixRGB Add with all constant inputs (clamp false).
  */
-TEST(render_graph, constant_fold_mix_add)
+TEST_F(RenderGraph, constant_fold_mix_add)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding MixAdd::Color to constant (0.62, 1.14, 1.42).");
 
@@ -441,17 +441,15 @@ TEST(render_graph, constant_fold_mix_add)
 		          .set("Color2", make_float3(0.4, 0.8, 0.9)))
 		.output_color("MixAdd::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - Folding of MixRGB Add with all constant inputs (clamp true).
  */
-TEST(render_graph, constant_fold_mix_add_clamp)
+TEST_F(RenderGraph, constant_fold_mix_add_clamp)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding MixAdd::Color to constant (0.62, 1, 1).");
 
@@ -464,17 +462,15 @@ TEST(render_graph, constant_fold_mix_add_clamp)
 		          .set("Color2", make_float3(0.4, 0.8, 0.9)))
 		.output_color("MixAdd::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - No folding on fac 0 for dodge.
  */
-TEST(render_graph, constant_fold_part_mix_dodge_no_fac_0)
+TEST_F(RenderGraph, constant_fold_part_mix_dodge_no_fac_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	INVALID_INFO_MESSAGE(log, "Folding ");
 
@@ -489,17 +485,15 @@ TEST(render_graph, constant_fold_part_mix_dodge_no_fac_0)
 		.add_connection("Attribute2::Color", "Mix::Color2")
 		.output_color("Mix::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - No folding on fac 0 for light.
  */
-TEST(render_graph, constant_fold_part_mix_light_no_fac_0)
+TEST_F(RenderGraph, constant_fold_part_mix_light_no_fac_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	INVALID_INFO_MESSAGE(log, "Folding ");
 
@@ -514,17 +508,15 @@ TEST(render_graph, constant_fold_part_mix_light_no_fac_0)
 		.add_connection("Attribute2::Color", "Mix::Color2")
 		.output_color("Mix::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - No folding on fac 0 for burn.
  */
-TEST(render_graph, constant_fold_part_mix_burn_no_fac_0)
+TEST_F(RenderGraph, constant_fold_part_mix_burn_no_fac_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	INVALID_INFO_MESSAGE(log, "Folding ");
 
@@ -539,17 +531,15 @@ TEST(render_graph, constant_fold_part_mix_burn_no_fac_0)
 		.add_connection("Attribute2::Color", "Mix::Color2")
 		.output_color("Mix::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - No folding on fac 0 for clamped blend.
  */
-TEST(render_graph, constant_fold_part_mix_blend_clamped_no_fac_0)
+TEST_F(RenderGraph, constant_fold_part_mix_blend_clamped_no_fac_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	INVALID_INFO_MESSAGE(log, "Folding ");
 
@@ -564,7 +554,7 @@ TEST(render_graph, constant_fold_part_mix_blend_clamped_no_fac_0)
 		.add_connection("Attribute2::Color", "Mix::Color2")
 		.output_color("Mix::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
@@ -572,10 +562,8 @@ TEST(render_graph, constant_fold_part_mix_blend_clamped_no_fac_0)
  *  - Folding of Mix with 0 or 1 Fac.
  *  - Folding of Mix with both inputs folded to the same node.
  */
-TEST(render_graph, constant_fold_part_mix_blend)
+TEST_F(RenderGraph, constant_fold_part_mix_blend)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding MixBlend1::Color to socket Attribute1::Color.");
 	CORRECT_INFO_MESSAGE(log, "Folding MixBlend2::Color to socket Attribute1::Color.");
@@ -607,17 +595,15 @@ TEST(render_graph, constant_fold_part_mix_blend)
 		.add_connection("MixBlend2::Color", "MixBlend3::Color2")
 		.output_color("MixBlend3::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - NOT folding of MixRGB Sub with the same inputs and fac NOT 1.
  */
-TEST(render_graph, constant_fold_part_mix_sub_same_fac_bad)
+TEST_F(RenderGraph, constant_fold_part_mix_sub_same_fac_bad)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	INVALID_INFO_MESSAGE(log, "Folding Mix::");
 
@@ -631,17 +617,15 @@ TEST(render_graph, constant_fold_part_mix_sub_same_fac_bad)
 		.add_connection("Attribute::Color", "Mix::Color2")
 		.output_color("Mix::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - Folding of MixRGB Sub with the same inputs and fac 1.
  */
-TEST(render_graph, constant_fold_part_mix_sub_same_fac_1)
+TEST_F(RenderGraph, constant_fold_part_mix_sub_same_fac_1)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Mix::Color to constant (0, 0, 0).");
 
@@ -655,7 +639,7 @@ TEST(render_graph, constant_fold_part_mix_sub_same_fac_1)
 		.add_connection("Attribute::Color", "Mix::Color2")
 		.output_color("Mix::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
@@ -717,10 +701,8 @@ static void build_mix_partial_test_graph(ShaderGraphBuilder &builder, NodeMix ty
 /*
  * Tests: partial folding for RGB Add with known 0.
  */
-TEST(render_graph, constant_fold_part_mix_add_0)
+TEST_F(RenderGraph, constant_fold_part_mix_add_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	/* 0 + X (fac 1) == X */
 	INVALID_INFO_MESSAGE(log, "Folding Mix_Cx_Fx::Color");
@@ -731,16 +713,14 @@ TEST(render_graph, constant_fold_part_mix_add_0)
 	INVALID_INFO_MESSAGE(log, "Folding Out");
 
 	build_mix_partial_test_graph(builder, NODE_MIX_ADD, make_float3(0, 0, 0));
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: partial folding for RGB Sub with known 0.
  */
-TEST(render_graph, constant_fold_part_mix_sub_0)
+TEST_F(RenderGraph, constant_fold_part_mix_sub_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	INVALID_INFO_MESSAGE(log, "Folding Mix_Cx_Fx::Color");
 	INVALID_INFO_MESSAGE(log, "Folding Mix_Cx_F1::Color");
@@ -750,16 +730,14 @@ TEST(render_graph, constant_fold_part_mix_sub_0)
 	INVALID_INFO_MESSAGE(log, "Folding Out");
 
 	build_mix_partial_test_graph(builder, NODE_MIX_SUB, make_float3(0, 0, 0));
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: partial folding for RGB Mul with known 1.
  */
-TEST(render_graph, constant_fold_part_mix_mul_1)
+TEST_F(RenderGraph, constant_fold_part_mix_mul_1)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	/* 1 * X (fac 1) == X */
 	INVALID_INFO_MESSAGE(log, "Folding Mix_Cx_Fx::Color");
@@ -770,16 +748,14 @@ TEST(render_graph, constant_fold_part_mix_mul_1)
 	INVALID_INFO_MESSAGE(log, "Folding Out");
 
 	build_mix_partial_test_graph(builder, NODE_MIX_MUL, make_float3(1, 1, 1));
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: partial folding for RGB Div with known 1.
  */
-TEST(render_graph, constant_fold_part_mix_div_1)
+TEST_F(RenderGraph, constant_fold_part_mix_div_1)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	INVALID_INFO_MESSAGE(log, "Folding Mix_Cx_Fx::Color");
 	INVALID_INFO_MESSAGE(log, "Folding Mix_Cx_F1::Color");
@@ -789,16 +765,14 @@ TEST(render_graph, constant_fold_part_mix_div_1)
 	INVALID_INFO_MESSAGE(log, "Folding Out");
 
 	build_mix_partial_test_graph(builder, NODE_MIX_DIV, make_float3(1, 1, 1));
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: partial folding for RGB Mul with known 0.
  */
-TEST(render_graph, constant_fold_part_mix_mul_0)
+TEST_F(RenderGraph, constant_fold_part_mix_mul_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	/* 0 * ? (fac ?) == 0 */
 	CORRECT_INFO_MESSAGE(log, "Folding Mix_Cx_Fx::Color to constant (0, 0, 0).");
@@ -811,16 +785,14 @@ TEST(render_graph, constant_fold_part_mix_mul_0)
 	INVALID_INFO_MESSAGE(log, "Folding Out1234");
 
 	build_mix_partial_test_graph(builder, NODE_MIX_MUL, make_float3(0, 0, 0));
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: partial folding for RGB Div with known 0.
  */
-TEST(render_graph, constant_fold_part_mix_div_0)
+TEST_F(RenderGraph, constant_fold_part_mix_div_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	/* 0 / ? (fac ?) == 0 */
 	CORRECT_INFO_MESSAGE(log, "Folding Mix_Cx_Fx::Color to constant (0, 0, 0).");
@@ -832,16 +804,14 @@ TEST(render_graph, constant_fold_part_mix_div_0)
 	INVALID_INFO_MESSAGE(log, "Folding Out1234");
 
 	build_mix_partial_test_graph(builder, NODE_MIX_DIV, make_float3(0, 0, 0));
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: Separate/Combine RGB with all constant inputs.
  */
-TEST(render_graph, constant_fold_separate_combine_rgb)
+TEST_F(RenderGraph, constant_fold_separate_combine_rgb)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding SeparateRGB::R to constant (0.3).");
 	CORRECT_INFO_MESSAGE(log, "Folding SeparateRGB::G to constant (0.5).");
@@ -857,16 +827,14 @@ TEST(render_graph, constant_fold_separate_combine_rgb)
 		.add_connection("SeparateRGB::B", "CombineRGB::B")
 		.output_color("CombineRGB::Image");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: Separate/Combine XYZ with all constant inputs.
  */
-TEST(render_graph, constant_fold_separate_combine_xyz)
+TEST_F(RenderGraph, constant_fold_separate_combine_xyz)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding SeparateXYZ::X to constant (0.3).");
 	CORRECT_INFO_MESSAGE(log, "Folding SeparateXYZ::Y to constant (0.5).");
@@ -883,16 +851,14 @@ TEST(render_graph, constant_fold_separate_combine_xyz)
 		.add_connection("SeparateXYZ::Z", "CombineXYZ::Z")
 		.output_color("CombineXYZ::Vector");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: Separate/Combine HSV with all constant inputs.
  */
-TEST(render_graph, constant_fold_separate_combine_hsv)
+TEST_F(RenderGraph, constant_fold_separate_combine_hsv)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding SeparateHSV::H to constant (0.583333).");
 	CORRECT_INFO_MESSAGE(log, "Folding SeparateHSV::S to constant (0.571429).");
@@ -908,16 +874,14 @@ TEST(render_graph, constant_fold_separate_combine_hsv)
 		.add_connection("SeparateHSV::V", "CombineHSV::V")
 		.output_color("CombineHSV::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: Gamma with all constant inputs.
  */
-TEST(render_graph, constant_fold_gamma)
+TEST_F(RenderGraph, constant_fold_gamma)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Gamma::Color to constant (0.164317, 0.353553, 0.585662).");
 
@@ -927,16 +891,14 @@ TEST(render_graph, constant_fold_gamma)
 		          .set("Gamma", 1.5f))
 		.output_color("Gamma::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: Gamma with one constant 0 input.
  */
-TEST(render_graph, constant_fold_gamma_part_0)
+TEST_F(RenderGraph, constant_fold_gamma_part_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	INVALID_INFO_MESSAGE(log, "Folding Gamma_Cx::");
 	CORRECT_INFO_MESSAGE(log, "Folding Gamma_xC::Color to constant (1, 1, 1).");
@@ -960,16 +922,14 @@ TEST(render_graph, constant_fold_gamma_part_0)
 		.add_connection("Gamma_xC::Color", "Out::Color2")
 		.output_color("Out::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: Gamma with one constant 1 input.
  */
-TEST(render_graph, constant_fold_gamma_part_1)
+TEST_F(RenderGraph, constant_fold_gamma_part_1)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Gamma_Cx::Color to constant (1, 1, 1).");
 	CORRECT_INFO_MESSAGE(log, "Folding Gamma_xC::Color to socket Attribute::Color.");
@@ -993,16 +953,14 @@ TEST(render_graph, constant_fold_gamma_part_1)
 		.add_connection("Gamma_xC::Color", "Out::Color2")
 		.output_color("Out::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: BrightnessContrast with all constant inputs.
  */
-TEST(render_graph, constant_fold_bright_contrast)
+TEST_F(RenderGraph, constant_fold_bright_contrast)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding BrightContrast::Color to constant (0.16, 0.6, 1.04).");
 
@@ -1013,16 +971,14 @@ TEST(render_graph, constant_fold_bright_contrast)
 		          .set("Contrast", 1.2f))
 		.output_color("BrightContrast::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: blackbody with all constant inputs.
  */
-TEST(render_graph, constant_fold_blackbody)
+TEST_F(RenderGraph, constant_fold_blackbody)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Blackbody::Color to constant (3.94163, 0.226523, 0).");
 
@@ -1031,16 +987,14 @@ TEST(render_graph, constant_fold_blackbody)
 		          .set("Temperature", 1200.0f))
 		.output_color("Blackbody::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: Math with all constant inputs (clamp false).
  */
-TEST(render_graph, constant_fold_math)
+TEST_F(RenderGraph, constant_fold_math)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Math::Value to constant (1.6).");
 
@@ -1052,16 +1006,14 @@ TEST(render_graph, constant_fold_math)
 		          .set("Value2", 0.9f))
 		.output_value("Math::Value");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: Math with all constant inputs (clamp true).
  */
-TEST(render_graph, constant_fold_math_clamp)
+TEST_F(RenderGraph, constant_fold_math_clamp)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Math::Value to constant (1).");
 
@@ -1073,7 +1025,7 @@ TEST(render_graph, constant_fold_math_clamp)
 		          .set("Value2", 0.9f))
 		.output_value("Math::Value");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
@@ -1108,10 +1060,8 @@ static void build_math_partial_test_graph(ShaderGraphBuilder &builder, NodeMath
 /*
  * Tests: partial folding for Math Add with known 0.
  */
-TEST(render_graph, constant_fold_part_math_add_0)
+TEST_F(RenderGraph, constant_fold_part_math_add_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	/* X + 0 == 0 + X == X */
 	CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to socket Attribute::Fac.");
@@ -1119,16 +1069,14 @@ TEST(render_graph, constant_fold_part_math_add_0)
 	INVALID_INFO_MESSAGE(log, "Folding Out::");
 
 	build_math_partial_test_graph(builder, NODE_MATH_ADD, 0.0f);
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: partial folding for Math Sub with known 0.
  */
-TEST(render_graph, constant_fold_part_math_sub_0)
+TEST_F(RenderGraph, constant_fold_part_math_sub_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	/* X - 0 == X */
 	INVALID_INFO_MESSAGE(log, "Folding Math_Cx::");
@@ -1136,16 +1084,14 @@ TEST(render_graph, constant_fold_part_math_sub_0)
 	INVALID_INFO_MESSAGE(log, "Folding Out::");
 
 	build_math_partial_test_graph(builder, NODE_MATH_SUBTRACT, 0.0f);
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: partial folding for Math Mul with known 1.
  */
-TEST(render_graph, constant_fold_part_math_mul_1)
+TEST_F(RenderGraph, constant_fold_part_math_mul_1)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	/* X * 1 == 1 * X == X */
 	CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to socket Attribute::Fac.");
@@ -1153,16 +1099,14 @@ TEST(render_graph, constant_fold_part_math_mul_1)
 	INVALID_INFO_MESSAGE(log, "Folding Out::");
 
 	build_math_partial_test_graph(builder, NODE_MATH_MULTIPLY, 1.0f);
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: partial folding for Math Div with known 1.
  */
-TEST(render_graph, constant_fold_part_math_div_1)
+TEST_F(RenderGraph, constant_fold_part_math_div_1)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	/* X / 1 == X */
 	INVALID_INFO_MESSAGE(log, "Folding Math_Cx::");
@@ -1170,16 +1114,14 @@ TEST(render_graph, constant_fold_part_math_div_1)
 	INVALID_INFO_MESSAGE(log, "Folding Out::");
 
 	build_math_partial_test_graph(builder, NODE_MATH_DIVIDE, 1.0f);
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: partial folding for Math Mul with known 0.
  */
-TEST(render_graph, constant_fold_part_math_mul_0)
+TEST_F(RenderGraph, constant_fold_part_math_mul_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	/* X * 0 == 0 * X == 0 */
 	CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to constant (0).");
@@ -1188,16 +1130,14 @@ TEST(render_graph, constant_fold_part_math_mul_0)
 	CORRECT_INFO_MESSAGE(log, "Discarding closure EmissionNode.");
 
 	build_math_partial_test_graph(builder, NODE_MATH_MULTIPLY, 0.0f);
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: partial folding for Math Div with known 0.
  */
-TEST(render_graph, constant_fold_part_math_div_0)
+TEST_F(RenderGraph, constant_fold_part_math_div_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	/* 0 / X == 0 */
 	CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to constant (0).");
@@ -1205,16 +1145,14 @@ TEST(render_graph, constant_fold_part_math_div_0)
 	INVALID_INFO_MESSAGE(log, "Folding Out::");
 
 	build_math_partial_test_graph(builder, NODE_MATH_DIVIDE, 0.0f);
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: partial folding for Math Power with known 0.
  */
-TEST(render_graph, constant_fold_part_math_pow_0)
+TEST_F(RenderGraph, constant_fold_part_math_pow_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	/* X ^ 0 == 1 */
 	INVALID_INFO_MESSAGE(log, "Folding Math_Cx::");
@@ -1222,16 +1160,14 @@ TEST(render_graph, constant_fold_part_math_pow_0)
 	INVALID_INFO_MESSAGE(log, "Folding Out::");
 
 	build_math_partial_test_graph(builder, NODE_MATH_POWER, 0.0f);
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: partial folding for Math Power with known 1.
  */
-TEST(render_graph, constant_fold_part_math_pow_1)
+TEST_F(RenderGraph, constant_fold_part_math_pow_1)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	/* 1 ^ X == 1; X ^ 1 == X */
 	CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to constant (1)");
@@ -1239,16 +1175,14 @@ TEST(render_graph, constant_fold_part_math_pow_1)
 	INVALID_INFO_MESSAGE(log, "Folding Out::");
 
 	build_math_partial_test_graph(builder, NODE_MATH_POWER, 1.0f);
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: Vector Math with all constant inputs.
  */
-TEST(render_graph, constant_fold_vector_math)
+TEST_F(RenderGraph, constant_fold_vector_math)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding VectorMath::Value to constant (1).");
 	CORRECT_INFO_MESSAGE(log, "Folding VectorMath::Vector to constant (3, 0, 0).");
@@ -1267,7 +1201,7 @@ TEST(render_graph, constant_fold_vector_math)
 		.add_connection("VectorMath::Value", "Math::Value2")
 		.output_color("Math::Value");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
@@ -1299,10 +1233,8 @@ static void build_vecmath_partial_test_graph(ShaderGraphBuilder &builder, NodeVe
 /*
  * Tests: partial folding for Vector Math Add with known 0.
  */
-TEST(render_graph, constant_fold_part_vecmath_add_0)
+TEST_F(RenderGraph, constant_fold_part_vecmath_add_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	/* X + 0 == 0 + X == X */
 	CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Vector to socket Attribute::Vector.");
@@ -1310,16 +1242,14 @@ TEST(render_graph, constant_fold_part_vecmath_add_0)
 	INVALID_INFO_MESSAGE(log, "Folding Out::");
 
 	build_vecmath_partial_test_graph(builder, NODE_VECTOR_MATH_ADD, make_float3(0,0,0));
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: partial folding for Vector Math Sub with known 0.
  */
-TEST(render_graph, constant_fold_part_vecmath_sub_0)
+TEST_F(RenderGraph, constant_fold_part_vecmath_sub_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	/* X - 0 == X */
 	INVALID_INFO_MESSAGE(log, "Folding Math_Cx::");
@@ -1327,16 +1257,14 @@ TEST(render_graph, constant_fold_part_vecmath_sub_0)
 	INVALID_INFO_MESSAGE(log, "Folding Out::");
 
 	build_vecmath_partial_test_graph(builder, NODE_VECTOR_MATH_SUBTRACT, make_float3(0,0,0));
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: partial folding for Vector Math Dot Product with known 0.
  */
-TEST(render_graph, constant_fold_part_vecmath_dot_0)
+TEST_F(RenderGraph, constant_fold_part_vecmath_dot_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
-	/* X * 0 == 0 * X == X */
+	/* X * 0 == 0 * X == 0 */
 	CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Vector to constant (0, 0, 0).");
@@ -1345,16 +1273,14 @@ TEST(render_graph, constant_fold_part_vecmath_dot_0)
 	CORRECT_INFO_MESSAGE(log, "Discarding closure EmissionNode.");
 
 	build_vecmath_partial_test_graph(builder, NODE_VECTOR_MATH_DOT_PRODUCT, make_float3(0,0,0));
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: partial folding for Vector Math Cross Product with known 0.
  */
-TEST(render_graph, constant_fold_part_vecmath_cross_0)
+TEST_F(RenderGraph, constant_fold_part_vecmath_cross_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
-	/* X * 0 == 0 * X == X */
+	/* X * 0 == 0 * X == 0 */
 	CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Vector to constant (0, 0, 0).");
@@ -1363,16 +1289,14 @@ TEST(render_graph, constant_fold_part_vecmath_cross_0)
 	CORRECT_INFO_MESSAGE(log, "Discarding closure EmissionNode.");
 
 	build_vecmath_partial_test_graph(builder, NODE_VECTOR_MATH_CROSS_PRODUCT, make_float3(0,0,0));
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: Bump with no height input folded to Normal input.
  */
-TEST(render_graph, constant_fold_bump)
+TEST_F(RenderGraph, constant_fold_bump)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Bump::Normal to socket Geometry1::Normal.");
 
@@ -1382,16 +1306,14 @@ TEST(render_graph, constant_fold_bump)
 		.add_connection("Geometry1::Normal", "Bump::Normal")
 		.output_color("Bump::Normal");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests: Bump with no inputs folded to Geometry::Normal.
  */
-TEST(render_graph, constant_fold_bump_no_input)
+TEST_F(RenderGraph, constant_fold_bump_no_input)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Bump::Normal to socket geometry::Normal.");
 
@@ -1399,7 +1321,7 @@ TEST(render_graph, constant_fold_bump_no_input)
 		.add_node(ShaderNodeBuilder<BumpNode>("Bump"))
 		.output_color("Bump::Normal");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 template<class T>
@@ -1416,10 +1338,8 @@ void init_test_curve(array<T> &buffer, T start, T end, int steps)
  * Tests:
  *  - Folding of RGB Curves with all constant inputs.
  */
-TEST(render_graph, constant_fold_rgb_curves)
+TEST_F(RenderGraph, constant_fold_rgb_curves)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Curves::Color to constant (0.275, 0.5, 0.475).");
 
@@ -1435,17 +1355,15 @@ TEST(render_graph, constant_fold_rgb_curves)
 		          .set("Color", make_float3(0.3f, 0.5f, 0.7f)))
 		.output_color("Curves::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - Folding of RGB Curves with zero Fac.
  */
-TEST(render_graph, constant_fold_rgb_curves_fac_0)
+TEST_F(RenderGraph, constant_fold_rgb_curves_fac_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Curves::Color to socket Attribute::Color.");
 
@@ -1462,7 +1380,7 @@ TEST(render_graph, constant_fold_rgb_curves_fac_0)
 		.add_connection("Attribute::Color", "Curves::Color")
 		.output_color("Curves::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 
@@ -1470,10 +1388,8 @@ TEST(render_graph, constant_fold_rgb_curves_fac_0)
  * Tests:
  *  - Folding of RGB Curves with zero Fac and all constant inputs.
  */
-TEST(render_graph, constant_fold_rgb_curves_fac_0_const)
+TEST_F(RenderGraph, constant_fold_rgb_curves_fac_0_const)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Curves::Color to constant (0.3, 0.5, 0.7).");
 
@@ -1489,17 +1405,15 @@ TEST(render_graph, constant_fold_rgb_curves_fac_0_const)
 		          .set("Color", make_float3(0.3f, 0.5f, 0.7f)))
 		.output_color("Curves::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - Folding of Vector Curves with all constant inputs.
  */
-TEST(render_graph, constant_fold_vector_curves)
+TEST_F(RenderGraph, constant_fold_vector_curves)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Curves::Vector to constant (0.275, 0.5, 0.475).");
 
@@ -1515,17 +1429,15 @@ TEST(render_graph, constant_fold_vector_curves)
 		          .set("Vector", make_float3(0.3f, 0.5f, 0.7f)))
 		.output_color("Curves::Vector");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - Folding of Vector Curves with zero Fac.
  */
-TEST(render_graph, constant_fold_vector_curves_fac_0)
+TEST_F(RenderGraph, constant_fold_vector_curves_fac_0)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Curves::Vector to socket Attribute::Vector.");
 
@@ -1542,17 +1454,15 @@ TEST(render_graph, constant_fold_vector_curves_fac_0)
 		.add_connection("Attribute::Vector", "Curves::Vector")
 		.output_color("Curves::Vector");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - Folding of Color Ramp with all constant inputs.
  */
-TEST(render_graph, constant_fold_rgb_ramp)
+TEST_F(RenderGraph, constant_fold_rgb_ramp)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Ramp::Color to constant (0.14, 0.39, 0.64).");
 	CORRECT_INFO_MESSAGE(log, "Folding Ramp::Alpha to constant (0.89).");
@@ -1574,17 +1484,15 @@ TEST(render_graph, constant_fold_rgb_ramp)
 		.add_connection("Ramp::Alpha", "Mix::Color2")
 		.output_color("Mix::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - Folding of Color Ramp with all constant inputs (interpolate false).
  */
-TEST(render_graph, constant_fold_rgb_ramp_flat)
+TEST_F(RenderGraph, constant_fold_rgb_ramp_flat)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Ramp::Color to constant (0.125, 0.375, 0.625).");
 	CORRECT_INFO_MESSAGE(log, "Folding Ramp::Alpha to constant (0.875).");
@@ -1606,17 +1514,15 @@ TEST(render_graph, constant_fold_rgb_ramp_flat)
 		.add_connection("Ramp::Alpha", "Mix::Color2")
 		.output_color("Mix::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - Folding of redundant conversion of float to color to float.
  */
-TEST(render_graph, constant_fold_convert_float_color_float)
+TEST_F(RenderGraph, constant_fold_convert_float_color_float)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding Invert::Color to socket convert_float_to_color::value_color.");
 	CORRECT_INFO_MESSAGE(log, "Folding convert_color_to_float::value_float to socket Attribute::Fac.");
@@ -1628,17 +1534,15 @@ TEST(render_graph, constant_fold_convert_float_color_float)
 		.add_connection("Attribute::Fac", "Invert::Color")
 		.output_value("Invert::Color");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - Folding of redundant conversion of color to vector to color.
  */
-TEST(render_graph, constant_fold_convert_color_vector_color)
+TEST_F(RenderGraph, constant_fold_convert_color_vector_color)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding VecAdd::Vector to socket convert_color_to_vector::value_vector.");
 	CORRECT_INFO_MESSAGE(log, "Folding convert_vector_to_color::value_color to socket Attribute::Color.");
@@ -1651,17 +1555,15 @@ TEST(render_graph, constant_fold_convert_color_vector_color)
 		.add_connection("Attribute::Color", "VecAdd::Vector1")
 		.output_color("VecAdd::Vector");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 /*
  * Tests:
  *  - NOT folding conversion of color to float to color.
  */
-TEST(render_graph, constant_fold_convert_color_float_color)
+TEST_F(RenderGraph, constant_fold_convert_color_float_color)
 {
-	DEFINE_COMMON_VARIABLES(builder, log);
-
 	EXPECT_ANY_MESSAGE(log);
 	CORRECT_INFO_MESSAGE(log, "Folding MathAdd::Value to socket convert_color_to_float::value_float.");
 	INVALID_INFO_MESSAGE(log, "Folding convert_float_to_color::");
@@ -1674,7 +1576,7 @@ TEST(render_graph, constant_fold_convert_color_float_color)
 		.add_connection("Attribute::Color", "MathAdd::Value1")
 		.output_color("MathAdd::Value");
 
-	graph.finalize(&scene);
+	graph.finalize(scene);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/test/util_string_test.cpp b/intern/cycles/test/util_string_test.cpp
index 22ec8e0ee8e..6c059ba5d12 100644
--- a/intern/cycles/test/util_string_test.cpp
+++ b/intern/cycles/test/util_string_test.cpp
@@ -245,4 +245,41 @@ TEST(util_string_remove_trademark, both)
 	EXPECT_EQ(str, "foo bar zzz");
 }
 
+TEST(util_string_remove_trademark, both_space)  /* "(TM)" attached to the word, "(R)" separated by a space. */
+{
+	string str = string_remove_trademark("foo bar(TM) (R) zzz");
+	EXPECT_EQ(str, "foo bar zzz");
+}
+
+TEST(util_string_remove_trademark, both_space_around)  /* Both markers surrounded by spaces. */
+{
+	string str = string_remove_trademark("foo bar (TM) (R) zzz");
+	EXPECT_EQ(str, "foo bar zzz");
+}
+
+TEST(util_string_remove_trademark, trademark_space_suffix)  /* " (TM)" at the very end of the string. */
+{
+	string str = string_remove_trademark("foo bar (TM)");
+	EXPECT_EQ(str, "foo bar");
+}
+
+TEST(util_string_remove_trademark, trademark_space_middle)  /* " (TM)" between two words. */
+{
+	string str = string_remove_trademark("foo bar (TM) baz");
+	EXPECT_EQ(str, "foo bar baz");
+}
+
+
+TEST(util_string_remove_trademark, r_space_suffix)  /* " (R)" at the very end of the string. */
+{
+	string str = string_remove_trademark("foo bar (R)");
+	EXPECT_EQ(str, "foo bar");
+}
+
+TEST(util_string_remove_trademark, r_space_middle)  /* " (R)" between two words. */
+{
+	string str = string_remove_trademark("foo bar (R) baz");
+	EXPECT_EQ(str, "foo bar baz");
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index d8abf671bd6..24043e2231b 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -1,6 +1,6 @@
 
 set(INC
-	.
+	..
 	../../glew-mx
 )
 
@@ -38,6 +38,7 @@ set(SRC_HEADERS
 	util_atomic.h
 	util_boundbox.h
 	util_debug.h
+	util_defines.h
 	util_guarded_allocator.cpp
 	util_foreach.h
 	util_function.h
@@ -52,13 +53,23 @@ set(SRC_HEADERS
 	util_math.h
 	util_math_cdf.h
 	util_math_fast.h
+	util_math_intersect.h
+	util_math_float2.h
+	util_math_float3.h
+	util_math_float4.h
+	util_math_int2.h
+	util_math_int3.h
+	util_math_int4.h
+	util_math_matrix.h
 	util_md5.h
 	util_opengl.h
 	util_optimization.h
 	util_param.h
 	util_path.h
 	util_progress.h
+	util_projection.h
 	util_queue.h
+	util_rect.h
 	util_set.h
 	util_simd.h
 	util_sky_model.cpp
@@ -79,6 +90,32 @@ set(SRC_HEADERS
 	util_time.h
 	util_transform.h
 	util_types.h
+	util_types_float2.h
+	util_types_float2_impl.h
+	util_types_float3.h
+	util_types_float3_impl.h
+	util_types_float4.h
+	util_types_float4_impl.h
+	util_types_int2.h
+	util_types_int2_impl.h
+	util_types_int3.h
+	util_types_int3_impl.h
+	util_types_int4.h
+	util_types_int4_impl.h
+	util_types_uchar2.h
+	util_types_uchar2_impl.h
+	util_types_uchar3.h
+	util_types_uchar3_impl.h
+	util_types_uchar4.h
+	util_types_uchar4_impl.h
+	util_types_uint2.h
+	util_types_uint2_impl.h
+	util_types_uint3.h
+	util_types_uint3_impl.h
+	util_types_uint4.h
+	util_types_uint4_impl.h
+	util_types_vector3.h
+	util_types_vector3_impl.h
 	util_vector.h
 	util_version.h
 	util_view.h
@@ -91,4 +128,4 @@ include_directories(SYSTEM ${INC_SYS})
 
 add_definitions(${GL_DEFINITIONS})
 
-add_library(cycles_util ${SRC} ${SRC_HEADERS})
+cycles_add_library(cycles_util ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/util/util_aligned_malloc.cpp b/intern/cycles/util/util_aligned_malloc.cpp
index 15d2eb3271b..cc7252dcc58 100644
--- a/intern/cycles/util/util_aligned_malloc.cpp
+++ b/intern/cycles/util/util_aligned_malloc.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "util_aligned_malloc.h"
-#include "util_guarded_allocator.h"
+#include "util/util_aligned_malloc.h"
+#include "util/util_guarded_allocator.h"
 
 #include <cassert>
 
diff --git a/intern/cycles/util/util_aligned_malloc.h b/intern/cycles/util/util_aligned_malloc.h
index ecc0f28c376..66d77c83454 100644
--- a/intern/cycles/util/util_aligned_malloc.h
+++ b/intern/cycles/util/util_aligned_malloc.h
@@ -17,10 +17,13 @@
 #ifndef __UTIL_ALIGNED_MALLOC_H__
 #define __UTIL_ALIGNED_MALLOC_H__
 
-#include "util_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
+/* Minimum alignment needed by all CPU native data types (SSE, AVX). */
+#define MIN_ALIGNMENT_CPU_DATA_TYPES 16
+
 /* Allocate block of size bytes at least aligned to a given value. */
 void *util_aligned_malloc(size_t size, int alignment);
 
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index 433e41fbbb6..f3c7ae546a0 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -22,15 +22,13 @@
 /* Using atomic ops header from Blender. */
 #include "atomic_ops.h"
 
-ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value)
-{
-	size_t prev_value = *maximum_value;
-	while(prev_value < value) {
-		if(atomic_cas_z(maximum_value, prev_value, value) != prev_value) {
-			break;
-		}
-	}
-}
+#define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x))  /* Non-GPU path: forward to the _fl helper (presumably from atomic_ops.h; confirm). */
+
+#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
+#define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_add_uint32((p), -1)  /* Decrement is expressed as adding -1. */
+
+#define CCL_LOCAL_MEM_FENCE 0
+#define ccl_barrier(flags) (void)0  /* Expands to a no-op statement; flags is ignored. */
 
 #else  /* __KERNEL_GPU__ */
 
@@ -39,7 +37,7 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value)
 /* Float atomics implementation credits:
  *   http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html
  */
-ccl_device_inline void atomic_add_and_fetch_float(volatile ccl_global float *source,
+ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *source,
                                         const float operand)
 {
 	union {
@@ -56,10 +54,32 @@ ccl_device_inline void atomic_add_and_fetch_float(volatile ccl_global float *sou
 	} while(atomic_cmpxchg((volatile ccl_global unsigned int *)source,
 	                       prev_value.int_value,
 	                       new_value.int_value) != prev_value.int_value);
+	return new_value.float_value;
 }
 
+#define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x))  /* Map the shared atomic names onto atomic_add/atomic_inc/atomic_dec. */
+#define atomic_fetch_and_inc_uint32(p) atomic_inc((p))
+#define atomic_fetch_and_dec_uint32(p) atomic_dec((p))
+
+#define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE
+#define ccl_barrier(flags) barrier(flags)  /* flags is passed straight through to barrier(). */
+
 #endif  /* __KERNEL_OPENCL__ */
 
+#ifdef __KERNEL_CUDA__
+
+#define atomic_add_and_fetch_float(p, x) (atomicAdd((float*)(p), (float)(x)) + (float)(x))  /* atomicAdd() returns the old value, so x is added again to get the new one; note x is expanded twice. */
+
+#define atomic_fetch_and_add_uint32(p, x) atomicAdd((unsigned int*)(p), (unsigned int)(x))
+#define atomic_fetch_and_sub_uint32(p, x) atomicSub((unsigned int*)(p), (unsigned int)(x))
+#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
+#define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1)
+
+#define CCL_LOCAL_MEM_FENCE  /* Expands to nothing. */
+#define ccl_barrier(flags) __syncthreads()  /* flags is not used in the expansion. */
+
+#endif  /* __KERNEL_CUDA__ */
+
 #endif  /* __KERNEL_GPU__ */
 
 #endif /* __UTIL_ATOMIC_H__ */
diff --git a/intern/cycles/util/util_boundbox.h b/intern/cycles/util/util_boundbox.h
index dfe4977aef3..ed94ca20211 100644
--- a/intern/cycles/util/util_boundbox.h
+++ b/intern/cycles/util/util_boundbox.h
@@ -20,10 +20,10 @@
 #include <math.h>
 #include <float.h>
 
-#include "util_math.h"
-#include "util_string.h"
-#include "util_transform.h"
-#include "util_types.h"
+#include "util/util_math.h"
+#include "util/util_string.h"
+#include "util/util_transform.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h
index d3598f84b94..c73beab98dc 100644
--- a/intern/cycles/util/util_color.h
+++ b/intern/cycles/util/util_color.h
@@ -17,11 +17,11 @@
 #ifndef __UTIL_COLOR_H__
 #define __UTIL_COLOR_H__
 
-#include "util_math.h"
-#include "util_types.h"
+#include "util/util_math.h"
+#include "util/util_types.h"
 
 #ifdef __KERNEL_SSE2__
-#include "util_simd.h"
+#include "util/util_simd.h"
 #endif
 
 CCL_NAMESPACE_BEGIN
@@ -157,16 +157,6 @@ ccl_device float3 xyz_to_rgb(float x, float y, float z)
 	                   0.055648f * x + -0.204043f * y +  1.057311f * z);
 }
 
-#ifndef __KERNEL_OPENCL__
-
-ccl_device float3 color_srgb_to_scene_linear(float3 c)
-{
-	return make_float3(
-		color_srgb_to_scene_linear(c.x),
-		color_srgb_to_scene_linear(c.y),
-		color_srgb_to_scene_linear(c.z));
-}
-
 #ifdef __KERNEL_SSE2__
 /*
  * Calculate initial guess for arg^exp based on float representation
@@ -222,17 +212,38 @@ ccl_device ssef color_srgb_to_scene_linear(const ssef &c)
 	ssef gte = fastpow24(gtebase);
 	return select(cmp, lt, gte);
 }
-#endif
+#endif  /* __KERNEL_SSE2__ */
 
-ccl_device float3 color_scene_linear_to_srgb(float3 c)
+ccl_device float3 color_srgb_to_scene_linear_v3(float3 c)
 {
-	return make_float3(
-		color_scene_linear_to_srgb(c.x),
-		color_scene_linear_to_srgb(c.y),
-		color_scene_linear_to_srgb(c.z));
+	return make_float3(color_srgb_to_scene_linear(c.x),
+	                   color_srgb_to_scene_linear(c.y),
+	                   color_srgb_to_scene_linear(c.z));
 }
 
+ccl_device float3 color_scene_linear_to_srgb_v3(float3 c)  /* Applies color_scene_linear_to_srgb() to x, y and z independently. */
+{
+	return make_float3(color_scene_linear_to_srgb(c.x),
+	                   color_scene_linear_to_srgb(c.y),
+	                   color_scene_linear_to_srgb(c.z));
+}
+
+ccl_device float4 color_srgb_to_scene_linear_v4(float4 c)  /* Converts x, y, z with color_srgb_to_scene_linear(); w is returned as given. */
+{
+#ifdef __KERNEL_SSE2__
+	ssef r_ssef;
+	float4 &r = (float4 &)r_ssef;  /* r is a float4 view of r_ssef's storage. */
+	r = c;
+	r_ssef = color_srgb_to_scene_linear(r_ssef);  /* ssef overload of the conversion. */
+	r.w = c.w;  /* Put the input w back after the vector conversion. */
+	return r;
+#else
+	return make_float4(color_srgb_to_scene_linear(c.x),
+	                   color_srgb_to_scene_linear(c.y),
+	                   color_srgb_to_scene_linear(c.z),
+	                   c.w);
 #endif
+}
 
 ccl_device float linear_rgb_to_gray(float3 c)
 {
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index 80d177d2cae..9a66a372822 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -14,12 +14,14 @@
  * limitations under the License.
  */
 
-#include "util_debug.h"
+#include "util/util_debug.h"
 
 #include <stdlib.h>
 
-#include "util_logging.h"
-#include "util_string.h"
+#include "bvh/bvh_params.h"
+
+#include "util/util_logging.h"
+#include "util/util_string.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -29,7 +31,8 @@ DebugFlags::CPU::CPU()
     sse41(true),
     sse3(true),
     sse2(true),
-    qbvh(true)
+    bvh_layout(BVH_LAYOUT_DEFAULT),
+    split_kernel(false)
 {
 	reset();
 }
@@ -54,11 +57,13 @@ void DebugFlags::CPU::reset()
 #undef STRINGIFY
 #undef CHECK_CPU_FLAGS
 
-	qbvh = true;
+	bvh_layout = BVH_LAYOUT_DEFAULT;
+	split_kernel = false;
 }
 
 DebugFlags::CUDA::CUDA()
-  : adaptive_compile(false)
+  : adaptive_compile(false),
+    split_kernel(false)
 {
 	reset();
 }
@@ -67,12 +72,15 @@ void DebugFlags::CUDA::reset()
 {
 	if(getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
 		adaptive_compile = true;
+
+	split_kernel = false;
 }
 
 DebugFlags::OpenCL::OpenCL()
   : device_type(DebugFlags::OpenCL::DEVICE_ALL),
     kernel_type(DebugFlags::OpenCL::KERNEL_DEFAULT),
-    debug(false)
+    debug(false),
+    single_program(false), mem_limit(0)  /* 0 = artificial memory limit disabled. */
 {
 	reset();
 }
@@ -112,16 +120,20 @@ void DebugFlags::OpenCL::reset()
 	}
 	/* Initialize other flags from environment variables. */
 	debug = (getenv("CYCLES_OPENCL_DEBUG") != NULL);
+	single_program = (getenv("CYCLES_OPENCL_MULTI_PROGRAM") == NULL);
 }
 
 DebugFlags::DebugFlags()
+: viewport_static_bvh(false)
 {
 	/* Nothing for now. */
 }
 
 void DebugFlags::reset()
 {
+	viewport_static_bvh = false;
 	cpu.reset();
+	cuda.reset();
 	opencl.reset();
 }
 
@@ -129,11 +141,13 @@ std::ostream& operator <<(std::ostream &os,
                           DebugFlagsConstRef debug_flags)
 {
 	os << "CPU flags:\n"
-	   << "  AVX2   : " << string_from_bool(debug_flags.cpu.avx2)  << "\n"
-	   << "  AVX    : " << string_from_bool(debug_flags.cpu.avx)   << "\n"
-	   << "  SSE4.1 : " << string_from_bool(debug_flags.cpu.sse41) << "\n"
-	   << "  SSE3   : " << string_from_bool(debug_flags.cpu.sse3)  << "\n"
-	   << "  SSE2   : " << string_from_bool(debug_flags.cpu.sse2)  << "\n";
+	   << "  AVX2       : " << string_from_bool(debug_flags.cpu.avx2) << "\n"
+	   << "  AVX        : " << string_from_bool(debug_flags.cpu.avx) << "\n"
+	   << "  SSE4.1     : " << string_from_bool(debug_flags.cpu.sse41) << "\n"
+	   << "  SSE3       : " << string_from_bool(debug_flags.cpu.sse3) << "\n"
+	   << "  SSE2       : " << string_from_bool(debug_flags.cpu.sse2) << "\n"
+	   << "  BVH layout : " << bvh_layout_name(debug_flags.cpu.bvh_layout) << "\n"
+	   << "  Split      : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n";
 
 	os << "CUDA flags:\n"
 	   << " Adaptive Compile: " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n";
@@ -172,10 +186,11 @@ std::ostream& operator <<(std::ostream &os,
 			break;
 	}
 	os << "OpenCL flags:\n"
-	   << "  Device type : " << opencl_device_type << "\n"
-	   << "  Kernel type : " << opencl_kernel_type << "\n"
-	   << "  Debug       : " << string_from_bool(debug_flags.opencl.debug)
-	   << "\n";
+	   << "  Device type    : " << opencl_device_type << "\n"
+	   << "  Kernel type    : " << opencl_kernel_type << "\n"
+	   << "  Debug          : " << string_from_bool(debug_flags.opencl.debug) << "\n"
+	   << "  Single program : " << string_from_bool(debug_flags.opencl.single_program) << "\n"
+	   << "  Memory limit   : " << string_human_readable_size(debug_flags.opencl.mem_limit) << "\n";
 	return os;
 }
 
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index 73fd228b5d9..f17f8a560ee 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -20,7 +20,7 @@
 #include <cassert>
 #include <iostream>
 
-#include "util_static_assert.h"
+#include "bvh/bvh_params.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -30,6 +30,9 @@ CCL_NAMESPACE_BEGIN
  */
 class DebugFlags {
 public:
+	/* Use static BVH in viewport, to match final render exactly. */
+	bool viewport_static_bvh;
+
 	/* Descriptor of CPU feature-set to be used. */
 	struct CPU {
 		CPU();
@@ -44,8 +47,24 @@ public:
 		bool sse3;
 		bool sse2;
 
-		/* Whether QBVH usage is allowed or not. */
-		bool qbvh;
+		/* Check functions to see whether instructions up to the given one
+		 * are allowed for use. Const, so they also work on const references.
+		 */
+		bool has_avx2()  const { return has_avx()   && avx2; }
+		bool has_avx()   const { return has_sse41() && avx; }
+		bool has_sse41() const { return has_sse3()  && sse41; }
+		bool has_sse3()  const { return has_sse2()  && sse3; }
+		bool has_sse2()  const { return sse2; }
+
+		/* Requested BVH size.
+		 *
+		 * Rendering will use the widest possible BVH layout that is not
+		 * wider than this one.
+		 */
+		BVHLayout bvh_layout;
+
+		/* Whether split kernel is used */
+		bool split_kernel;
 	};
 
 	/* Descriptor of CUDA feature-set to be used. */
@@ -58,6 +77,9 @@ public:
 		/* Whether adaptive feature based runtime compile is enabled or not.
 		 * Requires the CUDA Toolkit and only works on Linux atm. */
 		bool adaptive_compile;
+
+		/* Whether split kernel is used */
+		bool split_kernel;
 	};
 
 	/* Descriptor of OpenCL feature-set to be used. */
@@ -106,6 +128,13 @@ public:
 
 		/* Use debug version of the kernel. */
 		bool debug;
+
+		/* Use single program */
+		bool single_program;
+
+		/* TODO(mai): Currently this is only for OpenCL, but we should have it implemented for all devices. */
+		/* Artificial memory limit in bytes (0 if disabled). */
+		size_t mem_limit;
 	};
 
 	/* Get instance of debug flags registry. */
diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h
new file mode 100644
index 00000000000..98944a19022
--- /dev/null
+++ b/intern/cycles/util/util_defines.h
@@ -0,0 +1,136 @@
+
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_DEFINES_H__
+#define __UTIL_DEFINES_H__
+
+/* Bitness */
+
+#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
+#  define __KERNEL_64_BIT__
+#endif
+
+/* Qualifiers for kernel code shared by CPU and GPU */
+
+#ifndef __KERNEL_GPU__
+#  define ccl_device static inline
+#  define ccl_device_noinline static
+#  define ccl_global
+#  define ccl_static_constant static const
+#  define ccl_constant const
+#  define ccl_local
+#  define ccl_local_param
+#  define ccl_private
+#  define ccl_restrict __restrict
+#  define ccl_ref &
+#  define __KERNEL_WITH_SSE_ALIGN__
+
+#  if defined(_WIN32) && !defined(FREE_WINDOWS)
+#    define ccl_device_inline static __forceinline
+#    define ccl_device_forceinline static __forceinline
+#    define ccl_align(...) __declspec(align(__VA_ARGS__))
+#    ifdef __KERNEL_64_BIT__
+#      define ccl_try_align(...) __declspec(align(__VA_ARGS__))
+#    else  /* __KERNEL_64_BIT__ */
+#      undef __KERNEL_WITH_SSE_ALIGN__
+/* No support for function arguments (error C2719). */
+#      define ccl_try_align(...)
+#    endif  /* __KERNEL_64_BIT__ */
+#    define ccl_may_alias
+#    define ccl_always_inline __forceinline
+#    define ccl_never_inline __declspec(noinline)
+#    define ccl_maybe_unused
+#  else  /* _WIN32 && !FREE_WINDOWS */
+#    define ccl_device_inline static inline __attribute__((always_inline))
+#    define ccl_device_forceinline static inline __attribute__((always_inline))
+#    define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
+#    ifndef FREE_WINDOWS64
+#      define __forceinline inline __attribute__((always_inline))
+#    endif
+#    define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
+#    define ccl_may_alias __attribute__((__may_alias__))
+#    define ccl_always_inline __attribute__((always_inline))
+#    define ccl_never_inline __attribute__((noinline))
+#    define ccl_maybe_unused __attribute__((unused))
+#  endif  /* _WIN32 && !FREE_WINDOWS */
+
+/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */
+#  if defined(__GNUC__) && (__GNUC__ >= 7)  /* gcc7.0+ only */
+#    define ATTR_FALLTHROUGH __attribute__((fallthrough))
+#  else
+#    define ATTR_FALLTHROUGH ((void)0)
+#  endif
+#endif  /* __KERNEL_GPU__ */
+
+/* macros */
+
+/* hints for branch prediction, only use in code that runs a _lot_ */
+#if defined(__GNUC__) && defined(__KERNEL_CPU__)
+#  define LIKELY(x)       __builtin_expect(!!(x), 1)
+#  define UNLIKELY(x)     __builtin_expect(!!(x), 0)
+#else
+#  define LIKELY(x)       (x)
+#  define UNLIKELY(x)     (x)
+#endif
+
+#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800))
+#  define HAS_CPP11_FEATURES
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#  if defined(HAS_CPP11_FEATURES)
+/* Some magic to be sure we don't have reference in the type. */
+template<typename T> static inline T decltype_helper(T x) { return x; }
+#    define TYPEOF(x) decltype(decltype_helper(x))
+#  else
+#    define TYPEOF(x) typeof(x)
+#  endif
+#endif
+
+/* Compile-time type checks. A mismatch causes the warning:
+ * incompatible types when assigning to type 'Foo' from type 'Bar'
+ * ... the compiler optimizes away the temp var. Both use TYPEOF. */
+#ifdef __GNUC__
+#define CHECK_TYPE(var, type)  {  \
+	TYPEOF(var) *__tmp;           \
+	__tmp = (type *)NULL;         \
+	(void)__tmp;                  \
+} (void)0
+
+#define CHECK_TYPE_PAIR(var_a, var_b)  {  \
+	TYPEOF(var_a) *__tmp;                 \
+	__tmp = (TYPEOF(var_b) *)NULL;        \
+	(void)__tmp;                          \
+} (void)0
+#else
+#  define CHECK_TYPE(var, type)
+#  define CHECK_TYPE_PAIR(var_a, var_b)
+#endif
+
+/* can be used in simple macros */
+#define CHECK_TYPE_INLINE(val, type) \
+	((void)(((type)0) != (val)))
+
+#ifndef __KERNEL_GPU__
+#  include <cassert>
+#  define util_assert(statement)  assert(statement)
+#else
+#  define util_assert(statement)
+#endif
+
+#endif /* __UTIL_DEFINES_H__ */
+
diff --git a/intern/cycles/util/util_guarded_allocator.cpp b/intern/cycles/util/util_guarded_allocator.cpp
index 615ac95f324..54fa6a80df5 100644
--- a/intern/cycles/util/util_guarded_allocator.cpp
+++ b/intern/cycles/util/util_guarded_allocator.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "util_guarded_allocator.h"
-#include "util_stats.h"
+#include "util/util_guarded_allocator.h"
+#include "util/util_stats.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_guarded_allocator.h b/intern/cycles/util/util_guarded_allocator.h
index 78453d214be..87c1526dee4 100644
--- a/intern/cycles/util/util_guarded_allocator.h
+++ b/intern/cycles/util/util_guarded_allocator.h
@@ -20,9 +20,6 @@
 #include <cstddef>
 #include <memory>
 
-#include "util_debug.h"
-#include "util_types.h"
-
 #ifdef WITH_BLENDER_GUARDEDALLOC
 #  include "../../guardedalloc/MEM_guardedalloc.h"
 #endif
@@ -50,9 +47,9 @@ public:
 
 	T *allocate(size_t n, const void *hint = 0)
 	{
+		(void)hint;
 		size_t size = n * sizeof(T);
 		util_guarded_mem_alloc(size);
-		(void)hint;
 		if(n == 0) {
 			return NULL;
 		}
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index 7285c6ef600..612228dd1c1 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -17,10 +17,11 @@
 #ifndef __UTIL_HALF_H__
 #define __UTIL_HALF_H__
 
-#include "util_types.h"
+#include "util/util_types.h"
+#include "util/util_math.h"
 
 #ifdef __KERNEL_SSE2__
-#include "util_simd.h"
+#include "util/util_simd.h"
 #endif
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/util/util_hash.h b/intern/cycles/util/util_hash.h
index 98c3a681ff2..2307ca158f0 100644
--- a/intern/cycles/util/util_hash.h
+++ b/intern/cycles/util/util_hash.h
@@ -17,7 +17,7 @@
 #ifndef __UTIL_HASH_H__
 #define __UTIL_HASH_H__
 
-#include "util_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -61,6 +61,11 @@ static inline uint hash_string(const char *str)
 }
 #endif
 
+ccl_device_inline float hash_int_01(uint k)
+{
+	return (float)hash_int(k) * (1.0f/(float)0xFFFFFFFF);
+}
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_HASH_H__ */
diff --git a/intern/cycles/util/util_image.h b/intern/cycles/util/util_image.h
index c8efc551d97..18876841b5b 100644
--- a/intern/cycles/util/util_image.h
+++ b/intern/cycles/util/util_image.h
@@ -21,7 +21,7 @@
 
 #include <OpenImageIO/imageio.h>
 
-#include "util_vector.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -42,4 +42,4 @@ CCL_NAMESPACE_END
 
 #endif /* __UTIL_IMAGE_H__ */
 
-#include "util_image_impl.h"
+#include "util/util_image_impl.h"
diff --git a/intern/cycles/util/util_image_impl.h b/intern/cycles/util/util_image_impl.h
index 4daf1eaac22..751f52aaa86 100644
--- a/intern/cycles/util/util_image_impl.h
+++ b/intern/cycles/util/util_image_impl.h
@@ -17,10 +17,9 @@
 #ifndef __UTIL_IMAGE_IMPL_H__
 #define __UTIL_IMAGE_IMPL_H__
 
-#include "util_algorithm.h"
-#include "util_debug.h"
-#include "util_half.h"
-#include "util_image.h"
+#include "util/util_algorithm.h"
+#include "util/util_half.h"
+#include "util/util_image.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_logging.cpp b/intern/cycles/util/util_logging.cpp
index 03041723e15..f38683bf7de 100644
--- a/intern/cycles/util/util_logging.cpp
+++ b/intern/cycles/util/util_logging.cpp
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 
-#include <util_logging.h>
+#include "util/util_logging.h"
 
-#include "util_math.h"
+#include "util/util_math.h"
 
 #include <stdio.h>
 #ifdef _MSC_VER
@@ -30,10 +30,10 @@ void util_logging_init(const char *argv0)
 #ifdef WITH_CYCLES_LOGGING
 	using CYCLES_GFLAGS_NAMESPACE::SetCommandLineOption;
 
-	/* Make it so FATAL messages are always print into console. */
+	/* Make it so ERROR messages are always printed to the console. */
 	char severity_fatal[32];
 	snprintf(severity_fatal, sizeof(severity_fatal), "%d",
-	         google::GLOG_FATAL);
+	         google::GLOG_ERROR);
 
 	google::InitGoogleLogging(argv0);
 	SetCommandLineOption("logtostderr", "1");
@@ -69,6 +69,15 @@ void util_logging_verbosity_set(int verbosity)
 }
 
 std::ostream& operator <<(std::ostream &os,
+                          const int2 &value)
+{
+	os << "(" << value.x
+	   << ", " << value.y
+	   << ")";
+	return os;
+}
+
+std::ostream& operator <<(std::ostream &os,
                           const float3 &value)
 {
 	os << "(" << value.x
diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h
index 2aa9c25b1a0..5c84b6593d3 100644
--- a/intern/cycles/util/util_logging.h
+++ b/intern/cycles/util/util_logging.h
@@ -18,33 +18,37 @@
 #define __UTIL_LOGGING_H__
 
 #if defined(WITH_CYCLES_LOGGING) && !defined(__KERNEL_GPU__)
+#  include <gflags/gflags.h>
 #  include <glog/logging.h>
-#else
-#  include <iostream>
 #endif
 
+#include <iostream>
+
 CCL_NAMESPACE_BEGIN
 
 #if !defined(WITH_CYCLES_LOGGING) || defined(__KERNEL_GPU__)
-class StubStream : public std::ostream {
- public:
-	StubStream() : std::ostream(NULL) { }
+class StubStream {
+public:
+	template<class T>
+	StubStream& operator<<(const T&) {
+		return *this;
+	}
 };
 
 class LogMessageVoidify {
 public:
 	LogMessageVoidify() { }
-	void operator&(::std::ostream&) { }
+	void operator&(StubStream&) { }
 };
 
 #  define LOG_SUPPRESS() (true) ? (void) 0 : LogMessageVoidify() & StubStream()
 #  define LOG(severity) LOG_SUPPRESS()
 #  define VLOG(severity) LOG_SUPPRESS()
-
 #endif
 
 #define VLOG_ONCE(level, flag) if(!flag) flag = true, VLOG(level)
 
+struct int2;
 struct float3;
 
 void util_logging_init(const char *argv0);
@@ -52,6 +56,8 @@ void util_logging_start(void);
 void util_logging_verbosity_set(int verbosity);
 
 std::ostream& operator <<(std::ostream &os,
+                          const int2 &value);
+std::ostream& operator <<(std::ostream &os,
                           const float3 &value);
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 2b81c8c498a..d0e91a2a1c9 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -28,14 +28,12 @@
 
 
 #ifndef __KERNEL_OPENCL__
+#  include <float.h>
+#  include <math.h>
+#  include <stdio.h>
+#endif  /* __KERNEL_OPENCL__ */
 
-#include <float.h>
-#include <math.h>
-#include <stdio.h>
-
-#endif
-
-#include "util_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -43,49 +41,44 @@ CCL_NAMESPACE_BEGIN
 
 /* Division */
 #ifndef M_PI_F
-#define M_PI_F		((float)3.14159265358979323846264338327950288) 		/* pi */
+#  define M_PI_F    (3.1415926535897932f)  /* pi */
 #endif
 #ifndef M_PI_2_F
-#define M_PI_2_F	((float)1.57079632679489661923132169163975144) 		/* pi/2 */
+#  define M_PI_2_F  (1.5707963267948966f)  /* pi/2 */
 #endif
 #ifndef M_PI_4_F
-#define M_PI_4_F	((float)0.785398163397448309615660845819875721) 	/* pi/4 */
+#  define M_PI_4_F  (0.7853981633974483f)  /* pi/4 */
 #endif
 #ifndef M_1_PI_F
-#define M_1_PI_F	((float)0.318309886183790671537767526745028724) 	/* 1/pi */
+#  define M_1_PI_F  (0.3183098861837907f)  /* 1/pi */
 #endif
 #ifndef M_2_PI_F
-#define M_2_PI_F	((float)0.636619772367581343075535053490057448) 	/* 2/pi */
+#  define M_2_PI_F  (0.6366197723675813f)  /* 2/pi */
 #endif
 
 /* Multiplication */
 #ifndef M_2PI_F
-#define M_2PI_F		((float)6.283185307179586476925286766559005768)		/* 2*pi */
+#  define M_2PI_F   (6.2831853071795864f)  /* 2*pi */
 #endif
 #ifndef M_4PI_F
-#define M_4PI_F		((float)12.56637061435917295385057353311801153)		/* 4*pi */
+#  define M_4PI_F   (12.566370614359172f)  /* 4*pi */
 #endif
 
 /* Float sqrt variations */
-
 #ifndef M_SQRT2_F
-#define M_SQRT2_F	((float)1.41421356237309504880) 					/* sqrt(2) */
+#  define M_SQRT2_F (1.4142135623730950f)  /* sqrt(2) */
 #endif
-
 #ifndef M_LN2_F
-#define M_LN2_F      ((float)0.6931471805599453)        /* ln(2) */
+#  define M_LN2_F   (0.6931471805599453f)  /* ln(2) */
 #endif
-
 #ifndef M_LN10_F
-#define M_LN10_F     ((float)2.3025850929940457)        /* ln(10) */
+#  define M_LN10_F  (2.3025850929940457f)  /* ln(10) */
 #endif
 
 /* Scalar */
 
 #ifdef _WIN32
-
-#ifndef __KERNEL_OPENCL__
-
+#  ifndef __KERNEL_OPENCL__
 ccl_device_inline float fmaxf(float a, float b)
 {
 	return (a > b)? a: b;
@@ -95,15 +88,13 @@ ccl_device_inline float fminf(float a, float b)
 {
 	return (a < b)? a: b;
 }
-
-#endif
-
-#endif
+#  endif  /* !__KERNEL_OPENCL__ */
+#endif  /* _WIN32 */
 
 #ifndef __KERNEL_GPU__
-
 using std::isfinite;
 using std::isnan;
+using std::sqrt;
 
 ccl_device_inline int abs(int x)
 {
@@ -157,8 +148,7 @@ ccl_device_inline T max4(const T& a, const T& b, const T& c, const T& d)
 {
 	return max(max(a,b),max(c,d));
 }
-
-#endif
+#endif /* __KERNEL_GPU__ */
 
 ccl_device_inline float min4(float a, float b, float c, float d)
 {
@@ -170,525 +160,141 @@ ccl_device_inline float max4(float a, float b, float c, float d)
 	return max(max(a, b), max(c, d));
 }
 
-ccl_device_inline float max3(float3 a)
-{
-	return max(max(a.x, a.y), a.z);
-}
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline int clamp(int a, int mn, int mx)
-{
-	return min(max(a, mn), mx);
-}
-
-ccl_device_inline float clamp(float a, float mn, float mx)
-{
-	return min(max(a, mn), mx);
-}
-
-ccl_device_inline float mix(float a, float b, float t)
-{
-    return a + t*(b - a);
-}
-
-#endif
-
-#ifndef __KERNEL_CUDA__
-
-ccl_device_inline float saturate(float a)
-{
-	return clamp(a, 0.0f, 1.0f);
-}
-
-#endif
-
-ccl_device_inline int float_to_int(float f)
-{
-	return (int)f;
-}
-
-ccl_device_inline int floor_to_int(float f)
-{
-	return float_to_int(floorf(f));
-}
-
-ccl_device_inline int ceil_to_int(float f)
-{
-	return float_to_int(ceilf(f));
-}
-
-ccl_device_inline float signf(float f)
-{
-	return (f < 0.0f)? -1.0f: 1.0f;
-}
-
-ccl_device_inline float nonzerof(float f, float eps)
-{
-	if(fabsf(f) < eps)
-		return signf(f)*eps;
-	else
-		return f;
-}
-
-ccl_device_inline float smoothstepf(float f)
-{
-	float ff = f*f;
-	return (3.0f*ff - 2.0f*ff*f);
-}
-
-ccl_device_inline int mod(int x, int m)
-{
-	return (x % m + m) % m;
-}
-
-/* Float2 Vector */
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline bool is_zero(const float2& a)
-{
-	return (a.x == 0.0f && a.y == 0.0f);
-}
-
-#endif
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline float average(const float2& a)
-{
-	return (a.x + a.y)*(1.0f/2.0f);
-}
-
-#endif
-
 #ifndef __KERNEL_OPENCL__
+/* Int/Float conversion */
 
-ccl_device_inline float2 operator-(const float2& a)
-{
-	return make_float2(-a.x, -a.y);
-}
-
-ccl_device_inline float2 operator*(const float2& a, const float2& b)
-{
-	return make_float2(a.x*b.x, a.y*b.y);
-}
-
-ccl_device_inline float2 operator*(const float2& a, float f)
-{
-	return make_float2(a.x*f, a.y*f);
-}
-
-ccl_device_inline float2 operator*(float f, const float2& a)
-{
-	return make_float2(a.x*f, a.y*f);
-}
-
-ccl_device_inline float2 operator/(float f, const float2& a)
-{
-	return make_float2(f/a.x, f/a.y);
-}
-
-ccl_device_inline float2 operator/(const float2& a, float f)
-{
-	float invf = 1.0f/f;
-	return make_float2(a.x*invf, a.y*invf);
-}
-
-ccl_device_inline float2 operator/(const float2& a, const float2& b)
+ccl_device_inline int as_int(uint i)
 {
-	return make_float2(a.x/b.x, a.y/b.y);
+	union { uint ui; int i; } u;
+	u.ui = i;
+	return u.i;
 }
 
-ccl_device_inline float2 operator+(const float2& a, const float2& b)
+ccl_device_inline uint as_uint(int i)
 {
-	return make_float2(a.x+b.x, a.y+b.y);
+	union { uint ui; int i; } u;
+	u.i = i;
+	return u.ui;
 }
 
-ccl_device_inline float2 operator-(const float2& a, const float2& b)
+ccl_device_inline uint as_uint(float f)
 {
-	return make_float2(a.x-b.x, a.y-b.y);
+	union { uint i; float f; } u;
+	u.f = f;
+	return u.i;
 }
 
-ccl_device_inline float2 operator+=(float2& a, const float2& b)
+ccl_device_inline int __float_as_int(float f)
 {
-	return a = a + b;
+	union { int i; float f; } u;
+	u.f = f;
+	return u.i;
 }
 
-ccl_device_inline float2 operator*=(float2& a, const float2& b)
+ccl_device_inline float __int_as_float(int i)
 {
-	return a = a * b;
+	union { int i; float f; } u;
+	u.i = i;
+	return u.f;
 }
 
-ccl_device_inline float2 operator*=(float2& a, float f)
+ccl_device_inline uint __float_as_uint(float f)
 {
-	return a = a * f;
+	union { uint i; float f; } u;
+	u.f = f;
+	return u.i;
 }
 
-ccl_device_inline float2 operator/=(float2& a, const float2& b)
+ccl_device_inline float __uint_as_float(uint i)
 {
-	return a = a / b;
+	union { uint i; float f; } u;
+	u.i = i;
+	return u.f;
 }
+#endif /* __KERNEL_OPENCL__ */
 
-ccl_device_inline float2 operator/=(float2& a, float f)
+/* Versions of functions which are safe for fast math. */
+ccl_device_inline bool isnan_safe(float f)
 {
-	float invf = 1.0f/f;
-	return a = a * invf;
+	unsigned int x = __float_as_uint(f);
+	return (x << 1) > 0xff000000u;
 }
 
-
-ccl_device_inline float dot(const float2& a, const float2& b)
+ccl_device_inline bool isfinite_safe(float f)
 {
-	return a.x*b.x + a.y*b.y;
+	/* By IEEE 754 rule, 2*Inf equals Inf */
+	unsigned int x = __float_as_uint(f);
+	return (f == f) && (x == 0 || x == (1u << 31) || (f != 2.0f*f)) && !((x << 1) > 0xff000000u);
 }
 
-ccl_device_inline float cross(const float2& a, const float2& b)
+ccl_device_inline float ensure_finite(float v)
 {
-	return (a.x*b.y - a.y*b.x);
+	return isfinite_safe(v)? v : 0.0f;
 }
 
-#endif
-
 #ifndef __KERNEL_OPENCL__
-
-ccl_device_inline bool operator==(const int2 a, const int2 b)
-{
-	return (a.x == b.x && a.y == b.y);
-}
-
-ccl_device_inline float len(const float2& a)
-{
-	return sqrtf(dot(a, a));
-}
-
-ccl_device_inline float2 normalize(const float2& a)
-{
-	return a/len(a);
-}
-
-ccl_device_inline float2 normalize_len(const float2& a, float *t)
-{
-	*t = len(a);
-	return a/(*t);
-}
-
-ccl_device_inline float2 safe_normalize(const float2& a)
-{
-	float t = len(a);
-	return (t != 0.0f)? a/t: a;
-}
-
-ccl_device_inline bool operator==(const float2& a, const float2& b)
-{
-	return (a.x == b.x && a.y == b.y);
-}
-
-ccl_device_inline bool operator!=(const float2& a, const float2& b)
-{
-	return !(a == b);
-}
-
-ccl_device_inline float2 min(const float2& a, const float2& b)
-{
-	return make_float2(min(a.x, b.x), min(a.y, b.y));
-}
-
-ccl_device_inline float2 max(const float2& a, const float2& b)
-{
-	return make_float2(max(a.x, b.x), max(a.y, b.y));
-}
-
-ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx)
+ccl_device_inline int clamp(int a, int mn, int mx)
 {
 	return min(max(a, mn), mx);
 }
 
-ccl_device_inline float2 fabs(const float2& a)
-{
-	return make_float2(fabsf(a.x), fabsf(a.y));
-}
-
-ccl_device_inline float2 as_float2(const float4& a)
-{
-	return make_float2(a.x, a.y);
-}
-
-#endif
-
-#ifndef __KERNEL_GPU__
-
-ccl_device_inline void print_float2(const char *label, const float2& a)
-{
-	printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y);
-}
-
-#endif
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline float2 interp(const float2& a, const float2& b, float t)
-{
-	return a + t*(b - a);
-}
-
-#endif
-
-/* Float3 Vector */
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline float3 operator-(const float3& a)
-{
-#ifdef __KERNEL_SSE__
-	return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
-#else
-	return make_float3(-a.x, -a.y, -a.z);
-#endif
-}
-
-ccl_device_inline float3 operator*(const float3& a, const float3& b)
-{
-#ifdef __KERNEL_SSE__
-	return float3(_mm_mul_ps(a.m128,b.m128));
-#else
-	return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
-#endif
-}
-
-ccl_device_inline float3 operator*(const float3& a, const float f)
-{
-#ifdef __KERNEL_SSE__
-	return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f)));
-#else
-	return make_float3(a.x*f, a.y*f, a.z*f);
-#endif
-}
-
-ccl_device_inline float3 operator*(const float f, const float3& a)
-{
-	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
-	return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
-#else
-	return make_float3(a.x*f, a.y*f, a.z*f);
-#endif
-}
-
-ccl_device_inline float3 operator/(const float f, const float3& a)
-{
-	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
-	__m128 rc = _mm_rcp_ps(a.m128);
-	return float3(_mm_mul_ps(_mm_set1_ps(f),rc));
-#else
-	return make_float3(f / a.x, f / a.y, f / a.z);
-#endif
-}
-
-ccl_device_inline float3 operator/(const float3& a, const float f)
-{
-	float invf = 1.0f/f;
-	return a * invf;
-}
-
-ccl_device_inline float3 operator/(const float3& a, const float3& b)
-{
-	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
-	__m128 rc = _mm_rcp_ps(b.m128);
-	return float3(_mm_mul_ps(a, rc));
-#else
-	return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
-#endif
-}
-
-ccl_device_inline float3 operator+(const float3& a, const float3& b)
-{
-#ifdef __KERNEL_SSE__
-	return float3(_mm_add_ps(a.m128, b.m128));
-#else
-	return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-#endif
-}
-
-ccl_device_inline float3 operator-(const float3& a, const float3& b)
-{
-#ifdef __KERNEL_SSE__
-	return float3(_mm_sub_ps(a.m128, b.m128));
-#else
-	return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
-#endif
-}
-
-ccl_device_inline float3 operator+=(float3& a, const float3& b)
-{
-	return a = a + b;
-}
-
-ccl_device_inline float3 operator*=(float3& a, const float3& b)
-{
-	return a = a * b;
-}
-
-ccl_device_inline float3 operator*=(float3& a, float f)
-{
-	return a = a * f;
-}
-
-ccl_device_inline float3 operator/=(float3& a, const float3& b)
-{
-	return a = a / b;
-}
-
-ccl_device_inline float3 operator/=(float3& a, float f)
-{
-	float invf = 1.0f/f;
-	return a = a * invf;
-}
-
-ccl_device_inline float dot(const float3& a, const float3& b)
-{
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
-	return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
-#else	
-	return a.x*b.x + a.y*b.y + a.z*b.z;
-#endif
-}
-
-ccl_device_inline float dot_xy(const float3& a, const float3& b)
-{
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
-	return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b));
-#else
-	return a.x*b.x + a.y*b.y;
-#endif
-}
-
-ccl_device_inline float dot(const float4& a, const float4& b)
-{
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
-	return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
-#else	
-	return (a.x*b.x + a.y*b.y) + (a.z*b.z + a.w*b.w);
-#endif
-}
-
-ccl_device_inline float3 cross(const float3& a, const float3& b)
-{
-	float3 r = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
-	return r;
-}
-
-#endif
-
-ccl_device_inline float len(const float3 a)
-{
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
-	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F)));
-#else
-	return sqrtf(dot(a, a));
-#endif
-}
-
-ccl_device_inline float len_squared(const float3 a)
-{
-	return dot(a, a);
-}
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline float len_squared(const float4& a)
-{
-	return dot(a, a);
-}
-
-ccl_device_inline float3 normalize(const float3& a)
-{
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
-	__m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
-	return _mm_div_ps(a.m128, norm);
-#else
-	return a/len(a);
-#endif
-}
-
-#endif
-
-ccl_device_inline float3 saturate3(float3 a)
+ccl_device_inline float clamp(float a, float mn, float mx)
 {
-	return make_float3(saturate(a.x), saturate(a.y), saturate(a.z));
+	return min(max(a, mn), mx);
 }
 
-ccl_device_inline float3 normalize_len(const float3 a, float *t)
+ccl_device_inline float mix(float a, float b, float t)
 {
-	*t = len(a);
-	float x = 1.0f / *t;
-	return a*x;
+    return a + t*(b - a);
 }
+#endif  /* __KERNEL_OPENCL__ */
 
-ccl_device_inline float3 safe_normalize(const float3 a)
+#ifndef __KERNEL_CUDA__
+ccl_device_inline float saturate(float a)
 {
-	float t = len(a);
-	return (t != 0.0f)? a * (1.0f/t) : a;
+	return clamp(a, 0.0f, 1.0f);
 }
+#endif  /* __KERNEL_CUDA__ */
 
-ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
+ccl_device_inline int float_to_int(float f)
 {
-	*t = len(a);
-	return (*t != 0.0f)? a/(*t): a;
+	return (int)f;
 }
 
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline bool operator==(const float3& a, const float3& b)
+ccl_device_inline int floor_to_int(float f)
 {
-#ifdef __KERNEL_SSE__
-	return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
-#else
-	return (a.x == b.x && a.y == b.y && a.z == b.z);
-#endif
+	return float_to_int(floorf(f));
 }
 
-ccl_device_inline bool operator!=(const float3& a, const float3& b)
+ccl_device_inline int ceil_to_int(float f)
 {
-	return !(a == b);
+	return float_to_int(ceilf(f));
 }
 
-ccl_device_inline float3 min(const float3& a, const float3& b)
+ccl_device_inline float signf(float f)
 {
-#ifdef __KERNEL_SSE__
-	return _mm_min_ps(a.m128, b.m128);
-#else
-	return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-#endif
+	return (f < 0.0f)? -1.0f: 1.0f;
 }
 
-ccl_device_inline float3 max(const float3& a, const float3& b)
+ccl_device_inline float nonzerof(float f, float eps)
 {
-#ifdef __KERNEL_SSE__
-	return _mm_max_ps(a.m128, b.m128);
-#else
-	return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-#endif
+	if(fabsf(f) < eps)
+		return signf(f)*eps;
+	else
+		return f;
 }
 
-ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx)
+ccl_device_inline float smoothstepf(float f)
 {
-	return min(max(a, mn), mx);
+	float ff = f*f;
+	return (3.0f*ff - 2.0f*ff*f);
 }
 
-ccl_device_inline float3 fabs(const float3& a)
+ccl_device_inline int mod(int x, int m)
 {
-#ifdef __KERNEL_SSE__
-	__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
-	return _mm_and_ps(a.m128, mask);
-#else
-	return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
-#endif
+	return (x % m + m) % m;
 }
 
-#endif
-
 ccl_device_inline float3 float2_to_float3(const float2 a)
 {
 	return make_float3(a.x, a.y, 0.0f);
@@ -704,557 +310,21 @@ ccl_device_inline float4 float3_to_float4(const float3 a)
 	return make_float4(a.x, a.y, a.z, 1.0f);
 }
 
-#ifndef __KERNEL_GPU__
-
-ccl_device_inline void print_float3(const char *label, const float3& a)
-{
-	printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z);
-}
-
-ccl_device_inline float3 rcp(const float3& a)
-{
-#ifdef __KERNEL_SSE__
-	float4 r = _mm_rcp_ps(a.m128);
-	return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
-#else
-	return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z);
-#endif
-}
-
-#endif
-
-ccl_device_inline float3 interp(float3 a, float3 b, float t)
-{
-	return a + t*(b - a);
-}
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline float3 mix(const float3& a, const float3& b, float t)
-{
-	return a + t*(b - a);
-}
-
-#endif
-
-ccl_device_inline bool is_zero(const float3 a)
-{
-#ifdef __KERNEL_SSE__
-	return a == make_float3(0.0f);
-#else
-	return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f);
-#endif
-}
-
-ccl_device_inline float reduce_add(const float3 a)
-{
-	return (a.x + a.y + a.z);
-}
-
-ccl_device_inline float average(const float3 a)
-{
-	return reduce_add(a)*(1.0f/3.0f);
-}
-
-ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
-{
-#ifdef __KERNEL_OPENCL__
-	return all(a == b);
-#else
-	return a == b;
-#endif
-}
-
-/* Float4 Vector */
-
-#ifdef __KERNEL_SSE__
-
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __forceinline const float4 shuffle(const float4& b)
-{
-	return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
-}
-
-template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b)
-{
-	return _mm_moveldup_ps(b);
-}
-
-template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b)
-{
-	return _mm_movehdup_ps(b);
-}
-
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b)
-{
-	return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b)));
-}
-
-#endif
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline float4 operator-(const float4& a)
-{
-#ifdef __KERNEL_SSE__
-	__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
-	return _mm_xor_ps(a.m128, mask);
-#else
-	return make_float4(-a.x, -a.y, -a.z, -a.w);
-#endif
-}
-
-ccl_device_inline float4 operator*(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
-	return _mm_mul_ps(a.m128, b.m128);
-#else
-	return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
-#endif
-}
-
-ccl_device_inline float4 operator*(const float4& a, float f)
-{
-#if defined(__KERNEL_SSE__)
-	return a * make_float4(f);
-#else
-	return make_float4(a.x*f, a.y*f, a.z*f, a.w*f);
-#endif
-}
-
-ccl_device_inline float4 operator*(float f, const float4& a)
-{
-	return a * f;
-}
-
-ccl_device_inline float4 rcp(const float4& a)
-{
-#ifdef __KERNEL_SSE__
-	float4 r = _mm_rcp_ps(a.m128);
-	return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
-#else
-	return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w);
-#endif
-}
-
-ccl_device_inline float4 operator/(const float4& a, float f)
-{
-	return a * (1.0f/f);
-}
-
-ccl_device_inline float4 operator/(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
-	return a * rcp(b);
-#else
-	return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
-#endif
-
-}
-
-ccl_device_inline float4 operator+(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
-	return _mm_add_ps(a.m128, b.m128);
-#else
-	return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
-#endif
-}
-
-ccl_device_inline float4 operator-(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
-	return _mm_sub_ps(a.m128, b.m128);
-#else
-	return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
-#endif
-}
-
-ccl_device_inline float4 operator+=(float4& a, const float4& b)
-{
-	return a = a + b;
-}
-
-ccl_device_inline float4 operator*=(float4& a, const float4& b)
-{
-	return a = a * b;
-}
-
-ccl_device_inline float4 operator/=(float4& a, float f)
-{
-	return a = a / f;
-}
-
-ccl_device_inline int4 operator<(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
-	return _mm_cvtps_epi32(_mm_cmplt_ps(a.m128, b.m128)); /* todo: avoid cvt */
-#else
-	return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
-#endif
-}
-
-ccl_device_inline int4 operator>=(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
-	return _mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128)); /* todo: avoid cvt */
-#else
-	return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
-#endif
-}
-
-ccl_device_inline int4 operator<=(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
-	return _mm_cvtps_epi32(_mm_cmple_ps(a.m128, b.m128)); /* todo: avoid cvt */
-#else
-	return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
-#endif
-}
-
-ccl_device_inline bool operator==(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
-	return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
-#else
-	return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w);
-#endif
-}
-
-ccl_device_inline float4 cross(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
-	return (shuffle<1,2,0,0>(a)*shuffle<2,0,1,0>(b)) - (shuffle<2,0,1,0>(a)*shuffle<1,2,0,0>(b));
-#else
-	return make_float4(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0.0f);
-#endif
-}
-
-ccl_device_inline bool is_zero(const float4& a)
-{
-#ifdef __KERNEL_SSE__
-	return a == make_float4(0.0f);
-#else
-	return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
-#endif
-}
-
-ccl_device_inline float reduce_add(const float4& a)
-{
-#ifdef __KERNEL_SSE__
-	float4 h = shuffle<1,0,3,2>(a) + a;
-	return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); /* todo: efficiency? */
-#else
-	return ((a.x + a.y) + (a.z + a.w));
-#endif
-}
-
-ccl_device_inline float average(const float4& a)
-{
-	return reduce_add(a) * 0.25f;
-}
-
-ccl_device_inline float len(const float4& a)
-{
-	return sqrtf(dot(a, a));
-}
-
-ccl_device_inline float4 normalize(const float4& a)
-{
-	return a/len(a);
-}
-
-ccl_device_inline float4 safe_normalize(const float4& a)
-{
-	float t = len(a);
-	return (t != 0.0f)? a/t: a;
-}
-
-ccl_device_inline float4 min(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
-	return _mm_min_ps(a.m128, b.m128);
-#else
-	return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
-#endif
-}
-
-ccl_device_inline float4 max(const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
-	return _mm_max_ps(a.m128, b.m128);
-#else
-	return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
-#endif
-}
-
-#endif
-
-#ifndef __KERNEL_GPU__
-
-ccl_device_inline float4 select(const int4& mask, const float4& a, const float4& b)
-{
-#ifdef __KERNEL_SSE__
-	return _mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a), _mm_andnot_ps(_mm_cvtepi32_ps(mask), b)); /* todo: avoid cvt */
-#else
-	return make_float4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? a.w: b.w);
-#endif
-}
-
-ccl_device_inline float4 reduce_min(const float4& a)
-{
-#ifdef __KERNEL_SSE__
-	float4 h = min(shuffle<1,0,3,2>(a), a);
-	return min(shuffle<2,3,0,1>(h), h);
-#else
-	return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
-#endif
-}
-
-ccl_device_inline float4 reduce_max(const float4& a)
-{
-#ifdef __KERNEL_SSE__
-	float4 h = max(shuffle<1,0,3,2>(a), a);
-	return max(shuffle<2,3,0,1>(h), h);
-#else
-	return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
-#endif
-}
-
-#if 0
-ccl_device_inline float4 reduce_add(const float4& a)
-{
-#ifdef __KERNEL_SSE__
-	float4 h = shuffle<1,0,3,2>(a) + a;
-	return shuffle<2,3,0,1>(h) + h;
-#else
-	return make_float4((a.x + a.y) + (a.z + a.w));
-#endif
-}
-#endif
-
-ccl_device_inline void print_float4(const char *label, const float4& a)
-{
-	printf("%s: %.8f %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z, (double)a.w);
-}
-
-#endif
-
-/* Int2 */
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline int2 operator+(const int2 &a, const int2 &b)
-{
-	return make_int2(a.x + b.x, a.y + b.y);
-}
-
-ccl_device_inline int2 operator+=(int2 &a, const int2 &b)
-{
-	return a = a + b;
-}
-
-ccl_device_inline int2 operator-(const int2 &a, const int2 &b)
-{
-	return make_int2(a.x - b.x, a.y - b.y);
-}
-
-ccl_device_inline int2 operator*(const int2 &a, const int2 &b)
-{
-	return make_int2(a.x * b.x, a.y * b.y);
-}
-
-ccl_device_inline int2 operator/(const int2 &a, const int2 &b)
-{
-	return make_int2(a.x / b.x, a.y / b.y);
-}
-
-#endif
-
-/* Int3 */
-
-#ifndef __KERNEL_OPENCL__
-
-ccl_device_inline int3 min(int3 a, int3 b)
-{
-#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
-	return _mm_min_epi32(a.m128, b.m128);
-#else
-	return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-#endif
-}
-
-ccl_device_inline int3 max(int3 a, int3 b)
-{
-#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
-	return _mm_max_epi32(a.m128, b.m128);
-#else
-	return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-#endif
-}
-
-ccl_device_inline int3 clamp(const int3& a, int mn, int mx)
-{
-#ifdef __KERNEL_SSE__
-	return min(max(a, make_int3(mn)), make_int3(mx));
-#else
-	return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx));
-#endif
-}
-
-ccl_device_inline int3 clamp(const int3& a, int3& mn, int mx)
-{
-#ifdef __KERNEL_SSE__
-	return min(max(a, mn), make_int3(mx));
-#else
-	return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx));
-#endif
-}
-
-#endif
-
-#ifndef __KERNEL_GPU__
-
-ccl_device_inline void print_int3(const char *label, const int3& a)
-{
-	printf("%s: %d %d %d\n", label, a.x, a.y, a.z);
-}
-
-#endif
-
-/* Int4 */
-
-#ifndef __KERNEL_GPU__
-
-ccl_device_inline int4 operator+(const int4& a, const int4& b)
-{
-#ifdef __KERNEL_SSE__
-	return _mm_add_epi32(a.m128, b.m128);
-#else
-	return make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
-#endif
-}
-
-ccl_device_inline int4 operator+=(int4& a, const int4& b)
-{
-	return a = a + b;
-}
-
-ccl_device_inline int4 operator>>(const int4& a, int i)
-{
-#ifdef __KERNEL_SSE__
-	return _mm_srai_epi32(a.m128, i);
-#else
-	return make_int4(a.x >> i, a.y >> i, a.z >> i, a.w >> i);
-#endif
-}
-
-ccl_device_inline int4 min(int4 a, int4 b)
-{
-#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
-	return _mm_min_epi32(a.m128, b.m128);
-#else
-	return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
-#endif
-}
-
-ccl_device_inline int4 max(int4 a, int4 b)
-{
-#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
-	return _mm_max_epi32(a.m128, b.m128);
-#else
-	return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
-#endif
-}
-
-ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx)
-{
-	return min(max(a, mn), mx);
-}
+CCL_NAMESPACE_END
 
-ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b)
-{
-#ifdef __KERNEL_SSE__
-	__m128 m = _mm_cvtepi32_ps(mask);
-	return _mm_castps_si128(_mm_or_ps(_mm_and_ps(m, _mm_castsi128_ps(a)), _mm_andnot_ps(m, _mm_castsi128_ps(b)))); /* todo: avoid cvt */
-#else
-	return make_int4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? a.w: b.w);
-#endif
-}
+#include "util/util_math_int2.h"
+#include "util/util_math_int3.h"
+#include "util/util_math_int4.h"
 
-ccl_device_inline void print_int4(const char *label, const int4& a)
-{
-	printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w);
-}
+#include "util/util_math_float2.h"
+#include "util/util_math_float3.h"
+#include "util/util_math_float4.h"
 
-#endif
+#include "util/util_rect.h"
 
-/* Int/Float conversion */
+CCL_NAMESPACE_BEGIN
 
 #ifndef __KERNEL_OPENCL__
-
-ccl_device_inline int as_int(uint i)
-{
-	union { uint ui; int i; } u;
-	u.ui = i;
-	return u.i;
-}
-
-ccl_device_inline uint as_uint(int i)
-{
-	union { uint ui; int i; } u;
-	u.i = i;
-	return u.ui;
-}
-
-ccl_device_inline uint as_uint(float f)
-{
-	union { uint i; float f; } u;
-	u.f = f;
-	return u.i;
-}
-
-ccl_device_inline int __float_as_int(float f)
-{
-	union { int i; float f; } u;
-	u.f = f;
-	return u.i;
-}
-
-ccl_device_inline float __int_as_float(int i)
-{
-	union { int i; float f; } u;
-	u.i = i;
-	return u.f;
-}
-
-ccl_device_inline uint __float_as_uint(float f)
-{
-	union { uint i; float f; } u;
-	u.f = f;
-	return u.i;
-}
-
-ccl_device_inline float __uint_as_float(uint i)
-{
-	union { uint i; float f; } u;
-	u.i = i;
-	return u.f;
-}
-
-/* Versions of functions which are safe for fast math. */
-ccl_device_inline bool isnan_safe(float f)
-{
-	unsigned int x = __float_as_uint(f);
-	return (x << 1) > 0xff000000u;
-}
-
-ccl_device_inline bool isfinite_safe(float f)
-{
-	/* By IEEE 754 rule, 2*Inf equals Inf */
-	unsigned int x = __float_as_uint(f);
-	return (f == f) && (x == 0 || (f != 2.0f*f));
-}
-
 /* Interpolation */
 
 template<class A, class B> A lerp(const A& a, const A& b, const B& t)
@@ -1262,15 +332,23 @@ template<class A, class B> A lerp(const A& a, const A& b, const B& t)
 	return (A)(a * ((B)1 - t) + b * t);
 }
 
+#endif  /* __KERNEL_OPENCL__ */
+
 /* Triangle */
 
-ccl_device_inline float triangle_area(const float3& v1, const float3& v2, const float3& v3)
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline float triangle_area(const float3& v1,
+                                      const float3& v2,
+                                      const float3& v3)
+#else
+ccl_device_inline float triangle_area(const float3 v1,
+                                      const float3 v2,
+                                      const float3 v3)
+#endif
 {
 	return len(cross(v3 - v2, v1 - v2))*0.5f;
 }
 
-#endif
-
 /* Orthonormal vectors */
 
 ccl_device_inline void make_orthonormals(const float3 N, float3 *a, float3 *b)
@@ -1329,7 +407,7 @@ ccl_device_inline float3 safe_divide_even_color(float3 a, float3 b)
 	y = (b.y != 0.0f)? a.y/b.y: 0.0f;
 	z = (b.z != 0.0f)? a.z/b.z: 0.0f;
 
-	/* try to get grey even if b is zero */
+	/* try to get gray even if b is zero */
 	if(b.x == 0.0f) {
 		if(b.y == 0.0f) {
 			x = z;
@@ -1366,16 +444,16 @@ ccl_device_inline float3 rotate_around_axis(float3 p, float3 axis, float angle)
 	float3 r;
 
 	r.x = ((costheta + (1 - costheta) * axis.x * axis.x) * p.x) +
-		(((1 - costheta) * axis.x * axis.y - axis.z * sintheta) * p.y) +
-		(((1 - costheta) * axis.x * axis.z + axis.y * sintheta) * p.z);
+	      (((1 - costheta) * axis.x * axis.y - axis.z * sintheta) * p.y) +
+	      (((1 - costheta) * axis.x * axis.z + axis.y * sintheta) * p.z);
 
 	r.y = (((1 - costheta) * axis.x * axis.y + axis.z * sintheta) * p.x) +
-		((costheta + (1 - costheta) * axis.y * axis.y) * p.y) +
-		(((1 - costheta) * axis.y * axis.z - axis.x * sintheta) * p.z);
+	      ((costheta + (1 - costheta) * axis.y * axis.y) * p.y) +
+	     (((1 - costheta) * axis.y * axis.z - axis.x * sintheta) * p.z);
 
 	r.z = (((1 - costheta) * axis.x * axis.z - axis.y * sintheta) * p.x) +
-		(((1 - costheta) * axis.y * axis.z + axis.x * sintheta) * p.y) +
-		((costheta + (1 - costheta) * axis.z * axis.z) * p.z);
+	      (((1 - costheta) * axis.y * axis.z + axis.x * sintheta) * p.y) +
+	      ((costheta + (1 - costheta) * axis.z * axis.z) * p.z);
 
 	return r;
 }
@@ -1424,22 +502,27 @@ ccl_device float safe_powf(float a, float b)
 	return compatible_powf(a, b);
 }
 
+ccl_device float safe_divide(float a, float b)
+{
+	return (b != 0.0f)? a/b: 0.0f;
+}
+
 ccl_device float safe_logf(float a, float b)
 {
-	if(UNLIKELY(a < 0.0f || b < 0.0f))
+	if(UNLIKELY(a <= 0.0f || b <= 0.0f))  /* Value and base must both be > 0. */
 		return 0.0f;
 
-	return logf(a)/logf(b);
+	return safe_divide(logf(a), logf(b));  /* Gives 0 when logf(b) is 0. */
 }
 
-ccl_device float safe_divide(float a, float b)
+ccl_device float safe_modulo(float a, float b)
 {
-	return (b != 0.0f)? a/b: 0.0f;
+	return (b != 0.0f)? fmodf(a, b): 0.0f;
 }
 
-ccl_device float safe_modulo(float a, float b)
+ccl_device_inline float sqr(float a)
 {
-	return (b != 0.0f)? fmodf(a, b): 0.0f;
+	return a * a;
 }
 
 ccl_device_inline float beta(float x, float y)
@@ -1451,181 +534,9 @@ ccl_device_inline float beta(float x, float y)
 #endif
 }
 
-/* Ray Intersection */
-
-ccl_device bool ray_sphere_intersect(
-	float3 ray_P, float3 ray_D, float ray_t,
-	float3 sphere_P, float sphere_radius,
-	float3 *isect_P, float *isect_t)
-{
-	float3 d = sphere_P - ray_P;
-	float radiussq = sphere_radius*sphere_radius;
-	float tsq = dot(d, d);
-
-	if(tsq > radiussq) { /* ray origin outside sphere */
-		float tp = dot(d, ray_D);
-
-		if(tp < 0.0f) /* dir points away from sphere */
-			return false;
-
-		float dsq = tsq - tp*tp; /* pythagoras */
-
-		if(dsq > radiussq) /* closest point on ray outside sphere */
-			return false;
-
-		float t = tp - sqrtf(radiussq - dsq); /* pythagoras */
-
-		if(t < ray_t) {
-			*isect_t = t;
-			*isect_P = ray_P + ray_D*t;
-			return true;
-		}
-	}
-
-	return false;
-}
-
-ccl_device bool ray_aligned_disk_intersect(
-	float3 ray_P, float3 ray_D, float ray_t,
-	float3 disk_P, float disk_radius,
-	float3 *isect_P, float *isect_t)
-{
-	/* aligned disk normal */
-	float disk_t;
-	float3 disk_N = normalize_len(ray_P - disk_P, &disk_t);
-	float div = dot(ray_D, disk_N);
-
-	if(UNLIKELY(div == 0.0f))
-		return false;
-
-	/* compute t to intersection point */
-	float t = -disk_t/div;
-	if(t < 0.0f || t > ray_t)
-		return false;
-	
-	/* test if within radius */
-	float3 P = ray_P + ray_D*t;
-	if(len_squared(P - disk_P) > disk_radius*disk_radius)
-		return false;
-
-	*isect_P = P;
-	*isect_t = t;
-
-	return true;
-}
-
-ccl_device bool ray_triangle_intersect(
-	float3 ray_P, float3 ray_D, float ray_t,
-	float3 v0, float3 v1, float3 v2,
-	float3 *isect_P, float *isect_t)
-{
-	/* Calculate intersection */
-	float3 e1 = v1 - v0;
-	float3 e2 = v2 - v0;
-	float3 s1 = cross(ray_D, e2);
-
-	const float divisor = dot(s1, e1);
-	if(UNLIKELY(divisor == 0.0f))
-		return false;
-
-	const float invdivisor = 1.0f/divisor;
-
-	/* compute first barycentric coordinate */
-	const float3 d = ray_P - v0;
-	const float u = dot(d, s1)*invdivisor;
-	if(u < 0.0f)
-		return false;
-
-	/* Compute second barycentric coordinate */
-	const float3 s2 = cross(d, e1);
-	const float v = dot(ray_D, s2)*invdivisor;
-	if(v < 0.0f)
-		return false;
-
-	const float b0 = 1.0f - u - v;
-	if(b0 < 0.0f)
-		return false;
-
-	/* compute t to intersection point */
-	const float t = dot(e2, s2)*invdivisor;
-	if(t < 0.0f || t > ray_t)
-		return false;
-
-	*isect_t = t;
-	*isect_P = ray_P + ray_D*t;
-
-	return true;
-}
-
-ccl_device_inline bool ray_triangle_intersect_uv(
-        float3 ray_P, float3 ray_D, float ray_t,
-        float3 v0, float3 v1, float3 v2,
-        float *isect_u, float *isect_v, float *isect_t)
-{
-	/* Calculate intersection */
-	float3 e1 = v1 - v0;
-	float3 e2 = v2 - v0;
-	float3 s1 = cross(ray_D, e2);
-
-	const float divisor = dot(s1, e1);
-	if(UNLIKELY(divisor == 0.0f))
-		return false;
-
-	const float invdivisor = 1.0f/divisor;
-
-	/* compute first barycentric coordinate */
-	const float3 d = ray_P - v0;
-	const float u = dot(d, s1)*invdivisor;
-	if(u < 0.0f)
-		return false;
-
-	/* Compute second barycentric coordinate */
-	const float3 s2 = cross(d, e1);
-	const float v = dot(ray_D, s2)*invdivisor;
-	if(v < 0.0f)
-		return false;
-
-	const float b0 = 1.0f - u - v;
-	if(b0 < 0.0f)
-		return false;
-
-	/* compute t to intersection point */
-	const float t = dot(e2, s2)*invdivisor;
-	if(t < 0.0f || t > ray_t)
-		return false;
-
-	*isect_u = u;
-	*isect_v = v;
-	*isect_t = t;
-
-	return true;
-}
-
-ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D, float ray_mint, float ray_maxt,
-                                   float3 quad_P, float3 quad_u, float3 quad_v, float3 quad_n,
-                                   float3 *isect_P, float *isect_t, float *isect_u, float *isect_v)
+ccl_device_inline float xor_signmask(float x, int y)
 {
-	float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n);
-	if(t < ray_mint || t > ray_maxt)
-		return false;
-
-	float3 hit = ray_P + t*ray_D;
-	float3 inplane = hit - quad_P;
-
-	float u = dot(inplane, quad_u) / dot(quad_u, quad_u) + 0.5f;
-	if(u < 0.0f || u > 1.0f)
-		return false;
-
-	float v = dot(inplane, quad_v) / dot(quad_v, quad_v) + 0.5f;
-	if(v < 0.0f || v > 1.0f)
-		return false;
-
-	if(isect_P) *isect_P = hit;
-	if(isect_t) *isect_t = t;
-	if(isect_u) *isect_u = u;
-	if(isect_v) *isect_v = v;
-
-	return true;
+	return __int_as_float(__float_as_int(x) ^ y);
 }
 
 /* projections */
@@ -1662,32 +573,6 @@ ccl_device_inline float2 map_to_sphere(const float3 co)
 	return make_float2(u, v);
 }
 
-ccl_device_inline int util_max_axis(float3 vec)
-{
-#ifdef __KERNEL_SSE__
-	__m128 a = shuffle<0,0,1,1>(vec.m128);
-	__m128 b = shuffle<1,2,2,1>(vec.m128);
-	__m128 c = _mm_cmpgt_ps(a, b);
-	int mask = _mm_movemask_ps(c) & 0x7;
-	static const char tab[8] = {2, 2, 2, 0, 1, 2, 1, 0};
-	return tab[mask];
-#else
-	if(vec.x > vec.y) {
-		if(vec.x > vec.z)
-			return 0;
-		else
-			return 2;
-	}
-	else {
-		if(vec.y > vec.z)
-			return 1;
-		else
-			return 2;
-	}
-#endif
-}
-
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_MATH_H__ */
-
diff --git a/intern/cycles/util/util_math_cdf.cpp b/intern/cycles/util/util_math_cdf.cpp
index ec78ca15d88..c14d4793ea1 100644
--- a/intern/cycles/util/util_math_cdf.cpp
+++ b/intern/cycles/util/util_math_cdf.cpp
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#include "util_math_cdf.h"
+#include "util/util_math_cdf.h"
 
-#include "util_algorithm.h"
-#include "util_math.h"
+#include "util/util_algorithm.h"
+#include "util/util_math.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_math_cdf.h b/intern/cycles/util/util_math_cdf.h
index 47dfb68ba44..79643fe26e3 100644
--- a/intern/cycles/util/util_math_cdf.h
+++ b/intern/cycles/util/util_math_cdf.h
@@ -17,9 +17,9 @@
 #ifndef __UTIL_MATH_CDF_H__
 #define __UTIL_MATH_CDF_H__
 
-#include "util_algorithm.h"
-#include "util_math.h"
-#include "util_vector.h"
+#include "util/util_algorithm.h"
+#include "util/util_math.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_math_float2.h b/intern/cycles/util/util_math_float2.h
new file mode 100644
index 00000000000..6f9d0855d50
--- /dev/null
+++ b/intern/cycles/util/util_math_float2.h
@@ -0,0 +1,227 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_FLOAT2_H__
+#define __UTIL_MATH_FLOAT2_H__
+
+#ifndef __UTIL_MATH_H__  /* Set by util_math.h, which pulls this header in. */
+#  error "Do not include this file directly, include util_math.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline float2 operator-(const float2& a);
+ccl_device_inline float2 operator*(const float2& a, const float2& b);
+ccl_device_inline float2 operator*(const float2& a, float f);
+ccl_device_inline float2 operator*(float f, const float2& a);
+ccl_device_inline float2 operator/(float f, const float2& a);
+ccl_device_inline float2 operator/(const float2& a, float f);
+ccl_device_inline float2 operator/(const float2& a, const float2& b);
+ccl_device_inline float2 operator+(const float2& a, const float2& b);
+ccl_device_inline float2 operator-(const float2& a, const float2& b);
+ccl_device_inline float2 operator+=(float2& a, const float2& b);
+ccl_device_inline float2 operator*=(float2& a, const float2& b);
+ccl_device_inline float2 operator*=(float2& a, float f);
+ccl_device_inline float2 operator/=(float2& a, const float2& b);
+ccl_device_inline float2 operator/=(float2& a, float f);
+
+ccl_device_inline bool operator==(const float2& a, const float2& b);
+ccl_device_inline bool operator!=(const float2& a, const float2& b);
+
+ccl_device_inline bool is_zero(const float2& a);
+ccl_device_inline float average(const float2& a);
+ccl_device_inline float dot(const float2& a, const float2& b);
+ccl_device_inline float cross(const float2& a, const float2& b);
+ccl_device_inline float len(const float2& a);
+ccl_device_inline float2 normalize(const float2& a);
+ccl_device_inline float2 normalize_len(const float2& a, float *t);
+ccl_device_inline float2 safe_normalize(const float2& a);
+ccl_device_inline float2 min(const float2& a, const float2& b);
+ccl_device_inline float2 max(const float2& a, const float2& b);
+ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx);
+ccl_device_inline float2 fabs(const float2& a);
+ccl_device_inline float2 as_float2(const float4& a);
+ccl_device_inline float2 interp(const float2& a, const float2& b, float t);
+#endif  /* !__KERNEL_OPENCL__ */
+
+/*******************************************************************************
+ * Definition.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline float2 operator-(const float2& a)
+{
+	return make_float2(-a.x, -a.y);
+}
+
+ccl_device_inline float2 operator*(const float2& a, const float2& b)
+{
+	return make_float2(a.x*b.x, a.y*b.y);
+}
+
+ccl_device_inline float2 operator*(const float2& a, float f)
+{
+	return make_float2(a.x*f, a.y*f);
+}
+
+ccl_device_inline float2 operator*(float f, const float2& a)
+{
+	return a * f;  /* Same per-component products as the float2*float overload. */
+}
+
+ccl_device_inline float2 operator/(float f, const float2& a)
+{
+	return make_float2(f/a.x, f/a.y);
+}
+
+ccl_device_inline float2 operator/(const float2& a, float f)
+{
+	const float inv_f = 1.0f/f;  /* Divide once, then scale both components. */
+	return a * inv_f;
+}
+
+ccl_device_inline float2 operator/(const float2& a, const float2& b)
+{
+	return make_float2(a.x/b.x, a.y/b.y);
+}
+
+ccl_device_inline float2 operator+(const float2& a, const float2& b)
+{
+	return make_float2(a.x+b.x, a.y+b.y);
+}
+
+ccl_device_inline float2 operator-(const float2& a, const float2& b)
+{
+	return make_float2(a.x-b.x, a.y-b.y);
+}
+
+ccl_device_inline float2 operator+=(float2& a, const float2& b)
+{
+	return a = a + b;
+}
+
+ccl_device_inline float2 operator*=(float2& a, const float2& b)
+{
+	return a = a * b;
+}
+
+ccl_device_inline float2 operator*=(float2& a, float f)
+{
+	return a = a * f;
+}
+
+ccl_device_inline float2 operator/=(float2& a, const float2& b)
+{
+	return a = a / b;
+}
+
+ccl_device_inline float2 operator/=(float2& a, float f)
+{
+	float invf = 1.0f/f;
+	return a = a * invf;
+}
+
+ccl_device_inline bool operator==(const float2& a, const float2& b)
+{
+	return (a.x == b.x && a.y == b.y);
+}
+
+ccl_device_inline bool operator!=(const float2& a, const float2& b)
+{
+	return !(a == b);
+}
+
+ccl_device_inline bool is_zero(const float2& a)
+{
+	return (a.x == 0.0f && a.y == 0.0f);
+}
+
+ccl_device_inline float average(const float2& a)
+{
+	return 0.5f*(a.x + a.y);  /* Arithmetic mean of the two components. */
+}
+
+ccl_device_inline float dot(const float2& a, const float2& b)
+{
+	return a.x*b.x + a.y*b.y;
+}
+
+ccl_device_inline float cross(const float2& a, const float2& b)
+{
+	return (a.x*b.y - a.y*b.x);
+}
+
+ccl_device_inline float len(const float2& a)
+{
+	return sqrtf(dot(a, a));
+}
+
+ccl_device_inline float2 normalize(const float2& a)
+{
+	return a/len(a);
+}
+
+ccl_device_inline float2 normalize_len(const float2& a, float *t)
+{
+	*t = len(a);
+	return a/(*t);
+}
+
+ccl_device_inline float2 safe_normalize(const float2& a)
+{
+	float t = len(a);
+	return (t != 0.0f)? a/t: a;
+}
+
+ccl_device_inline float2 min(const float2& a, const float2& b)
+{
+	return make_float2(min(a.x, b.x), min(a.y, b.y));
+}
+
+ccl_device_inline float2 max(const float2& a, const float2& b)
+{
+	return make_float2(max(a.x, b.x), max(a.y, b.y));
+}
+
+ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx)
+{
+	return min(max(a, mn), mx);
+}
+
+ccl_device_inline float2 fabs(const float2& a)
+{
+	return make_float2(fabsf(a.x), fabsf(a.y));
+}
+
+ccl_device_inline float2 as_float2(const float4& a)
+{
+	return make_float2(a.x, a.y);
+}
+
+ccl_device_inline float2 interp(const float2& a, const float2& b, float t)
+{
+	return a + t*(b - a);
+}
+#endif  /* !__KERNEL_OPENCL__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_FLOAT2_H__ */
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
new file mode 100644
index 00000000000..f5149fe13ed
--- /dev/null
+++ b/intern/cycles/util/util_math_float3.h
@@ -0,0 +1,395 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_FLOAT3_H__
+#define __UTIL_MATH_FLOAT3_H__
+
+#ifndef __UTIL_MATH_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline float3 operator-(const float3& a);
+ccl_device_inline float3 operator*(const float3& a, const float3& b);
+ccl_device_inline float3 operator*(const float3& a, const float f);
+ccl_device_inline float3 operator*(const float f, const float3& a);
+ccl_device_inline float3 operator/(const float f, const float3& a);
+ccl_device_inline float3 operator/(const float3& a, const float f);
+ccl_device_inline float3 operator/(const float3& a, const float3& b);
+ccl_device_inline float3 operator+(const float3& a, const float3& b);
+ccl_device_inline float3 operator-(const float3& a, const float3& b);
+ccl_device_inline float3 operator+=(float3& a, const float3& b);
+ccl_device_inline float3 operator-=(float3& a, const float3& b);
+ccl_device_inline float3 operator*=(float3& a, const float3& b);
+ccl_device_inline float3 operator*=(float3& a, float f);
+ccl_device_inline float3 operator/=(float3& a, const float3& b);
+ccl_device_inline float3 operator/=(float3& a, float f);
+
+ccl_device_inline bool operator==(const float3& a, const float3& b);
+ccl_device_inline bool operator!=(const float3& a, const float3& b);
+
+ccl_device_inline float dot(const float3& a, const float3& b);
+ccl_device_inline float dot_xy(const float3& a, const float3& b);
+ccl_device_inline float3 cross(const float3& a, const float3& b);
+ccl_device_inline float3 normalize(const float3& a);
+ccl_device_inline float3 min(const float3& a, const float3& b);
+ccl_device_inline float3 max(const float3& a, const float3& b);
+ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx);
+ccl_device_inline float3 fabs(const float3& a);
+ccl_device_inline float3 mix(const float3& a, const float3& b, float t);
+ccl_device_inline float3 rcp(const float3& a);
+ccl_device_inline float3 sqrt(const float3& a);
+#endif  /* !__KERNEL_OPENCL__ */
+
+ccl_device_inline float min3(float3 a);
+ccl_device_inline float max3(float3 a);
+ccl_device_inline float len(const float3 a);
+ccl_device_inline float len_squared(const float3 a);
+
+ccl_device_inline float3 saturate3(float3 a);
+ccl_device_inline float3 safe_normalize(const float3 a);
+ccl_device_inline float3 normalize_len(const float3 a, float *t);
+ccl_device_inline float3 safe_normalize_len(const float3 a, float *t);
+ccl_device_inline float3 interp(float3 a, float3 b, float t);
+
+ccl_device_inline bool is_zero(const float3 a);
+ccl_device_inline float reduce_add(const float3 a);
+ccl_device_inline float average(const float3 a);
+ccl_device_inline bool isequal_float3(const float3 a, const float3 b);
+
+/*******************************************************************************
+ * Definition.
+ */
+
+#ifndef __KERNEL_OPENCL__
+/* Unary minus: negate every component. */
+ccl_device_inline float3 operator-(const float3& a)
+{
+#ifdef __KERNEL_SSE__
+	/* XOR with 0x80000000 flips the sign bit of each lane. */
+	const __m128 sign_bits = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+	return float3(_mm_xor_ps(a.m128, sign_bits));
+#else
+	return make_float3(-a.x, -a.y, -a.z);
+#endif
+}
+
+/* Component-wise product of two vectors. */
+ccl_device_inline float3 operator*(const float3& a, const float3& b)
+{
+#ifdef __KERNEL_SSE__
+	return float3(_mm_mul_ps(a.m128,b.m128));
+#else
+	return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
+#endif
+}
+
+/* Scale every component of a by the scalar f. */
+ccl_device_inline float3 operator*(const float3& a, const float f)
+{
+#ifdef __KERNEL_SSE__
+	return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f)));
+#else
+	return make_float3(a.x*f, a.y*f, a.z*f);
+#endif
+}
+
+/* Scalar-on-the-left form of vector scaling. */
+ccl_device_inline float3 operator*(const float f, const float3& a)
+{
+#if defined(__KERNEL_SSE__)
+	return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
+#else
+	return make_float3(a.x*f, a.y*f, a.z*f);
+#endif
+}
+
+/* Divide the scalar f by each component of a (f / a.x, f / a.y, f / a.z). */
+ccl_device_inline float3 operator/(const float f, const float3& a)
+{
+#if defined(__KERNEL_SSE__)
+	return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
+#else
+	return make_float3(f / a.x, f / a.y, f / a.z);
+#endif
+}
+
+/* Divide a by the scalar f, implemented as one reciprocal followed by a
+ * vector multiply rather than three divisions. */
+ccl_device_inline float3 operator/(const float3& a, const float f)
+{
+	const float reciprocal = 1.0f/f;
+	return a * reciprocal;
+}
+
+/* Component-wise division of a by b. */
+ccl_device_inline float3 operator/(const float3& a, const float3& b)
+{
+#if defined(__KERNEL_SSE__)
+	return float3(_mm_div_ps(a.m128, b.m128));
+#else
+	return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
+#endif
+}
+
+/* Component-wise sum. */
+ccl_device_inline float3 operator+(const float3& a, const float3& b)
+{
+#ifdef __KERNEL_SSE__
+	return float3(_mm_add_ps(a.m128, b.m128));
+#else
+	return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+#endif
+}
+
+/* Component-wise difference a - b. */
+ccl_device_inline float3 operator-(const float3& a, const float3& b)
+{
+#ifdef __KERNEL_SSE__
+	return float3(_mm_sub_ps(a.m128, b.m128));
+#else
+	return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
+#endif
+}
+
+/* In-place addition; returns the updated value by copy. */
+ccl_device_inline float3 operator+=(float3& a, const float3& b)
+{
+	a = a + b;
+	return a;
+}
+
+/* In-place subtraction; returns the updated value by copy. */
+ccl_device_inline float3 operator-=(float3& a, const float3& b)
+{
+	a = a - b;
+	return a;
+}
+
+/* In-place component-wise multiplication; returns the updated value by copy. */
+ccl_device_inline float3 operator*=(float3& a, const float3& b)
+{
+	a = a * b;
+	return a;
+}
+
+/* In-place scaling by the scalar f; returns the updated value by copy. */
+ccl_device_inline float3 operator*=(float3& a, float f)
+{
+	a = a * f;
+	return a;
+}
+
+/* In-place component-wise division; returns the updated value by copy. */
+ccl_device_inline float3 operator/=(float3& a, const float3& b)
+{
+	a = a / b;
+	return a;
+}
+
+/* In-place division by the scalar f, done as a multiply by 1/f;
+ * returns the updated value by copy. */
+ccl_device_inline float3 operator/=(float3& a, float f)
+{
+	const float reciprocal = 1.0f/f;
+	a = a * reciprocal;
+	return a;
+}
+
+/* Exact equality of the x, y and z components. */
+ccl_device_inline bool operator==(const float3& a, const float3& b)
+{
+#ifdef __KERNEL_SSE__
+	/* Only the low three bits of the move-mask are tested, so the fourth
+	 * lane of the register does not take part in the comparison. */
+	const int equal_lanes = _mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128));
+	return (equal_lanes & 7) == 7;
+#else
+	return (a.x == b.x && a.y == b.y && a.z == b.z);
+#endif
+}
+
+/* Logical negation of operator== for float3. */
+ccl_device_inline bool operator!=(const float3& a, const float3& b)
+{
+	const bool same = (a == b);
+	return !same;
+}
+
+/* Dot product over x, y and z. */
+ccl_device_inline float dot(const float3& a, const float3& b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+	/* Mask 0x7F: multiply the three low lanes only, so the fourth lane
+	 * does not contribute to the sum. */
+	return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
+#else
+	return a.x*b.x + a.y*b.y + a.z*b.z;
+#endif
+}
+
+/* Dot product restricted to the x and y components. */
+ccl_device_inline float dot_xy(const float3& a, const float3& b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+	/* The horizontal add puts a.x*b.x + a.y*b.y in lane 0, which is the only
+	 * lane read back; the second operand merely fills the unused upper lanes. */
+	return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b));
+#else
+	return a.x*b.x + a.y*b.y;
+#endif
+}
+
+/* 3D cross product a x b. */
+ccl_device_inline float3 cross(const float3& a, const float3& b)
+{
+	return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
+}
+
+/* Scale a to unit length. A zero-length input is not guarded against;
+ * see safe_normalize() for the checked variant. */
+ccl_device_inline float3 normalize(const float3& a)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+	/* Length of the xyz part, broadcast to every lane. */
+	const __m128 length = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
+	return float3(_mm_div_ps(a.m128, length));
+#else
+	return a/len(a);
+#endif
+}
+
+/* Component-wise minimum. */
+ccl_device_inline float3 min(const float3& a, const float3& b)
+{
+#ifdef __KERNEL_SSE__
+	return float3(_mm_min_ps(a.m128, b.m128));
+#else
+	return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+#endif
+}
+
+/* Component-wise maximum. */
+ccl_device_inline float3 max(const float3& a, const float3& b)
+{
+#ifdef __KERNEL_SSE__
+	return float3(_mm_max_ps(a.m128, b.m128));
+#else
+	return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+#endif
+}
+
+/* Component-wise clamp: raise a to at least mn, then cap it at mx. */
+ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx)
+{
+	const float3 raised = max(a, mn);
+	return min(raised, mx);
+}
+
+/* Component-wise absolute value. */
+ccl_device_inline float3 fabs(const float3& a)
+{
+#ifdef __KERNEL_SSE__
+	/* AND with 0x7fffffff clears the sign bit of each lane. */
+	const __m128 magnitude_bits = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+	return float3(_mm_and_ps(a.m128, magnitude_bits));
+#else
+	return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
+#endif
+}
+
+/* Component-wise square root. */
+ccl_device_inline float3 sqrt(const float3& a)
+{
+#ifdef __KERNEL_SSE__
+	return float3(_mm_sqrt_ps(a));
+#else
+	return make_float3(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z));
+#endif
+}
+
+/* Linear blend: yields a at t == 0 and moves towards b as t grows.
+ * t is not clamped. */
+ccl_device_inline float3 mix(const float3& a, const float3& b, float t)
+{
+	return a + t*(b - a);
+}
+
+/* Component-wise reciprocal (1 / a), computed with a real division. */
+ccl_device_inline float3 rcp(const float3& a)
+{
+#ifdef __KERNEL_SSE__
+	/* Don't use _mm_rcp_ps due to poor precision. */
+	return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
+#else
+	return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z);
+#endif
+}
+#endif  /* !__KERNEL_OPENCL__ */
+
+/* Smallest of the three components. */
+ccl_device_inline float min3(float3 a)
+{
+	const float smallest_xy = min(a.x, a.y);
+	return min(smallest_xy, a.z);
+}
+
+/* Largest of the three components. */
+ccl_device_inline float max3(float3 a)
+{
+	const float largest_xy = max(a.x, a.y);
+	return max(largest_xy, a.z);
+}
+
+/* Euclidean length of the xyz vector. */
+ccl_device_inline float len(const float3 a)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+	/* Dot product of the three low lanes, then a scalar square root. */
+	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F)));
+#else
+	return sqrtf(dot(a, a));
+#endif
+}
+
+/* Squared length of a, i.e. dot(a, a); no square root is taken. */
+ccl_device_inline float len_squared(const float3 a)
+{
+	return dot(a, a);
+}
+
+/* Apply the scalar saturate() helper to each component independently. */
+ccl_device_inline float3 saturate3(float3 a)
+{
+	return make_float3(saturate(a.x), saturate(a.y), saturate(a.z));
+}
+
+/* Normalize a and also report its original length through t.
+ * The length is not checked for zero. */
+ccl_device_inline float3 normalize_len(const float3 a, float *t)
+{
+	*t = len(a);
+	const float inv_length = 1.0f / *t;
+	return a*inv_length;
+}
+
+/* Normalize a, returning it unchanged when its length is exactly zero. */
+ccl_device_inline float3 safe_normalize(const float3 a)
+{
+	const float length = len(a);
+	if(length != 0.0f) {
+		return a * (1.0f/length);
+	}
+	return a;
+}
+
+/* Like safe_normalize(), and additionally stores the length of a in t
+ * (zero when a is returned unchanged). */
+ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
+{
+	*t = len(a);
+	if(*t != 0.0f) {
+		return a/(*t);
+	}
+	return a;
+}
+
+/* Linear interpolation: yields a at t == 0 and moves towards b as t grows.
+ * t is not clamped. */
+ccl_device_inline float3 interp(float3 a, float3 b, float t)
+{
+	return a + t*(b - a);
+}
+
+/* True when x, y and z all compare equal to 0.0f. */
+ccl_device_inline bool is_zero(const float3 a)
+{
+#ifdef __KERNEL_SSE__
+	/* Reuses the vector operator==. */
+	return a == make_float3(0.0f);
+#else
+	return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f);
+#endif
+}
+
+/* Sum of the three components. */
+ccl_device_inline float reduce_add(const float3 a)
+{
+	return (a.x + a.y + a.z);
+}
+
+/* Arithmetic mean of the three components. */
+ccl_device_inline float average(const float3 a)
+{
+	const float total = reduce_add(a);
+	return total*(1.0f/3.0f);
+}
+
+/* Equality test usable from every kernel flavour: OpenCL reduces its
+ * component-wise comparison result with all(), other builds call operator==. */
+ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
+{
+#ifdef __KERNEL_OPENCL__
+	return all(a == b);
+#else
+	return a == b;
+#endif
+}
+
+/* True only when isfinite_safe() accepts each of x, y and z; evaluation
+ * stops at the first component that fails. */
+ccl_device_inline bool isfinite3_safe(float3 v)
+{
+	if(!isfinite_safe(v.x)) {
+		return false;
+	}
+	if(!isfinite_safe(v.y)) {
+		return false;
+	}
+	return isfinite_safe(v.z);
+}
+
+/* Return v with every component rejected by isfinite_safe() replaced by 0.0f. */
+ccl_device_inline float3 ensure_finite3(float3 v)
+{
+	if(!isfinite_safe(v.x)) {
+		v.x = 0.0f;
+	}
+	if(!isfinite_safe(v.y)) {
+		v.y = 0.0f;
+	}
+	if(!isfinite_safe(v.z)) {
+		v.z = 0.0f;
+	}
+	return v;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_FLOAT3_H__ */
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
new file mode 100644
index 00000000000..aa7e56fefe9
--- /dev/null
+++ b/intern/cycles/util/util_math_float4.h
@@ -0,0 +1,448 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_FLOAT4_H__
+#define __UTIL_MATH_FLOAT4_H__
+
+#ifndef __UTIL_MATH_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline float4 operator-(const float4& a);
+ccl_device_inline float4 operator*(const float4& a, const float4& b);
+ccl_device_inline float4 operator*(const float4& a, float f);
+ccl_device_inline float4 operator*(float f, const float4& a);
+ccl_device_inline float4 operator/(const float4& a, float f);
+ccl_device_inline float4 operator/(const float4& a, const float4& b);
+ccl_device_inline float4 operator+(const float4& a, const float4& b);
+ccl_device_inline float4 operator-(const float4& a, const float4& b);
+ccl_device_inline float4 operator+=(float4& a, const float4& b);
+ccl_device_inline float4 operator*=(float4& a, const float4& b);
+ccl_device_inline float4 operator/=(float4& a, float f);
+
+ccl_device_inline int4 operator<(const float4& a, const float4& b);
+ccl_device_inline int4 operator>=(const float4& a, const float4& b);
+ccl_device_inline int4 operator<=(const float4& a, const float4& b);
+ccl_device_inline bool operator==(const float4& a, const float4& b);
+
+ccl_device_inline float dot(const float4& a, const float4& b);
+ccl_device_inline float len_squared(const float4& a);
+ccl_device_inline float4 rcp(const float4& a);
+ccl_device_inline float4 sqrt(const float4& a);
+ccl_device_inline float4 sqr(const float4& a);
+ccl_device_inline float4 cross(const float4& a, const float4& b);
+ccl_device_inline bool is_zero(const float4& a);
+ccl_device_inline float average(const float4& a);
+ccl_device_inline float len(const float4& a);
+ccl_device_inline float4 normalize(const float4& a);
+ccl_device_inline float4 safe_normalize(const float4& a);
+ccl_device_inline float4 min(const float4& a, const float4& b);
+ccl_device_inline float4 max(const float4& a, const float4& b);
+ccl_device_inline float4 fabs(const float4& a);
+#endif  /* !__KERNEL_OPENCL__*/
+
+#ifdef __KERNEL_SSE__
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& b);
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& a, const float4& b);
+
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b);
+
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b);
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b);
+
+#  ifdef __KERNEL_SSE3__
+template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b);
+template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b);
+#  endif
+#endif  /* __KERNEL_SSE__ */
+
+#ifndef __KERNEL_GPU__
+ccl_device_inline float4 select(const int4& mask,
+                                const float4& a,
+                                const float4& b);
+ccl_device_inline float4 reduce_min(const float4& a);
+ccl_device_inline float4 reduce_max(const float4& a);
+ccl_device_inline float4 reduce_add(const float4& a);
+#endif  /* !__KERNEL_GPU__ */
+
+/*******************************************************************************
+ * Definition.
+ */
+
+#ifndef __KERNEL_OPENCL__
+/* Unary minus: negate every component. */
+ccl_device_inline float4 operator-(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	/* XOR with 0x80000000 flips the sign bit of each lane. */
+	return float4(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+#else
+	return make_float4(-a.x, -a.y, -a.z, -a.w);
+#endif
+}
+
+/* Component-wise product of two vectors. */
+ccl_device_inline float4 operator*(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_mul_ps(a.m128, b.m128));
+#else
+	return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+#endif
+}
+
+/* Scale every component of a by the scalar f. */
+ccl_device_inline float4 operator*(const float4& a, float f)
+{
+#if defined(__KERNEL_SSE__)
+	/* Broadcast f and reuse the vector-vector multiply. */
+	return a * make_float4(f);
+#else
+	return make_float4(a.x*f, a.y*f, a.z*f, a.w*f);
+#endif
+}
+
+/* Scalar-on-the-left scaling; forwards to operator*(float4, float). */
+ccl_device_inline float4 operator*(float f, const float4& a)
+{
+	return a * f;
+}
+
+/* Divide a by the scalar f, implemented as a multiply by 1/f. */
+ccl_device_inline float4 operator/(const float4& a, float f)
+{
+	const float reciprocal = 1.0f/f;
+	return a * reciprocal;
+}
+
+/* Component-wise division of a by b. */
+ccl_device_inline float4 operator/(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	const __m128 quotient = _mm_div_ps(a.m128, b.m128);
+	return float4(quotient);
+#else
+	return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+#endif
+}
+
+/* Component-wise sum. */
+ccl_device_inline float4 operator+(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_add_ps(a.m128, b.m128));
+#else
+	return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+#endif
+}
+
+/* Component-wise difference a - b. */
+ccl_device_inline float4 operator-(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_sub_ps(a.m128, b.m128));
+#else
+	return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+#endif
+}
+
+/* In-place addition; returns the updated value by copy. */
+ccl_device_inline float4 operator+=(float4& a, const float4& b)
+{
+	a = a + b;
+	return a;
+}
+
+/* In-place component-wise multiplication; returns the updated value by copy. */
+ccl_device_inline float4 operator*=(float4& a, const float4& b)
+{
+	a = a * b;
+	return a;
+}
+
+/* In-place division by the scalar f; returns the updated value by copy. */
+ccl_device_inline float4 operator/=(float4& a, float f)
+{
+	a = a / f;
+	return a;
+}
+
+/* Per-component a < b. The SSE path returns the raw lane masks produced by
+ * the packed compare, the scalar path stores each boolean result as an int. */
+ccl_device_inline int4 operator<(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	const __m128 lanes = _mm_cmplt_ps(a.m128, b.m128);
+	return int4(_mm_castps_si128(lanes));
+#else
+	return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
+#endif
+}
+
+/* Per-component a >= b. The SSE path returns the raw lane masks produced by
+ * the packed compare, the scalar path stores each boolean result as an int. */
+ccl_device_inline int4 operator>=(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	const __m128 lanes = _mm_cmpge_ps(a.m128, b.m128);
+	return int4(_mm_castps_si128(lanes));
+#else
+	return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
+#endif
+}
+
+/* Per-component a <= b. The SSE path returns the raw lane masks produced by
+ * the packed compare, the scalar path stores each boolean result as an int. */
+ccl_device_inline int4 operator<=(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	const __m128 lanes = _mm_cmple_ps(a.m128, b.m128);
+	return int4(_mm_castps_si128(lanes));
+#else
+	return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
+#endif
+}
+
+/* Exact equality of all four components. */
+ccl_device_inline bool operator==(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	/* All four move-mask bits must be set. */
+	const int equal_lanes = _mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128));
+	return (equal_lanes & 15) == 15;
+#else
+	return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w);
+#endif
+}
+
+/* Four-component dot product. */
+ccl_device_inline float dot(const float4& a, const float4& b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+	/* Mask 0xFF: all four lanes take part in the product. */
+	return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
+#else
+	return (a.x*b.x + a.y*b.y) + (a.z*b.z + a.w*b.w);
+#endif
+}
+
+/* Squared length of a, i.e. dot(a, a); no square root is taken. */
+ccl_device_inline float len_squared(const float4& a)
+{
+	return dot(a, a);
+}
+
+/* Component-wise reciprocal (1 / a), computed with a real division. */
+ccl_device_inline float4 rcp(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	/* Don't use _mm_rcp_ps due to poor precision. */
+	return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
+#else
+	return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w);
+#endif
+}
+
+/* Component-wise square root. */
+ccl_device_inline float4 sqrt(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	const __m128 roots = _mm_sqrt_ps(a.m128);
+	return float4(roots);
+#else
+	return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
+#endif
+}
+
+/* Component-wise square (a * a). */
+ccl_device_inline float4 sqr(const float4& a)
+{
+	return a * a;
+}
+
+/* 3D cross product of the xyz parts of a and b. The scalar path sets the
+ * w component of the result to 0.0f. */
+ccl_device_inline float4 cross(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	/* (a.yzx * b.zxy) - (a.zxy * b.yzx); the fourth lane of both products is
+	 * a.x*b.x, so it cancels in the subtraction. */
+	return (shuffle<1,2,0,0>(a)*shuffle<2,0,1,0>(b)) -
+	       (shuffle<2,0,1,0>(a)*shuffle<1,2,0,0>(b));
+#else
+	return make_float4(a.y*b.z - a.z*b.y,
+	                   a.z*b.x - a.x*b.z,
+	                   a.x*b.y - a.y*b.x,
+	                   0.0f);
+#endif
+}
+
+/* True when all four components compare equal to 0.0f. */
+ccl_device_inline bool is_zero(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	/* Reuses the vector operator==. */
+	return a == make_float4(0.0f);
+#else
+	return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
+#endif
+}
+
+/* Horizontal sum: every lane of the result holds a.x + a.y + a.z + a.w. */
+ccl_device_inline float4 reduce_add(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE3__
+	/* Two rounds of horizontal adds. */
+	const float4 pairs(_mm_hadd_ps(a.m128, a.m128));
+	return float4(_mm_hadd_ps(pairs.m128, pairs.m128));
+#  else
+	/* Add neighbouring lanes first, then the two halves. */
+	const float4 pairs(shuffle<1,0,3,2>(a) + a);
+	return shuffle<2,3,0,1>(pairs) + pairs;
+#  endif
+#else
+	const float total = (a.x + a.y) + (a.z + a.w);
+	return make_float4(total, total, total, total);
+#endif
+}
+
+/* Arithmetic mean of the four components. */
+ccl_device_inline float average(const float4& a)
+{
+	return reduce_add(a).x * 0.25f;
+}
+
+/* Euclidean length over all four components. */
+ccl_device_inline float len(const float4& a)
+{
+	const float length_sq = dot(a, a);
+	return sqrtf(length_sq);
+}
+
+/* Scale a to unit length. A zero-length input is not guarded against;
+ * see safe_normalize() for the checked variant. */
+ccl_device_inline float4 normalize(const float4& a)
+{
+	const float length = len(a);
+	return a/length;
+}
+
+/* Normalize a, returning it unchanged when its length is exactly zero. */
+ccl_device_inline float4 safe_normalize(const float4& a)
+{
+	const float length = len(a);
+	if(length != 0.0f) {
+		return a/length;
+	}
+	return a;
+}
+
+/* Component-wise minimum. */
+ccl_device_inline float4 min(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_min_ps(a.m128, b.m128));
+#else
+	return make_float4(min(a.x, b.x),
+	                   min(a.y, b.y),
+	                   min(a.z, b.z),
+	                   min(a.w, b.w));
+#endif
+}
+
+/* Component-wise maximum. */
+ccl_device_inline float4 max(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_max_ps(a.m128, b.m128));
+#else
+	return make_float4(max(a.x, b.x),
+	                   max(a.y, b.y),
+	                   max(a.z, b.z),
+	                   max(a.w, b.w));
+#endif
+}
+
+/* Component-wise absolute value. */
+ccl_device_inline float4 fabs(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	/* AND with 0x7fffffff clears the sign bit of each lane. */
+	const __m128 magnitude_bits = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+	return float4(_mm_and_ps(a.m128, magnitude_bits));
+#else
+	return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+#endif
+}
+#endif  /* !__KERNEL_OPENCL__*/
+
+#ifdef __KERNEL_SSE__
+/* Generic single-source lane permutation: result lane k is taken from lane
+ * index_k of b. The indices are compile-time template arguments. */
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& b)
+{
+	/* _MM_SHUFFLE lists lanes from highest to lowest, hence the reversed order. */
+	return float4(_mm_castsi128_ps(
+	        _mm_shuffle_epi32(_mm_castps_si128(b),
+	                          _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
+}
+
+/* Generic two-source lane permutation: the two low result lanes are picked
+ * from a (index_0, index_1), the two high result lanes from b (index_2, index_3). */
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& a, const float4& b)
+{
+	return float4(_mm_shuffle_ps(a.m128, b.m128,
+	                             _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
+}
+
+/* Specialization producing (b.x, b.y, b.x, b.y).
+ *
+ * Uses _mm_movelh_ps, which only needs baseline SSE. This block is compiled
+ * whenever __KERNEL_SSE__ is defined, without requiring __KERNEL_SSE3__, so
+ * the SSE3-only _mm_movedup_pd must not be used here. */
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b)
+{
+	return float4(_mm_movelh_ps(b.m128, b.m128));
+}
+
+/* Specialization producing (a.x, a.y, b.x, b.y) with a single move. */
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b)
+{
+	return float4(_mm_movelh_ps(a.m128, b.m128));
+}
+
+/* Specialization producing (a.z, a.w, b.z, b.w) with a single move.
+ * Note the swapped argument order expected by _mm_movehl_ps. */
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b)
+{
+	return float4(_mm_movehl_ps(b.m128, a.m128));
+}
+
+#  ifdef __KERNEL_SSE3__
+/* SSE3 specialization producing (b.x, b.x, b.z, b.z). */
+template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b)
+{
+	return float4(_mm_moveldup_ps(b));
+}
+
+/* SSE3 specialization producing (b.y, b.y, b.w, b.w). */
+template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b)
+{
+	return float4(_mm_movehdup_ps(b));
+}
+#  endif  /* __KERNEL_SSE3__ */
+#endif  /* __KERNEL_SSE__ */
+
+#ifndef __KERNEL_GPU__
+/* Per-component choice between a and b: a where the mask component is set,
+ * b otherwise.
+ * NOTE(review): the scalar path treats any non-zero mask value as "set", while
+ * the SSE path hands the mask bits to _mm_blendv_ps -- presumably masks are
+ * expected to come from the vector comparison operators; confirm with callers. */
+ccl_device_inline float4 select(const int4& mask,
+                                const float4& a,
+                                const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128)));
+#else
+	return make_float4((mask.x)? a.x: b.x,
+	                   (mask.y)? a.y: b.y,
+	                   (mask.z)? a.z: b.z,
+	                   (mask.w)? a.w: b.w);
+#endif
+}
+
+/* Keep the components of a where mask is set and replace the others with
+ * zero; implemented as select() against an all-zero vector. */
+ccl_device_inline float4 mask(const int4& mask,
+                              const float4& a)
+{
+	const float4 zeros = make_float4(0.0f);
+	return select(mask, a, zeros);
+}
+
+/* Horizontal minimum: every lane of the result holds the smallest of the
+ * four components of a. */
+ccl_device_inline float4 reduce_min(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	/* Reduce neighbouring lanes first, then the two halves. */
+	float4 h = min(shuffle<1,0,3,2>(a), a);
+	return min(shuffle<2,3,0,1>(h), h);
+#else
+	return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
+#endif
+}
+
+/* Horizontal maximum: every lane of the result holds the largest of the
+ * four components of a. */
+ccl_device_inline float4 reduce_max(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	/* Reduce neighbouring lanes first, then the two halves. */
+	float4 h = max(shuffle<1,0,3,2>(a), a);
+	return max(shuffle<2,3,0,1>(h), h);
+#else
+	return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
+#endif
+}
+
+/* Build a float4 from the four consecutive floats starting at v.
+ * The SSE path uses the unaligned load, so v needs no special alignment. */
+ccl_device_inline float4 load_float4(const float *v)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_loadu_ps(v));
+#else
+	return make_float4(v[0], v[1], v[2], v[3]);
+#endif
+}
+
+#endif  /* !__KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_FLOAT4_H__ */
diff --git a/intern/cycles/util/util_math_int2.h b/intern/cycles/util/util_math_int2.h
new file mode 100644
index 00000000000..828c49a131c
--- /dev/null
+++ b/intern/cycles/util/util_math_int2.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_INT2_H__
+#define __UTIL_MATH_INT2_H__
+
+#ifndef __UTIL_MATH_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline bool operator==(const int2 a, const int2 b);
+ccl_device_inline int2 operator+(const int2 &a, const int2 &b);
+ccl_device_inline int2 operator+=(int2 &a, const int2 &b);
+ccl_device_inline int2 operator-(const int2 &a, const int2 &b);
+ccl_device_inline int2 operator*(const int2 &a, const int2 &b);
+ccl_device_inline int2 operator/(const int2 &a, const int2 &b);
+#endif  /* !__KERNEL_OPENCL__ */
+
+/*******************************************************************************
+ * Definition.
+ */
+
+#ifndef __KERNEL_OPENCL__
+/* True when both components are equal. */
+ccl_device_inline bool operator==(const int2 a, const int2 b)
+{
+	if(a.x != b.x) {
+		return false;
+	}
+	return (a.y == b.y);
+}
+
+/* Component-wise sum. */
+ccl_device_inline int2 operator+(const int2 &a, const int2 &b)
+{
+	return make_int2(a.x + b.x, a.y + b.y);
+}
+
+/* In-place addition; returns the updated value by copy. */
+ccl_device_inline int2 operator+=(int2 &a, const int2 &b)
+{
+	a = a + b;
+	return a;
+}
+
+/* Component-wise difference a - b. */
+ccl_device_inline int2 operator-(const int2 &a, const int2 &b)
+{
+	return make_int2(a.x - b.x, a.y - b.y);
+}
+
+/* Component-wise product. */
+ccl_device_inline int2 operator*(const int2 &a, const int2 &b)
+{
+	return make_int2(a.x * b.x, a.y * b.y);
+}
+
+/* Component-wise integer division a / b. No check is made for zero
+ * components in b. */
+ccl_device_inline int2 operator/(const int2 &a, const int2 &b)
+{
+	return make_int2(a.x / b.x, a.y / b.y);
+}
+#endif  /* !__KERNEL_OPENCL__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_INT2_H__ */
diff --git a/intern/cycles/util/util_math_int3.h b/intern/cycles/util/util_math_int3.h
new file mode 100644
index 00000000000..6eef8517665
--- /dev/null
+++ b/intern/cycles/util/util_math_int3.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_INT3_H__
+#define __UTIL_MATH_INT3_H__
+
+#ifndef __UTIL_MATH_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline int3 min(int3 a, int3 b);
+ccl_device_inline int3 max(int3 a, int3 b);
+ccl_device_inline int3 clamp(const int3& a, int mn, int mx);
+ccl_device_inline int3 clamp(const int3& a, const int3& mn, int mx);
+#endif  /* !__KERNEL_OPENCL__ */
+
+/*******************************************************************************
+ * Definition.
+ */
+
+#ifndef __KERNEL_OPENCL__
+/* Component-wise minimum. The packed integer minimum is only used when
+ * SSE4.1 is available; otherwise each component is handled separately. */
+ccl_device_inline int3 min(int3 a, int3 b)
+{
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+	return int3(_mm_min_epi32(a.m128, b.m128));
+#else
+	return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+#endif
+}
+
+/* Component-wise maximum. The packed integer maximum is only used when
+ * SSE4.1 is available; otherwise each component is handled separately. */
+ccl_device_inline int3 max(int3 a, int3 b)
+{
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+	return int3(_mm_max_epi32(a.m128, b.m128));
+#else
+	return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+#endif
+}
+
+/* Clamp every component of a to the scalar range [mn, mx]. */
+ccl_device_inline int3 clamp(const int3& a, int mn, int mx)
+{
+#ifdef __KERNEL_SSE__
+	return min(max(a, make_int3(mn)), make_int3(mx));
+#else
+	return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx));
+#endif
+}
+
+/* Clamp with a per-component lower bound mn and a shared scalar upper
+ * bound mx.
+ *
+ * mn is only read, so it is taken by const reference; this also lets callers
+ * pass const values and temporaries. */
+ccl_device_inline int3 clamp(const int3& a, const int3& mn, int mx)
+{
+#ifdef __KERNEL_SSE__
+	return min(max(a, mn), make_int3(mx));
+#else
+	return make_int3(clamp(a.x, mn.x, mx),
+	                 clamp(a.y, mn.y, mx),
+	                 clamp(a.z, mn.z, mx));
+#endif
+}
+
+/* True when x, y and z are all equal. */
+ccl_device_inline bool operator==(const int3 &a, const int3 &b)
+{
+	return a.x == b.x && a.y == b.y && a.z == b.z;
+}
+
+/* Logical negation of operator== for int3. */
+ccl_device_inline bool operator!=(const int3 &a, const int3 &b)
+{
+	return !(a == b);
+}
+
+/* True only when every component of a is strictly smaller than the matching
+ * component of b. This is not a lexicographic ordering: both a < b and b < a
+ * can be false for unequal vectors. */
+ccl_device_inline bool operator<(const int3 &a, const int3 &b)
+{
+	return a.x < b.x && a.y < b.y && a.z < b.z;
+}
+#endif  /* !__KERNEL_OPENCL__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_INT3_H__ */
diff --git a/intern/cycles/util/util_math_int4.h b/intern/cycles/util/util_math_int4.h
new file mode 100644
index 00000000000..79a8c0841e7
--- /dev/null
+++ b/intern/cycles/util/util_math_int4.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_INT4_H__
+#define __UTIL_MATH_INT4_H__
+
+#ifndef __UTIL_MATH_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+#ifndef __KERNEL_GPU__
+ccl_device_inline int4 operator+(const int4& a, const int4& b);
+ccl_device_inline int4 operator+=(int4& a, const int4& b);
+ccl_device_inline int4 operator>>(const int4& a, int i);
+ccl_device_inline int4 min(int4 a, int4 b);
+ccl_device_inline int4 max(int4 a, int4 b);
+ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx);
+ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b);
+#endif  /* __KERNEL_GPU__ */
+
+/*******************************************************************************
+ * Definition.
+ */
+
+#ifndef __KERNEL_GPU__
+/* Component-wise sum. */
+ccl_device_inline int4 operator+(const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+	return int4(_mm_add_epi32(a.m128, b.m128));
+#else
+	return make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+#endif
+}
+
+/* In-place addition; returns the updated value by copy. */
+ccl_device_inline int4 operator+=(int4& a, const int4& b)
+{
+	a = a + b;
+	return a;
+}
+
+/* Shift every component right by i bits. The SSE path uses the arithmetic
+ * (sign-extending) packed shift. */
+ccl_device_inline int4 operator>>(const int4& a, int i)
+{
+#ifdef __KERNEL_SSE__
+	return int4(_mm_srai_epi32(a.m128, i));
+#else
+	return make_int4(a.x >> i, a.y >> i, a.z >> i, a.w >> i);
+#endif
+}
+
+/* Component-wise minimum. The packed integer minimum is only used when
+ * SSE4.1 is available; otherwise each component is handled separately. */
+ccl_device_inline int4 min(int4 a, int4 b)
+{
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+	return int4(_mm_min_epi32(a.m128, b.m128));
+#else
+	return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+#endif
+}
+
+/* Component-wise maximum. The packed integer maximum is only used when
+ * SSE4.1 is available; otherwise each component is handled separately. */
+ccl_device_inline int4 max(int4 a, int4 b)
+{
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+	return int4(_mm_max_epi32(a.m128, b.m128));
+#else
+	return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+#endif
+}
+
+/* Component-wise clamp: raise a to at least mn, then cap it at mx. */
+ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx)
+{
+	const int4 raised = max(a, mn);
+	return min(raised, mx);
+}
+
+/* Per-component choice between a and b: a where the mask component is
+ * non-zero, b where it is zero. Both code paths follow this rule.
+ *
+ * The SSE path works purely on integer lanes. Converting the mask with
+ * _mm_cvtepi32_ps would be a numeric int->float conversion (an all-ones lane,
+ * i.e. -1, becomes -1.0f = 0xBF800000), which is not a usable bit mask and
+ * would blend bits of a and b together. */
+ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+	/* All-ones in every lane whose mask component is zero. */
+	const __m128i take_b = _mm_cmpeq_epi32(mask.m128, _mm_setzero_si128());
+	/* (~take_b & a) | (take_b & b) */
+	return int4(_mm_or_si128(_mm_andnot_si128(take_b, a.m128),
+	                         _mm_and_si128(take_b, b.m128)));
+#else
+	return make_int4((mask.x)? a.x: b.x,
+	                 (mask.y)? a.y: b.y,
+	                 (mask.z)? a.z: b.z,
+	                 (mask.w)? a.w: b.w);
+#endif
+}
+
+/* Build an int4 from the four consecutive ints starting at v.
+ * The SSE path uses the unaligned load, so v needs no special alignment. */
+ccl_device_inline int4 load_int4(const int *v)
+{
+#ifdef __KERNEL_SSE__
+	return int4(_mm_loadu_si128((__m128i*)v));
+#else
+	return make_int4(v[0], v[1], v[2], v[3]);
+#endif
+}
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_INT4_H__ */
diff --git a/intern/cycles/util/util_math_intersect.h b/intern/cycles/util/util_math_intersect.h
new file mode 100644
index 00000000000..61ddcc38f50
--- /dev/null
+++ b/intern/cycles/util/util_math_intersect.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_INTERSECT_H__
+#define __UTIL_MATH_INTERSECT_H__
+
+CCL_NAMESPACE_BEGIN
+
+/* Ray Intersection */
+
+ccl_device bool ray_sphere_intersect(
+        float3 ray_P, float3 ray_D, float ray_t,
+        float3 sphere_P, float sphere_radius,
+        float3 *isect_P, float *isect_t)
+{
+	const float3 d = sphere_P - ray_P;
+	const float radiussq = sphere_radius*sphere_radius;
+	const float tsq = dot(d, d);
+
+	if(tsq > radiussq) {
+		/* Ray origin outside sphere. */
+		const float tp = dot(d, ray_D);
+		if(tp < 0.0f) {
+			/* Ray  points away from sphere. */
+			return false;
+		}
+		const float dsq = tsq - tp*tp;  /* pythagoras */
+		if(dsq > radiussq)  {
+			/* Closest point on ray outside sphere. */
+			return false;
+		}
+		const float t = tp - sqrtf(radiussq - dsq);  /* pythagoras */
+		if(t < ray_t) {
+			*isect_t = t;
+			*isect_P = ray_P + ray_D*t;
+			return true;
+		}
+	}
+	return false;
+}
+
+ccl_device bool ray_aligned_disk_intersect(
+        float3 ray_P, float3 ray_D, float ray_t,
+        float3 disk_P, float disk_radius,
+        float3 *isect_P, float *isect_t)
+{
+	/* Aligned disk normal. */
+	float disk_t;
+	const float3 disk_N = normalize_len(ray_P - disk_P, &disk_t);
+	const float div = dot(ray_D, disk_N);
+	if(UNLIKELY(div == 0.0f)) {
+		return false;
+	}
+	/* Compute t to intersection point. */
+	const float t = -disk_t/div;
+	if(t < 0.0f || t > ray_t) {
+		return false;
+	}
+	/* Test if within radius. */
+	float3 P = ray_P + ray_D*t;
+	if(len_squared(P - disk_P) > disk_radius*disk_radius) {
+		return false;
+	}
+	*isect_P = P;
+	*isect_t = t;
+	return true;
+}
+
+ccl_device_forceinline bool ray_triangle_intersect(
+        float3 ray_P, float3 ray_dir, float ray_t,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+        const ssef *ssef_verts,
+#else
+        const float3 tri_a, const float3 tri_b, const float3 tri_c,
+#endif
+        float *isect_u, float *isect_v, float *isect_t)
+{
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	typedef ssef float3;
+	const float3 tri_a(ssef_verts[0]);
+	const float3 tri_b(ssef_verts[1]);
+	const float3 tri_c(ssef_verts[2]);
+	const float3 P(ray_P);
+	const float3 dir(ray_dir);
+#else
+#  define dot3(a, b) dot(a, b)
+	const float3 P = ray_P;
+	const float3 dir = ray_dir;
+#endif
+
+	/* Calculate vertices relative to ray origin. */
+	const float3 v0 = tri_c - P;
+	const float3 v1 = tri_a - P;
+	const float3 v2 = tri_b - P;
+
+	/* Calculate triangle edges. */
+	const float3 e0 = v2 - v0;
+	const float3 e1 = v0 - v1;
+	const float3 e2 = v1 - v2;
+
+	/* Perform edge tests. */
+#if defined(__KERNEL_SSE2__)  && defined (__KERNEL_SSE__)
+	const float3 crossU = cross(v2 + v0, e0);
+	const float3 crossV = cross(v0 + v1, e1);
+	const float3 crossW = cross(v1 + v2, e2);
+
+	ssef crossX(crossU);
+	ssef crossY(crossV);
+	ssef crossZ(crossW);
+	ssef zero = _mm_setzero_ps();
+	_MM_TRANSPOSE4_PS(crossX, crossY, crossZ, zero);
+
+	const ssef dirX(ray_dir.x);
+	const ssef dirY(ray_dir.y);
+	const ssef dirZ(ray_dir.z);
+
+	ssef UVWW = madd(crossX, dirX, madd(crossY, dirY, crossZ * dirZ));
+#else  /* __KERNEL_SSE2__ */
+	const float U = dot(cross(v2 + v0, e0), ray_dir);
+	const float V = dot(cross(v0 + v1, e1), ray_dir);
+	const float W = dot(cross(v1 + v2, e2), ray_dir);
+#endif  /* __KERNEL_SSE2__ */
+
+#if defined(__KERNEL_SSE2__)  && defined (__KERNEL_SSE__)
+	int uvw_sign = movemask(UVWW) & 0x7;
+	if (uvw_sign != 0)
+	{
+		if (uvw_sign != 0x7)
+			return false;
+	}
+#else
+	const float minUVW = min(U, min(V, W));
+	const float maxUVW = max(U, max(V, W));
+
+	if(minUVW < 0.0f && maxUVW > 0.0f) {
+		return false;
+	}
+#endif
+
+
+	/* Calculate geometry normal and denominator. */
+	const float3 Ng1 = cross(e1, e0);
+	//const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0);
+	const float3 Ng = Ng1 + Ng1;
+	const float den = dot3(Ng, dir);
+	/* Avoid division by 0. */
+	if(UNLIKELY(den == 0.0f)) {
+		return false;
+	}
+
+	/* Perform depth test. */
+	const float T = dot3(v0, Ng);
+	const int sign_den = (__float_as_int(den) & 0x80000000);
+	const float sign_T = xor_signmask(T, sign_den);
+	if((sign_T < 0.0f) ||
+	   (sign_T > ray_t * xor_signmask(den, sign_den)))
+	{
+		return false;
+	}
+
+	const float inv_den = 1.0f / den;
+#if defined(__KERNEL_SSE2__)  && defined (__KERNEL_SSE__)
+	UVWW *= inv_den;
+	_mm_store_ss(isect_u, UVWW);
+	_mm_store_ss(isect_v, shuffle<1,1,3,3>(UVWW));
+#else
+	*isect_u = U * inv_den;
+	*isect_v = V * inv_den;
+#endif
+	*isect_t = T * inv_den;
+	return true;
+
+#undef dot3
+}
+
+ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D,
+                                   float ray_mint, float ray_maxt,
+                                   float3 quad_P,
+                                   float3 quad_u, float3 quad_v, float3 quad_n,
+                                   float3 *isect_P, float *isect_t,
+                                   float *isect_u, float *isect_v)
+{
+	/* Ray/plane test; negated range checks also reject NaN (0/0 for an in-plane ray). */
+	float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n);
+	if(!(t >= ray_mint && t <= ray_maxt)) {
+		return false;
+	}
+	const float3 hit = ray_P + t*ray_D;
+	const float3 inplane = hit - quad_P;
+	const float u = dot(inplane, quad_u) / dot(quad_u, quad_u) + 0.5f;
+	if(!(u >= 0.0f && u <= 1.0f)) {
+		return false;
+	}
+	const float v = dot(inplane, quad_v) / dot(quad_v, quad_v) + 0.5f;
+	if(!(v >= 0.0f && v <= 1.0f)) {
+		return false;
+	}
+	/* Store the result; every output pointer is optional and may be NULL. */
+	/* TODO(sergey): Check whether we can avoid some checks here. */
+	if(isect_P != NULL) *isect_P = hit;
+	if(isect_t != NULL) *isect_t = t;
+	if(isect_u != NULL) *isect_u = u;
+	if(isect_v != NULL) *isect_v = v;
+	return true;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_INTERSECT_H__ */
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
new file mode 100644
index 00000000000..382dad64ea5
--- /dev/null
+++ b/intern/cycles/util/util_math_matrix.h
@@ -0,0 +1,407 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_MATRIX_H__
+#define __UTIL_MATH_MATRIX_H__
+
+CCL_NAMESPACE_BEGIN
+
+#define MAT(A, size, row, col) A[(row)*(size)+(col)]
+
+/* Variants that use a constant stride on GPUs. */
+#ifdef __KERNEL_GPU__
+#  define MATS(A, n, r, c, s) A[((r)*(n)+(c))*(s)]
+/* Element access when only the lower-triangular elements are stored. */
+#  define MATHS(A, r, c, s) A[((r)*((r)+1)/2+(c))*(s)]
+#  define VECS(V, i, s) V[(i)*(s)]
+#else
+#  define MATS(A, n, r, c, s) MAT(A, n, r, c)
+#  define MATHS(A, r, c, s) A[(r)*((r)+1)/2+(c)]
+#  define VECS(V, i, s) V[i]
+#endif
+
+/* Zeroing helpers. */
+
+ccl_device_inline void math_vector_zero(float *v, int n)
+{
+	for(int i = 0; i < n; i++) {
+		v[i] = 0.0f;
+	}
+}
+
+ccl_device_inline void math_matrix_zero(float *A, int n)
+{
+	for(int row = 0; row < n; row++) {
+		for(int col = 0; col <= row; col++) {
+			MAT(A, n, row, col) = 0.0f;
+		}
+	}
+}
+
+/* Elementary vector operations. */
+
+ccl_device_inline void math_vector_add(float *a, const float *ccl_restrict b, int n)
+{
+	for(int i = 0; i < n; i++) {
+		a[i] += b[i];
+	}
+}
+
+ccl_device_inline void math_vector_mul(float *a, const float *ccl_restrict b, int n)
+{
+	for(int i = 0; i < n; i++) {
+		a[i] *= b[i];
+	}
+}
+
+ccl_device_inline void math_vector_mul_strided(ccl_global float *a, const float *ccl_restrict b, int astride, int n)
+{
+	for(int i = 0; i < n; i++) {
+		a[i*astride] *= b[i];
+	}
+}
+
+ccl_device_inline void math_vector_scale(float *a, float b, int n)
+{
+	for(int i = 0; i < n; i++) {
+		a[i] *= b;
+	}
+}
+
+ccl_device_inline void math_vector_max(float *a, const float *ccl_restrict b, int n)
+{
+	for(int i = 0; i < n; i++) {
+		a[i] = max(a[i], b[i]);
+	}
+}
+
+ccl_device_inline void math_vec3_add(float3 *v, int n, float *x, float3 w)
+{
+	for(int i = 0; i < n; i++) {
+		v[i] += w*x[i];
+	}
+}
+
+ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float *x, float3 w, int stride)
+{
+	for(int i = 0; i < n; i++) {
+		ccl_global float *elem = (ccl_global float*) (v + i*stride);
+		atomic_add_and_fetch_float(elem+0, w.x*x[i]);
+		atomic_add_and_fetch_float(elem+1, w.y*x[i]);
+		atomic_add_and_fetch_float(elem+2, w.z*x[i]);
+	}
+}
+
+/* Elementary matrix operations.
+ * Note: TriMatrix refers to a square matrix that is symmetric, and therefore its upper-triangular part isn't stored. */
+
+ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A, int n, float val, int stride)
+{
+	for(int row = 0; row < n; row++) {
+		MATHS(A, row, row, stride) += val;
+	}
+}
+
+/* Add Gramian matrix of v to A.
+ * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
+ccl_device_inline void math_matrix_add_gramian(float *A,
+                                                  int n,
+                                                  const float *ccl_restrict v,
+                                                  float weight)
+{
+	for(int row = 0; row < n; row++) {
+		for(int col = 0; col <= row; col++) {
+			MAT(A, n, row, col) += v[row]*v[col]*weight;
+		}
+	}
+}
+
+/* Add Gramian matrix of v to A.
+ * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
+ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A,
+                                                          int n,
+                                                          const float *ccl_restrict v,
+                                                          float weight,
+                                                          int stride)
+{
+	for(int row = 0; row < n; row++) {
+		for(int col = 0; col <= row; col++) {
+			atomic_add_and_fetch_float(&MATHS(A, row, col, stride), v[row]*v[col]*weight);
+		}
+	}
+}
+
+/* Transpose matrix A inplace. */
+ccl_device_inline void math_matrix_transpose(ccl_global float *A, int n, int stride)
+{
+	for(int i = 0; i < n; i++) {
+		for(int j = 0; j < i; j++) {
+			float temp = MATS(A, n, i, j, stride);
+			MATS(A, n, i, j, stride) = MATS(A, n, j, i, stride);
+			MATS(A, n, j, i, stride) = temp;
+		}
+	}
+}
+
+/* Solvers for matrix problems */
+
+/* In-place Cholesky-Banachiewicz decomposition of the square, positive-definite matrix A
+ * into a lower triangular matrix L so that A = L*L^T. A is being overwritten by L.
+ * Also, only the lower triangular part of A is ever accessed. */
+ccl_device void math_trimatrix_cholesky(ccl_global float *A, int n, int stride)
+{
+	for(int row = 0; row < n; row++) {
+		for(int col = 0; col <= row; col++) {
+			float sum_col = MATHS(A, row, col, stride);
+			for(int k = 0; k < col; k++) {
+				sum_col -= MATHS(A, row, k, stride) * MATHS(A, col, k, stride);
+			}
+			if(row == col) {
+				sum_col = sqrtf(max(sum_col, 0.0f));
+			}
+			else {
+				sum_col /= MATHS(A, col, col, stride);
+			}
+			MATHS(A, row, col, stride) = sum_col;
+		}
+	}
+}
+
+/* Solve A*S=y for S given A and y, where A is symmetrical positive-semidefinite and both inputs are destroyed in the process.
+ *
+ * We can apply Cholesky decomposition to find a lower triangular L so that L*Lt = A.
+ * With that we get (L*Lt)*S = L*(Lt*S) = L*b = y, defining b as Lt*S.
+ * Since L is lower triangular, finding b is relatively easy since y is known.
+ * Then, the remaining problem is Lt*S = b, which again can be solved easily.
+ *
+ * This is useful for solving the normal equation S=inv(Xt*W*X)*Xt*W*y, since Xt*W*X is
+ * symmetrical positive-semidefinite by construction, so we can just use this function with A=Xt*W*X and y=Xt*W*y. */
+ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A, ccl_global float3 *y, int n, int stride)
+{
+	/* Since the first entry of the design row is always 1, the upper-left element of XtWX is a good
+	 * heuristic for the amount of pixels considered (with weighting), therefore the amount of correction
+	 * is scaled based on it. */
+	math_trimatrix_add_diagonal(A, n, 3e-7f*A[0], stride); /* Improve the numerical stability. */
+	math_trimatrix_cholesky(A, n, stride); /* Replace A with L so that L*Lt = A. */
+
+	/* Use forward substitution to solve L*b = y, replacing y by b. */
+	for(int row = 0; row < n; row++) {
+		float3 sum = VECS(y, row, stride);
+		for(int col = 0; col < row; col++)
+			sum -= MATHS(A, row, col, stride) * VECS(y, col, stride);
+		VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
+	}
+
+	/* Use backward substitution to solve Lt*S = b, replacing b by S. */
+	for(int row = n-1; row >= 0; row--) {
+		float3 sum = VECS(y, row, stride);
+		for(int col = row+1; col < n; col++)
+			sum -= MATHS(A, col, row, stride) * VECS(y, col, stride);
+		VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
+	}
+}
+
+/* Perform the Jacobi Eigenvalue Method on matrix A.
+ * A is assumed to be a symmetrical matrix, therefore only the lower-triangular part is ever accessed.
+ * The algorithm overwrites the contents of A.
+ *
+ * After returning, A will be overwritten with D, which is (almost) diagonal,
+ * and V will contain the eigenvectors of the original A in its rows (!),
+ * so that A = V^T*D*V. Therefore, the diagonal elements of D are the (sorted) eigenvalues of A.
+ */
+ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float *V, int n, int v_stride)
+{
+	const float singular_epsilon = 1e-9f;
+
+	for(int row = 0; row < n; row++) {
+		for(int col = 0; col < n; col++) {
+			MATS(V, n, row, col, v_stride) = (col == row) ? 1.0f : 0.0f;
+		}
+	}
+
+	for(int sweep = 0; sweep < 8; sweep++) {
+		float off_diagonal = 0.0f;
+		for(int row = 1; row < n; row++) {
+			for(int col = 0; col < row; col++) {
+				off_diagonal += fabsf(MAT(A, n, row, col));
+			}
+		}
+		if(off_diagonal < 1e-7f) {
+			/* The matrix has nearly reached diagonal form.
+			 * Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */
+			break;
+		}
+
+		/* Set the threshold for the small element rotation skip in the first three sweeps:
+		 * Skip all elements that are less than a tenth of the average off-diagonal element. */
+		float threshold = 0.2f*off_diagonal / (n*n);
+
+		for(int row = 1; row < n; row++) {
+			for(int col = 0; col < row; col++) {
+				/* Perform a Jacobi rotation on this element that reduces it to zero. */
+				float element = MAT(A, n, row, col);
+				float abs_element = fabsf(element);
+
+				/* If we're in a later sweep and the element already is very small, just set it to zero and skip the rotation. */
+				if(sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) {
+					MAT(A, n, row, col) = 0.0f;
+					continue;
+				}
+
+				if(element == 0.0f) {
+					continue;
+				}
+
+				/* If we're in one of the first sweeps and the element is smaller than the threshold, skip it. */
+				if(sweep < 3 && (abs_element < threshold)) {
+					continue;
+				}
+
+				/* Determine rotation: The rotation is characterized by its angle phi - or, in the actual implementation, sin(phi) and cos(phi).
+				 * To find those, we first compute their ratio - that might be unstable if the angle approaches 90°, so there's a fallback for that case.
+				 * Then, we compute sin(phi) and cos(phi) themselves. */
+				float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col);
+				float ratio;
+				if(abs_element > singular_epsilon*fabsf(singular_diff)) {
+					float cot_2phi = 0.5f*singular_diff / element;
+					ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi*cot_2phi));
+					if(cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */
+				}
+				else {
+					ratio = element / singular_diff;
+				}
+
+				float c = 1.0f / sqrtf(1.0f + ratio*ratio);
+				float s = ratio*c;
+				/* To improve numerical stability by avoiding cancellation, the update equations are reformulated to use sin(phi) and tan(phi/2) instead. */
+				float tan_phi_2 = s / (1.0f + c);
+
+				/* Update the singular values in the diagonal. */
+				float singular_delta = ratio*element;
+				MAT(A, n, row, row) += singular_delta;
+				MAT(A, n, col, col) -= singular_delta;
+
+				/* Set the element itself to zero. */
+				MAT(A, n, row, col) = 0.0f;
+
+				/* Perform the actual rotations on the matrices. */
+#define ROT(M, r1, c1, r2, c2, stride)                                   \
+				{                                                        \
+					float M1 = MATS(M, n, r1, c1, stride);               \
+					float M2 = MATS(M, n, r2, c2, stride);               \
+					MATS(M, n, r1, c1, stride) -= s*(M2 + tan_phi_2*M1); \
+					MATS(M, n, r2, c2, stride) += s*(M1 - tan_phi_2*M2); \
+				}
+
+				/* Split into three parts to ensure correct accesses since we only store the lower-triangular part of A. */
+				for(int i = 0    ; i < col; i++) ROT(A, col, i, row, i, 1);
+				for(int i = col+1; i < row; i++) ROT(A, i, col, row, i, 1);
+				for(int i = row+1; i < n  ; i++) ROT(A, i, col, i, row, 1);
+
+				for(int i = 0    ; i < n  ; i++) ROT(V, col, i, row, i, v_stride);
+#undef ROT
+			}
+		}
+	}
+
+	/* Sort eigenvalues and the associated eigenvectors. */
+	for(int i = 0; i < n - 1; i++) {
+		float v = MAT(A, n, i, i);
+		int k = i;
+		for(int j = i; j < n; j++) {
+			if(MAT(A, n, j, j) >= v) {
+				v = MAT(A, n, j, j);
+				k = j;
+			}
+		}
+		if(k != i) {
+			/* Swap eigenvalues. */
+			MAT(A, n, k, k) = MAT(A, n, i, i);
+			MAT(A, n, i, i) = v;
+			/* Swap eigenvectors. */
+			for(int j = 0; j < n; j++) {
+				float v = MATS(V, n, i, j, v_stride);
+				MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride);
+				MATS(V, n, k, j, v_stride) = v;
+			}
+		}
+	}
+}
+
+#ifdef __KERNEL_SSE3__
+ccl_device_inline void math_vector_zero_sse(float4 *A, int n)
+{
+	for(int i = 0; i < n; i++) {
+		A[i] = make_float4(0.0f);
+	}
+}
+
+ccl_device_inline void math_matrix_zero_sse(float4 *A, int n)
+{
+	for(int row = 0; row < n; row++) {
+		for(int col = 0; col <= row; col++) {
+			MAT(A, n, row, col) = make_float4(0.0f);
+		}
+	}
+}
+
+/* Add Gramian matrix of v to A.
+ * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */
+ccl_device_inline void math_matrix_add_gramian_sse(float4 *A, int n, const float4 *ccl_restrict v, float4 weight)
+{
+	for(int row = 0; row < n; row++) {
+		for(int col = 0; col <= row; col++) {
+			MAT(A, n, row, col) = MAT(A, n, row, col) + v[row] * v[col] * weight;
+		}
+	}
+}
+
+ccl_device_inline void math_vector_add_sse(float4 *V, int n, const float4 *ccl_restrict a)
+{
+	for(int i = 0; i < n; i++) {
+		V[i] += a[i];
+	}
+}
+
+ccl_device_inline void math_vector_mul_sse(float4 *V, int n, const float4 *ccl_restrict a)
+{
+	for(int i = 0; i < n; i++) {
+		V[i] *= a[i];
+	}
+}
+
+ccl_device_inline void math_vector_max_sse(float4 *a, const float4 *ccl_restrict b, int n)
+{
+	for(int i = 0; i < n; i++) {
+		a[i] = max(a[i], b[i]);
+	}
+}
+
+ccl_device_inline void math_matrix_hsum(float *A, int n, const float4 *ccl_restrict B)
+{
+	for(int row = 0; row < n; row++) {
+		for(int col = 0; col <= row; col++) {
+			MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col))[0];
+		}
+	}
+}
+#endif
+
+#undef MAT
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_MATH_MATRIX_H__ */
diff --git a/intern/cycles/util/util_md5.cpp b/intern/cycles/util/util_md5.cpp
index 19168135f01..749760d84f0 100644
--- a/intern/cycles/util/util_md5.cpp
+++ b/intern/cycles/util/util_md5.cpp
@@ -310,6 +310,13 @@ void MD5Hash::append(const uint8_t *data, int nbytes)
 		memcpy(buf, p, left);
 }
 
+void MD5Hash::append(const string& str)
+{
+	if(str.size()) {
+		append((const uint8_t*)str.c_str(), str.size());
+	}
+}
+
 bool MD5Hash::append_file(const string& filepath)
 {
 	FILE *f = path_fopen(filepath, "rb");
diff --git a/intern/cycles/util/util_md5.h b/intern/cycles/util/util_md5.h
index d0af9fdb004..b043b591e67 100644
--- a/intern/cycles/util/util_md5.h
+++ b/intern/cycles/util/util_md5.h
@@ -30,8 +30,8 @@
 #ifndef __UTIL_MD5_H__
 #define __UTIL_MD5_H__
 
-#include "util_string.h"
-#include "util_types.h"
+#include "util/util_string.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -41,6 +41,7 @@ public:
 	~MD5Hash();
 
 	void append(const uint8_t *data, int size);
+	void append(const string& str);
 	bool append_file(const string& filepath);
 	string get_hex();
 
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index adc141a7b28..3c5785c4807 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -19,22 +19,15 @@
 
 #ifndef __KERNEL_GPU__
 
-/* quiet unused define warnings */
-#if defined(__KERNEL_SSE2__)  || \
-	defined(__KERNEL_SSE3__)  || \
-	defined(__KERNEL_SSSE3__) || \
-	defined(__KERNEL_SSE41__) || \
-	defined(__KERNEL_AVX__)   || \
-	defined(__KERNEL_AVX2__)
-	/* do nothing */
-#endif
-
 /* x86
  *
  * Compile a regular, SSE2 and SSE3 kernel. */
 
 #if defined(i386) || defined(_M_IX86)
 
+/* We require minimum SSE2 support on x86, so auto enable. */
+#  define __KERNEL_SSE2__
+
 #  ifdef WITH_KERNEL_SSE2
 #    define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 #  endif
@@ -73,48 +66,6 @@
 
 #endif  /* defined(__x86_64__) || defined(_M_X64) */
 
-/* SSE Experiment
- *
- * This is disabled code for an experiment to use SSE types globally for types
- * such as float3 and float4. Currently this gives an overall slowdown. */
-
-#if 0
-#  define __KERNEL_SSE__
-#  ifndef __KERNEL_SSE2__
-#    define __KERNEL_SSE2__
-#  endif
-#  ifndef __KERNEL_SSE3__
-#    define __KERNEL_SSE3__
-#  endif
-#  ifndef __KERNEL_SSSE3__
-#    define __KERNEL_SSSE3__
-#  endif
-#  ifndef __KERNEL_SSE4__
-#    define __KERNEL_SSE4__
-#  endif
-#endif
-
-/* SSE Intrinsics includes
- *
- * We assume __KERNEL_SSEX__ flags to have been defined at this point */
-
-/* SSE intrinsics headers */
-#ifndef FREE_WINDOWS64
-
-#ifdef _MSC_VER
-#  include <intrin.h>
-#elif (defined(__x86_64__) || defined(__i386__))
-#  include <x86intrin.h>
-#endif
-
-#else
-
-/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
- * Since we can't avoid including <windows.h>, better only include that */
-#include "util_windows.h"
-
-#endif
-
 #endif
 
 #endif /* __UTIL_OPTIMIZATION_H__ */
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index 5df262fcbbb..0e0371928ab 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -14,10 +14,9 @@
  * limitations under the License.
  */
 
-#include "util_debug.h"
-#include "util_md5.h"
-#include "util_path.h"
-#include "util_string.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_string.h"
 
 #include <OpenImageIO/filesystem.h>
 #include <OpenImageIO/strutil.h>
@@ -45,7 +44,8 @@ OIIO_NAMESPACE_USING
 #  include <shlwapi.h>
 #endif
 
-#include "util_windows.h"
+#include "util/util_map.h"
+#include "util/util_windows.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -320,17 +320,18 @@ static char *path_specials(const string& sub)
 {
 	static bool env_init = false;
 	static char *env_shader_path;
-	static char *env_kernel_path;
+	static char *env_source_path;
 	if(!env_init) {
 		env_shader_path = getenv("CYCLES_SHADER_PATH");
-		env_kernel_path = getenv("CYCLES_KERNEL_PATH");
+		/* NOTE: It is KERNEL in env variable for compatibility reasons. */
+		env_source_path = getenv("CYCLES_KERNEL_PATH");
 		env_init = true;
 	}
 	if(env_shader_path != NULL && sub == "shader") {
 		return env_shader_path;
 	}
-	else if(env_shader_path != NULL && sub == "kernel") {
-		return env_kernel_path;
+	else if(env_source_path != NULL && sub == "source") {
+		return env_source_path;
 	}
 	return NULL;
 }
@@ -767,64 +768,195 @@ bool path_remove(const string& path)
 	return remove(path.c_str()) == 0;
 }
 
-static string line_directive(const string& path, int line)
-{
-	string escaped_path = path;
-	string_replace(escaped_path, "\"", "\\\"");
-	string_replace(escaped_path, "\'", "\\\'");
-	string_replace(escaped_path, "\?", "\\\?");
-	string_replace(escaped_path, "\\", "\\\\");
+struct SourceReplaceState {
+	typedef map<string, string> ProcessedMapping;
+	/* Base directory for all relative include headers. */
+	string base;
+	/* Result of processed files. */
+	ProcessedMapping processed_files;
+	/* Set of files which are considered "precompiled" and which are replaced
+	 * with an empty string on a subsequent occurrence in include statement.
+	 */
+	set<string> precompiled_headers;
+};
+
+static string path_source_replace_includes_recursive(
+        const string& source,
+        const string& source_filepath,
+        SourceReplaceState *state);
+
+static string line_directive(const SourceReplaceState& state,
+                             const string& path,
+                             const int line)
+{
+	string unescaped_path = path;
+	/* First we make path relative. */
+	if(string_startswith(unescaped_path, state.base.c_str())) {
+		const string base_file = path_filename(state.base);
+		const size_t base_len = state.base.length();
+		unescaped_path = base_file +
+		        unescaped_path.substr(base_len,
+		                            unescaped_path.length() - base_len);
+	}
+	/* Second, we replace all unsafe characters. */
+	const size_t length = unescaped_path.length();
+	string escaped_path = "";
+	for(size_t i = 0; i < length; ++i) {
+		const char ch = unescaped_path[i];
+		if(strchr("\"\'\?\\", ch) != NULL) {
+			escaped_path += "\\";
+		}
+		escaped_path += ch;
+	}
+	/* TODO(sergey): Check whether using std::to_string combined with several
+	 * concatenation operations is any faster.
+	 */
 	return string_printf("#line %d \"%s\"", line, escaped_path.c_str());
 }
 
+static string path_source_handle_preprocessor(
+        const string& preprocessor_line,
+        const string& source_filepath,
+        const size_t line_number,
+        SourceReplaceState *state)
+{
+	string result = preprocessor_line;
+	string token = string_strip(
+	        preprocessor_line.substr(1, preprocessor_line.size() - 1));
+	if(string_startswith(token, "include")) {
+		token = string_strip(token.substr(7, token.size() - 7));
+		if(token[0] == '"') {
+			const size_t n_start = 1;
+			const size_t n_end = token.find("\"", n_start);
+			const string filename = token.substr(n_start, n_end - n_start);
+			const bool is_precompiled = string_endswith(token, "// PRECOMPILED");
+			string filepath = path_join(state->base, filename);
+			if(!path_exists(filepath)) {
+				filepath = path_join(path_dirname(source_filepath),
+				                     filename);
+			}
+			if(is_precompiled) {
+				state->precompiled_headers.insert(filepath);
+			}
+			string text;
+			if(path_read_text(filepath, text)) {
+				text = path_source_replace_includes_recursive(
+				        text, filepath, state);
+				/* Use line directives for better error messages. */
+				result = line_directive(*state, filepath, 1) + "\n"
+				     + text + "\n"
+				     + line_directive(*state, source_filepath, line_number + 1);
+			}
+		}
+	}
+	return result;
+}
 
-string path_source_replace_includes(const string& source,
-                                    const string& path,
-                                    const string& source_filename)
+/* Our own little c preprocessor that replaces #includes with the file
+ * contents, to work around issue of OpenCL drivers not supporting
+ * include paths with spaces in them.
+ */
+static string path_source_replace_includes_recursive(
+        const string& source,
+        const string& source_filepath,
+        SourceReplaceState *state)
 {
-	/* Our own little c preprocessor that replaces #includes with the file
-	 * contents, to work around issue of opencl drivers not supporting
-	 * include paths with spaces in them.
+	/* Try to re-use processed file without spending time on replacing all
+	 * include directives again.
 	 */
-
+	SourceReplaceState::ProcessedMapping::iterator replaced_file =
+	        state->processed_files.find(source_filepath);
+	if(replaced_file != state->processed_files.end()) {
+		if(state->precompiled_headers.find(source_filepath) !=
+		        state->precompiled_headers.end()) {
+			return "";
+		}
+		return replaced_file->second;
+	}
+	/* Perform full file processing. */
 	string result = "";
-	vector<string> lines;
-	string_split(lines, source, "\n", false);
-
-	for(size_t i = 0; i < lines.size(); ++i) {
-		string line = lines[i];
-		if(line[0] == '#') {
-			string token = string_strip(line.substr(1, line.size() - 1));
-			if(string_startswith(token, "include")) {
-				token = string_strip(token.substr(7, token.size() - 7));
-				if(token[0] == '"') {
-					size_t n_start = 1;
-					size_t n_end = token.find("\"", n_start);
-					string filename = token.substr(n_start, n_end - n_start);
-					string text, filepath = path_join(path, filename);
-					if(path_read_text(filepath, text)) {
-						/* Replace include directories with both current path
-						 * and path extracted from the include file.
-						 * Not totally robust, but works fine for Cycles kernel
-						 * and avoids having list of include directories.x
-						 */
-						text = path_source_replace_includes(
-						        text, path_dirname(filepath), filename);
-						text = path_source_replace_includes(text, path, filename);
-						/* Use line directives for better error messages. */
-						line = line_directive(filepath, 1)
-						     + token.replace(0, n_end + 1, "\n" + text + "\n")
-						     + line_directive(path_join(path, source_filename), i);
-					}
-				}
+	const size_t source_length = source.length();
+	size_t index = 0;
+	/* Information about where we are in the source. */
+	size_t line_number = 0, column_number = 1;
+	/* Currently gathered non-preprocessor token.
+	 * Store as start/length rather than token itself to avoid overhead of
+	 * memory re-allocations on each character concatenation.
+	 */
+	size_t token_start = 0, token_length = 0;
+	/* Denotes whether we're inside of preprocessor line, together with
+	 * preprocessor line itself.
+	 *
+	 * TODO(sergey): Investigate whether using token start/end position
+	 * gives measurable speedup.
+	 */
+	bool inside_preprocessor = false;
+	string preprocessor_line = "";
+	/* Actual loop over the whole source. */
+	while(index < source_length) {
+		const char ch = source[index];
+		if(ch == '\n') {
+			if(inside_preprocessor) {
+				result += path_source_handle_preprocessor(preprocessor_line,
+				                                          source_filepath,
+				                                          line_number,
+				                                          state);
+				/* Start gathering next part of the token. */
+				token_start = index;
+				token_length = 0;
+			}
+			inside_preprocessor = false;
+			preprocessor_line = "";
+			column_number = 0;
+			++line_number;
+		}
+		else if(ch == '#' && column_number == 1 && !inside_preprocessor) {
+			/* Append all possible non-preprocessor token to the result. */
+			if(token_length != 0) {
+				result.append(source, token_start, token_length);
+				token_start = index;
+				token_length = 0;
 			}
+			inside_preprocessor = true;
+		}
+		if(inside_preprocessor) {
+			preprocessor_line += ch;
 		}
-		result += line + "\n";
+		else {
+			++token_length;
+		}
+		++index;
+		++column_number;
 	}
-
+	/* Append possible tokens which happened before special events handled
+	 * above.
+	 */
+	if(token_length != 0) {
+		result.append(source, token_start, token_length);
+	}
+	if(inside_preprocessor) {
+		result += path_source_handle_preprocessor(preprocessor_line,
+		                                          source_filepath,
+		                                          line_number,
+		                                          state);
+	}
+	/* Store result for further reuse. */
+	state->processed_files[source_filepath] = result;
 	return result;
 }
 
+string path_source_replace_includes(const string& source,
+                                    const string& path,
+                                    const string& source_filename)
+{
+	SourceReplaceState state;
+	state.base = path;
+	return path_source_replace_includes_recursive(
+	        source,
+	        path_join(path, source_filename),
+	        &state);
+}
+
 FILE *path_fopen(const string& path, const string& mode)
 {
 #ifdef _WIN32
diff --git a/intern/cycles/util/util_path.h b/intern/cycles/util/util_path.h
index 70dbb5ae403..0e5e2d2c837 100644
--- a/intern/cycles/util/util_path.h
+++ b/intern/cycles/util/util_path.h
@@ -24,10 +24,10 @@
 
 #include <stdio.h>
 
-#include "util_set.h"
-#include "util_string.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_set.h"
+#include "util/util_string.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index 14215056840..134383e88db 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -23,10 +23,10 @@
  * update notifications from a job running in another thread. All methods
  * except for the constructor/destructor are thread safe. */
 
-#include "util_function.h"
-#include "util_string.h"
-#include "util_time.h"
-#include "util_thread.h"
+#include "util/util_function.h"
+#include "util/util_string.h"
+#include "util/util_time.h"
+#include "util/util_thread.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -37,9 +37,11 @@ public:
 		pixel_samples = 0;
 		total_pixel_samples = 0;
 		current_tile_sample = 0;
-		finished_tiles = 0;
+		rendered_tiles = 0;
+		denoised_tiles = 0;
 		start_time = time_dt();
 		render_start_time = time_dt();
+		end_time = 0.0;
 		status = "Initializing";
 		substatus = "";
 		sync_status = "";
@@ -75,9 +77,11 @@ public:
 		pixel_samples = 0;
 		total_pixel_samples = 0;
 		current_tile_sample = 0;
-		finished_tiles = 0;
+		rendered_tiles = 0;
+		denoised_tiles = 0;
 		start_time = time_dt();
 		render_start_time = time_dt();
+		end_time = 0.0;
 		status = "Initializing";
 		substatus = "";
 		sync_status = "";
@@ -144,6 +148,7 @@ public:
 		thread_scoped_lock lock(progress_mutex);
 
 		start_time = time_dt();
+		end_time = 0.0;
 	}
 
 	void set_render_start_time()
@@ -167,8 +172,21 @@ public:
 	{
 		thread_scoped_lock lock(progress_mutex);
 
-		total_time_ = time_dt() - start_time;
-		render_time_ = time_dt() - render_start_time;
+		double time = (end_time > 0) ? end_time : time_dt();
+
+		total_time_ = time - start_time;
+		render_time_ = time - render_start_time;
+	}
+
+	/* Record the current time as the end time. While end_time is positive
+	 * the time query above reports durations up to this moment instead of
+	 * "now"; resetting end_time to 0 makes it follow the clock again. */
+	void set_end_time()
+	{
+		/* end_time is read under progress_mutex by the time query, so the
+		 * write has to hold the same lock to stay thread safe. */
+		thread_scoped_lock lock(progress_mutex);
+		end_time = time_dt();
 	}
 
 	void reset_sample()
@@ -177,7 +189,8 @@ public:
 
 		pixel_samples = 0;
 		current_tile_sample = 0;
-		finished_tiles = 0;
+		rendered_tiles = 0;
+		denoised_tiles = 0;
 	}
 
 	void set_total_pixel_samples(uint64_t total_pixel_samples_)
@@ -209,23 +222,36 @@ public:
 		set_update();
 	}
 
-	void add_finished_tile()
+	void add_finished_tile(bool denoised)
 	{
 		thread_scoped_lock lock(progress_mutex);
 
-		finished_tiles++;
+		if(denoised) {
+			denoised_tiles++;
+		}
+		else {
+			rendered_tiles++;
+		}
 	}
 
 	int get_current_sample()
 	{
+		thread_scoped_lock lock(progress_mutex);
 		/* Note that the value here always belongs to the last tile that updated,
 		 * so it's only useful if there is only one active tile. */
 		return current_tile_sample;
 	}
 
-	int get_finished_tiles()
+	int get_rendered_tiles()
+	{
+		thread_scoped_lock lock(progress_mutex);
+		return rendered_tiles;
+	}
+
+	int get_denoised_tiles()
 	{
-		return finished_tiles;
+		thread_scoped_lock lock(progress_mutex);
+		return denoised_tiles;
 	}
 
 	/* status messages */
@@ -318,9 +344,11 @@ protected:
 	int current_tile_sample;
 	/* Stores the number of tiles that's already finished.
 	 * Used to determine whether all but the last tile are finished rendering, in which case the current_tile_sample is displayed. */
-	int finished_tiles;
+	int rendered_tiles, denoised_tiles;
 
 	double start_time, render_start_time;
+	/* End time written when render is done, so it doesn't keep increasing on redraws. */
+	double end_time;
 
 	string status;
 	string substatus;
diff --git a/intern/cycles/util/util_projection.h b/intern/cycles/util/util_projection.h
new file mode 100644
index 00000000000..dbcb9877a48
--- /dev/null
+++ b/intern/cycles/util/util_projection.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright 2011-2018 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_PROJECTION_H__
+#define __UTIL_PROJECTION_H__
+
+#include "util/util_transform.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* 4x4 projection matrix, perspective or orthographic. */
+
+typedef struct ProjectionTransform {
+	/* The four rows of the matrix. */
+	float4 x, y, z, w;
+
+#ifndef __KERNEL_GPU__
+	/* Default construction assigns nothing to the rows. */
+	ProjectionTransform() {}
+
+	/* Build from a Transform: its rows x, y and z are copied and
+	 * (0, 0, 0, 1) is used as the fourth row. */
+	explicit ProjectionTransform(const Transform& tfm)
+	: x(tfm.x), y(tfm.y), z(tfm.z),
+	  w(make_float4(0.0f, 0.0f, 0.0f, 1.0f))
+	{
+	}
+#endif
+} ProjectionTransform;
+
+typedef struct PerspectiveMotionTransform {
+	/* Two projection matrices, stored in the order pre, post. */
+	ProjectionTransform pre, post;
+} PerspectiveMotionTransform;
+
+/* Functions */
+
+ccl_device_inline float3 transform_perspective(const ProjectionTransform *t, const float3 a)
+{
+	/* Multiply the rows with the point (a, 1); the result is zero if w is 0. */
+	const float4 point = make_float4(a.x, a.y, a.z, 1.0f);
+	const float3 xyz = make_float3(dot(t->x, point), dot(t->y, point), dot(t->z, point));
+	const float w = dot(t->w, point);
+	return (w == 0.0f)? make_float3(0.0f, 0.0f, 0.0f): xyz/w;
+}
+
+ccl_device_inline float3 transform_perspective_direction(const ProjectionTransform *t, const float3 a)
+{
+	/* Only the x, y, z components of the first three rows are read. */
+	const float dx = a.x*t->x.x + a.y*t->x.y + a.z*t->x.z;
+	const float dy = a.x*t->y.x + a.y*t->y.y + a.z*t->y.z;
+	const float dz = a.x*t->z.x + a.y*t->z.y + a.z*t->z.z;
+
+	return make_float3(dx, dy, dz);
+}
+
+#ifndef __KERNEL_GPU__
+
+ccl_device_inline Transform projection_to_transform(const ProjectionTransform& a)
+{
+	const Transform rows_xyz = {a.x, a.y, a.z}; /* a.w is not copied. */
+	return rows_xyz;
+}
+
+ccl_device_inline ProjectionTransform projection_transpose(const ProjectionTransform& a)
+{
+	/* Row i of the result is assembled from component i of every input row. */
+	ProjectionTransform t;
+	t.x = make_float4(a.x.x, a.y.x, a.z.x, a.w.x);
+	t.y = make_float4(a.x.y, a.y.y, a.z.y, a.w.y);
+	t.z = make_float4(a.x.z, a.y.z, a.z.z, a.w.z);
+	t.w = make_float4(a.x.w, a.y.w, a.z.w, a.w.w);
+
+	return t;
+}
+
+ProjectionTransform projection_inverse(const ProjectionTransform& a);
+
+ccl_device_inline ProjectionTransform make_projection(
+	float a, float b, float c, float d,
+	float e, float f, float g, float h,
+	float i, float j, float k, float l,
+	float m, float n, float o, float p)
+{
+	/* Arguments are given row by row: (a..d), (e..h), (i..l), (m..p). */
+	ProjectionTransform t;
+	t.x = make_float4(a, b, c, d);
+	t.y = make_float4(e, f, g, h);
+	t.z = make_float4(i, j, k, l);
+	t.w = make_float4(m, n, o, p);
+
+	return t;
+}
+ccl_device_inline ProjectionTransform projection_identity()
+{
+	const float one = 1.0f, zero = 0.0f; /* Diagonal and off-diagonal values. */
+	return make_projection(one,  zero, zero, zero,
+	                       zero, one,  zero, zero,
+	                       zero, zero, one,  zero,
+	                       zero, zero, zero, one);
+}
+
+ccl_device_inline ProjectionTransform operator*(const ProjectionTransform& a, const ProjectionTransform& b)
+{
+	/* Rows of bt are the columns of b, so every entry is one row-column dot. */
+	const ProjectionTransform bt = projection_transpose(b);
+	ProjectionTransform product;
+	product.x = make_float4(dot(a.x, bt.x), dot(a.x, bt.y), dot(a.x, bt.z), dot(a.x, bt.w));
+	product.y = make_float4(dot(a.y, bt.x), dot(a.y, bt.y), dot(a.y, bt.z), dot(a.y, bt.w));
+	product.z = make_float4(dot(a.z, bt.x), dot(a.z, bt.y), dot(a.z, bt.z), dot(a.z, bt.w));
+	product.w = make_float4(dot(a.w, bt.x), dot(a.w, bt.y), dot(a.w, bt.z), dot(a.w, bt.w));
+
+	return product;
+}
+
+ccl_device_inline ProjectionTransform operator*(const ProjectionTransform& a, const Transform& b)
+{
+	return a * ProjectionTransform(b); /* Convert b to four rows, then multiply. */
+}
+
+ccl_device_inline ProjectionTransform operator*(const Transform& a, const ProjectionTransform& b)
+{
+	return ProjectionTransform(a) * b; /* Convert a to four rows, then multiply. */
+}
+
+ccl_device_inline void print_projection(const char *label, const ProjectionTransform& t)
+{
+	const float4 rows[4] = {t.x, t.y, t.z, t.w}; /* Printed in row order. */
+	for(int i = 0; i < 4; i++) {
+		print_float4(label, rows[i]);
+	}
+	printf("\n");
+}
+
+ccl_device_inline ProjectionTransform projection_perspective(float fov, float n, float f)
+{
+	/* Third row remaps z using n and f, fourth row copies z into w. */
+	const float depth_range = f - n;
+	const ProjectionTransform persp = make_projection(
+		1, 0, 0, 0,
+		0, 1, 0, 0,
+		0, 0, f / depth_range, -f*n / depth_range,
+		0, 0, 1, 0);
+
+	/* Scale x and y by the reciprocal of tan(fov/2). */
+	const float inv_angle = 1.0f/tanf(0.5f*fov);
+	return transform_scale(inv_angle, inv_angle, 1) * persp;
+}
+
+ccl_device_inline ProjectionTransform projection_orthographic(float znear, float zfar)
+{
+	/* Product of a z scale and a z translation, converted to four rows. */
+	const Transform to_near = transform_translate(0.0f, 0.0f, -znear);
+	const Transform to_unit = transform_scale(1.0f, 1.0f, 1.0f / (zfar-znear));
+
+	return ProjectionTransform(to_unit * to_near);
+}
+
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_PROJECTION_H__ */
+
diff --git a/intern/cycles/util/util_rect.h b/intern/cycles/util/util_rect.h
new file mode 100644
index 00000000000..17a55a14d0b
--- /dev/null
+++ b/intern/cycles/util/util_rect.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_RECT_H__
+#define __UTIL_RECT_H__
+
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Rectangles are represented as a int4 containing the coordinates of the lower-left and
+ * upper-right corners in the order (x0, y0, x1, y1). */
+
+ccl_device_inline int4 rect_from_shape(int x0, int y0, int w, int h)
+{
+	return make_int4(x0, y0, x0 + w, y0 + h); /* (x0, y0, x1, y1) */
+}
+
+ccl_device_inline int4 rect_expand(int4 rect, int d)
+{
+	return make_int4(rect.x - d, rect.y - d, rect.z + d, rect.w + d); /* Moves every edge outwards by d. */
+}
+
+/* Returns the intersection of a and b; it fails rect_is_valid() if they do not overlap. */
+ccl_device_inline int4 rect_clip(int4 a, int4 b)
+{
+	return make_int4(max(a.x, b.x), max(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+}
+
+ccl_device_inline bool rect_is_valid(int4 rect)
+{
+	return (rect.x < rect.z) && (rect.y < rect.w); /* Positive width and height. */
+}
+
+/* Row-major index of pixel (x, y), counted from the rect's (x0, y0) corner. */
+ccl_device_inline int coord_to_local_index(int4 rect, int x, int y)
+{
+	const int local_x = x - rect.x, local_y = y - rect.y;
+	return local_y * (rect.z - rect.x) + local_x;
+}
+
+/* Inverse of coord_to_local_index(); returns false once y reaches rect.w. */
+ccl_device_inline bool local_index_to_coord(int4 rect, int idx, int *x, int *y)
+{
+	const int width = rect.z - rect.x;
+	const int row = rect.y + (idx / width);
+	*x = rect.x + (idx % width);
+	*y = row;
+	return (row < rect.w);
+}
+
+ccl_device_inline int rect_size(int4 rect)
+{
+	return (rect.z - rect.x) * (rect.w - rect.y); /* width * height */
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_RECT_H__ */
+
diff --git a/intern/cycles/util/util_simd.cpp b/intern/cycles/util/util_simd.cpp
index de2df612578..f90439c188b 100644
--- a/intern/cycles/util/util_simd.cpp
+++ b/intern/cycles/util/util_simd.cpp
@@ -19,7 +19,7 @@
     (defined(WITH_KERNEL_NATIVE) && defined(__SSE2__))
 
 #define __KERNEL_SSE2__
-#include "util_simd.h"
+#include "util/util_simd.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 756bd15ed25..04341451afb 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -18,19 +18,37 @@
 #ifndef __UTIL_SIMD_TYPES_H__
 #define __UTIL_SIMD_TYPES_H__
 
+#ifndef __KERNEL_GPU__
+
 #include <limits>
 
-#include "util_debug.h"
-#include "util_types.h"
+#include "util/util_defines.h"
+
+/* SSE Intrinsics includes
+ *
+ * We assume __KERNEL_SSEX__ flags to have been defined at this point */
+
+/* SSE intrinsics headers */
+#ifndef FREE_WINDOWS64
+
+#ifdef _MSC_VER
+#  include <intrin.h>
+#elif (defined(__x86_64__) || defined(__i386__))
+#  include <x86intrin.h>
+#endif
+
+#else
+
+/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
+ * Since we can't avoid including <windows.h>, better only include that */
+#include "util/util_windows.h"
+
+#endif
 
 CCL_NAMESPACE_BEGIN
 
 #ifdef __KERNEL_SSE2__
 
-struct sseb;
-struct ssei;
-struct ssef;
-
 extern const __m128 _mm_lookupmask_ps[16];
 
 /* Special Types */
@@ -328,12 +346,12 @@ __forceinline size_t __bscf(size_t& v)
 
 #endif /* _WIN32 */
 
-static const unsigned int BITSCAN_NO_BIT_SET_32 = 32;
-static const size_t       BITSCAN_NO_BIT_SET_64 = 64;
-
-/* Emulation of SSE4 functions with SSE3 */
+/* Test __KERNEL_SSE41__ for MSVC which does not define __SSE4_1__, and test
+ * __SSE4_1__ to avoid OpenImageIO conflicts with our emulation macros on other
+ * platforms when compiling code outside the kernel. */
+#if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__))
 
-#if defined(__KERNEL_SSE3) && !defined(__KERNEL_SSE4__)
+/* Emulation of SSE4 functions with SSE2 */
 
 #define _MM_FROUND_TO_NEAREST_INT    0x00
 #define _MM_FROUND_TO_NEG_INF        0x01
@@ -341,45 +359,51 @@ static const size_t       BITSCAN_NO_BIT_SET_64 = 64;
 #define _MM_FROUND_TO_ZERO           0x03
 #define _MM_FROUND_CUR_DIRECTION     0x04
 
-#define _mm_blendv_ps __emu_mm_blendv_ps
-__forceinline __m128 _mm_blendv_ps( __m128 value, __m128 input, __m128 mask ) { 
-    return _mm_or_ps(_mm_and_ps(mask, input), _mm_andnot_ps(mask, value)); 
+#undef _mm_blendv_ps
+#define _mm_blendv_ps _mm_blendv_ps_emu
+__forceinline __m128 _mm_blendv_ps_emu( __m128 value, __m128 input, __m128 mask)
+{
+    __m128i isignmask = _mm_set1_epi32(0x80000000);
+    __m128 signmask = _mm_castsi128_ps(isignmask);
+    __m128i iandsign = _mm_castps_si128(_mm_and_ps(mask, signmask));
+    __m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask);
+    __m128 cmpmask = _mm_castsi128_ps(icmpmask);
+    return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value));
 }
 
-#define _mm_blend_ps __emu_mm_blend_ps
-__forceinline __m128 _mm_blend_ps( __m128 value, __m128 input, const int mask ) { 
+#undef _mm_blend_ps
+#define _mm_blend_ps _mm_blend_ps_emu
+__forceinline __m128 _mm_blend_ps_emu( __m128 value, __m128 input, const int mask)
+{
     assert(mask < 0x10); return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]); 
 }
 
-#define _mm_blendv_epi8 __emu_mm_blendv_epi8
-__forceinline __m128i _mm_blendv_epi8( __m128i value, __m128i input, __m128i mask ) { 
+#undef _mm_blendv_epi8
+#define _mm_blendv_epi8 _mm_blendv_epi8_emu
+__forceinline __m128i _mm_blendv_epi8_emu( __m128i value, __m128i input, __m128i mask)
+{
     return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value)); 
 }
 
-#define _mm_mullo_epi32 __emu_mm_mullo_epi32
-__forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) {
-  __m128i rvalue;
-  char* _r = (char*)(&rvalue + 1);
-  char* _v = (char*)(& value + 1);
-  char* _i = (char*)(& input + 1);
-  for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32*)(_r + i)) = *((int32*)(_v + i))*  *((int32*)(_i + i));
-  return rvalue;
-}
-
-
-#define _mm_min_epi32 __emu_mm_min_epi32
-__forceinline __m128i _mm_min_epi32( __m128i value, __m128i input ) { 
+#undef _mm_min_epi32
+#define _mm_min_epi32 _mm_min_epi32_emu
+__forceinline __m128i _mm_min_epi32_emu( __m128i value, __m128i input)
+{
     return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input)); 
 }
 
-#define _mm_max_epi32 __emu_mm_max_epi32
-__forceinline __m128i _mm_max_epi32( __m128i value, __m128i input ) { 
+#undef _mm_max_epi32
+#define _mm_max_epi32 _mm_max_epi32_emu
+__forceinline __m128i _mm_max_epi32_emu( __m128i value, __m128i input)
+{
     return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input)); 
 }
 
-#define _mm_extract_epi32 __emu_mm_extract_epi32
-__forceinline int _mm_extract_epi32( __m128i input, const int index ) {
-  switch ( index ) {
+#undef _mm_extract_epi32
+#define _mm_extract_epi32 _mm_extract_epi32_emu
+__forceinline int _mm_extract_epi32_emu( __m128i input, const int index)
+{
+  switch(index) {
   case 0: return _mm_cvtsi128_si32(input);
   case 1: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1)));
   case 2: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2)));
@@ -388,24 +412,27 @@ __forceinline int _mm_extract_epi32( __m128i input, const int index ) {
   }
 }
 
-#define _mm_insert_epi32 __emu_mm_insert_epi32
-__forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int index ) { 
+#undef _mm_insert_epi32
+#define _mm_insert_epi32 _mm_insert_epi32_emu
+__forceinline __m128i _mm_insert_epi32_emu( __m128i value, int input, const int index)
+{
     assert(index >= 0 && index < 4); ((int*)&value)[index] = input; return value; 
 }
 
-#define _mm_extract_ps __emu_mm_extract_ps
-__forceinline int _mm_extract_ps( __m128 input, const int index ) {
-  int32* ptr = (int32*)&input; return ptr[index];
+#undef _mm_insert_ps
+#define _mm_insert_ps _mm_insert_ps_emu
+__forceinline __m128 _mm_insert_ps_emu( __m128 value, __m128 input, const int index)
+{
+	assert(index < 0x100);
+	((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6];
+	return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value);
 }
 
-#define _mm_insert_ps __emu_mm_insert_ps
-__forceinline __m128 _mm_insert_ps( __m128 value, __m128 input, const int index )
-{ assert(index < 0x100); ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); }
-
-#define _mm_round_ps __emu_mm_round_ps
-__forceinline __m128 _mm_round_ps( __m128 value, const int flags )
+#undef _mm_round_ps
+#define _mm_round_ps _mm_round_ps_emu
+__forceinline __m128 _mm_round_ps_emu( __m128 value, const int flags)
 {
-  switch ( flags )
+  switch(flags)
   {
   case _MM_FROUND_TO_NEAREST_INT: return _mm_cvtepi32_ps(_mm_cvtps_epi32(value));
   case _MM_FROUND_TO_NEG_INF    : return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f))));
@@ -415,20 +442,7 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags )
   return value;
 }
 
-#ifdef _M_X64
-#define _mm_insert_epi64 __emu_mm_insert_epi64
-__forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) { 
-    assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value; 
-}
-
-#define _mm_extract_epi64 __emu_mm_extract_epi64
-__forceinline __int64 _mm_extract_epi64( __m128i input, const int index ) { 
-    assert(size_t(index) < 2); 
-    return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input)); 
-}
-#endif
-
-#endif
+#endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
 
 #else  /* __KERNEL_SSE2__ */
 
@@ -440,22 +454,37 @@ ccl_device_inline int bitscan(int value)
 {
 	assert(value != 0);
 	int bit = 0;
-	while(value >>= 1) {
+	while((value & (1 << bit)) == 0) {
 		++bit;
 	}
 	return bit;
 }
 
+ccl_device_inline int __bsr(int value)
+{
+	assert(value != 0);
+	int shifts = 0; /* Right shifts needed before value becomes zero. */
+	for(value >>= 1; value != 0; value >>= 1) {
+		++shifts;
+	}
+	return shifts;
+}
 
 #endif /* __KERNEL_SSE2__ */
 
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)  || \
+	defined(__KERNEL_SSE3__)  || \
+	defined(__KERNEL_SSSE3__) || \
+	defined(__KERNEL_SSE41__) || \
+	defined(__KERNEL_AVX__)   || \
+	defined(__KERNEL_AVX2__)
+	/* do nothing */
+#endif
+
 CCL_NAMESPACE_END
 
-#include "util_math.h"
-#include "util_sseb.h"
-#include "util_ssei.h"
-#include "util_ssef.h"
-#include "util_avxf.h"
+#endif /* __KERNEL_GPU__ */
 
 #endif /* __UTIL_SIMD_TYPES_H__ */
 
diff --git a/intern/cycles/util/util_sky_model.cpp b/intern/cycles/util/util_sky_model.cpp
index 5730986cc4f..6dda8469907 100644
--- a/intern/cycles/util/util_sky_model.cpp
+++ b/intern/cycles/util/util_sky_model.cpp
@@ -97,8 +97,8 @@ All instructions on how to use this code are in the accompanying header file.
 
 */
 
-#include "util_sky_model.h"
-#include "util_sky_model_data.h"
+#include "util/util_sky_model.h"
+#include "util/util_sky_model_data.h"
 
 #include <assert.h>
 #include <stdio.h>
diff --git a/intern/cycles/util/util_sseb.h b/intern/cycles/util/util_sseb.h
index 6e669701f3b..977976c3fc0 100644
--- a/intern/cycles/util/util_sseb.h
+++ b/intern/cycles/util/util_sseb.h
@@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __KERNEL_SSE2__
 
+struct ssei;
+struct ssef;
+
 /*! 4-wide SSE bool type. */
 struct sseb
 {
@@ -116,7 +119,7 @@ __forceinline const sseb unpacklo( const sseb& a, const sseb& b ) { return _mm_u
 __forceinline const sseb unpackhi( const sseb& a, const sseb& b ) { return _mm_unpackhi_ps(a, b); }
 
 template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const sseb shuffle( const sseb& a ) {
-	return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0));
+	return _mm_castsi128_ps(_mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)));
 }
 
 template<> __forceinline const sseb shuffle<0, 1, 0, 1>( const sseb& a ) {
diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h
index 2f5295b5463..bb007ff84a9 100644
--- a/intern/cycles/util/util_ssef.h
+++ b/intern/cycles/util/util_ssef.h
@@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __KERNEL_SSE2__
 
+struct sseb; /* Sibling SSE types, defined in their own headers. */
+struct ssei;
+
 /*! 4-wide SSE float type. */
 struct ssef
 {
@@ -514,12 +517,12 @@ ccl_device_inline float len3(const ssef& a)
 /* faster version for SSSE3 */
 typedef ssei shuffle_swap_t;
 
-ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_identity(void)
 {
 	return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
 }
 
-ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_swap(void)
 {
 	return _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
 }
@@ -534,12 +537,12 @@ ccl_device_inline const ssef shuffle_swap(const ssef& a, const shuffle_swap_t& s
 /* somewhat slower version for SSE2 */
 typedef int shuffle_swap_t;
 
-ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_identity(void)
 {
 	return 0;
 }
 
-ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_swap(void)
 {
 	return 1;
 }
diff --git a/intern/cycles/util/util_ssei.h b/intern/cycles/util/util_ssei.h
index 5f62569268c..ef2a9e68b7d 100644
--- a/intern/cycles/util/util_ssei.h
+++ b/intern/cycles/util/util_ssei.h
@@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __KERNEL_SSE2__
 
+struct sseb;
+struct ssef;
+
 /*! 4-wide SSE integer type. */
 struct ssei
 {
@@ -234,8 +237,10 @@ __forceinline size_t select_max(const sseb& valid, const ssei& v) { const ssei a
 
 #else
 
-__forceinline int reduce_min(const ssei& v) { return min(min(v[0],v[1]),min(v[2],v[3])); }
-__forceinline int reduce_max(const ssei& v) { return max(max(v[0],v[1]),max(v[2],v[3])); }
+__forceinline int ssei_min(int a, int b) { return (a < b)? a: b; }
+__forceinline int ssei_max(int a, int b) { return (a > b)? a: b; }
+__forceinline int reduce_min(const ssei& v) { return ssei_min(ssei_min(v[0],v[1]),ssei_min(v[2],v[3])); }
+__forceinline int reduce_max(const ssei& v) { return ssei_max(ssei_max(v[0],v[1]),ssei_max(v[2],v[3])); }
 __forceinline int reduce_add(const ssei& v) { return v[0]+v[1]+v[2]+v[3]; }
 
 #endif
diff --git a/intern/cycles/util/util_stack_allocator.h b/intern/cycles/util/util_stack_allocator.h
index d7aab5b250c..79a535bd170 100644
--- a/intern/cycles/util/util_stack_allocator.h
+++ b/intern/cycles/util/util_stack_allocator.h
@@ -20,9 +20,6 @@
 #include <cstddef>
 #include <memory>
 
-#include "util_debug.h"
-#include "util_types.h"
-
 CCL_NAMESPACE_BEGIN
 
 /* Stack allocator for the use with STL. */
diff --git a/intern/cycles/util/util_static_assert.h b/intern/cycles/util/util_static_assert.h
index 033d85e8ec6..e90049254de 100644
--- a/intern/cycles/util/util_static_assert.h
+++ b/intern/cycles/util/util_static_assert.h
@@ -43,7 +43,9 @@ template <> class StaticAssertFailure<true> {};
 #    endif  /* __COUNTER__ */
 #  endif  /* C++11 or MSVC2015 */
 #else  /* __KERNEL_GPU__ */
-#  define static_assert(statement, message)
+#  ifndef static_assert
+#    define static_assert(statement, message)
+#  endif
 #endif  /* __KERNEL_GPU__ */
 
 /* TODO(sergey): For until C++11 is a bare minimum for us,
diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h
index c21a8488c81..7667f58eb7d 100644
--- a/intern/cycles/util/util_stats.h
+++ b/intern/cycles/util/util_stats.h
@@ -17,7 +17,7 @@
 #ifndef __UTIL_STATS_H__
 #define __UTIL_STATS_H__
 
-#include "util_atomic.h"
+#include "util/util_atomic.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -30,7 +30,7 @@ public:
 
 	void mem_alloc(size_t size) {
 		atomic_add_and_fetch_z(&mem_used, size);
-		atomic_update_max_z(&mem_peak, mem_used);
+		atomic_fetch_and_update_max_z(&mem_peak, mem_used);
 	}
 
 	void mem_free(size_t size) {
diff --git a/intern/cycles/util/util_string.cpp b/intern/cycles/util/util_string.cpp
index 5594aa8edb6..94ad512982c 100644
--- a/intern/cycles/util/util_string.cpp
+++ b/intern/cycles/util/util_string.cpp
@@ -17,9 +17,9 @@
 #include <stdarg.h>
 #include <stdio.h>
 
-#include "util_foreach.h"
-#include "util_string.h"
-#include "util_windows.h"
+#include "util/util_foreach.h"
+#include "util/util_string.h"
+#include "util/util_windows.h"
 
 #ifdef _WIN32
 #  ifndef vsnprintf
@@ -148,6 +148,12 @@ void string_replace(string& haystack, const string& needle, const string& other)
 string string_remove_trademark(const string &s)
 {
 	string result = s;
+
+	/* Special case, so we don't leave sequential spaces behind. */
+	/* TODO(sergey): Consider using regex perhaps? */
+	string_replace(result, " (TM)", "");
+	string_replace(result, " (R)", "");
+
 	string_replace(result, "(TM)", "");
 	string_replace(result, "(R)", "");
 
diff --git a/intern/cycles/util/util_string.h b/intern/cycles/util/util_string.h
index 7aeed96f00b..e2c105db9c1 100644
--- a/intern/cycles/util/util_string.h
+++ b/intern/cycles/util/util_string.h
@@ -21,7 +21,7 @@
 #include <string>
 #include <sstream>
 
-#include "util_vector.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index 87d885c44cf..5f5211228c5 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -14,12 +14,11 @@
  * limitations under the License.
  */
 
-#include "util_system.h"
+#include "util/util_system.h"
 
-#include "util_debug.h"
-#include "util_logging.h"
-#include "util_types.h"
-#include "util_string.h"
+#include "util/util_logging.h"
+#include "util/util_types.h"
+#include "util/util_string.h"
 
 #ifdef _WIN32
 #  if(!defined(FREE_WINDOWS))
@@ -234,35 +233,34 @@ static CPUCapabilities& system_cpu_capabilities()
 bool system_cpu_support_sse2()
 {
 	CPUCapabilities& caps = system_cpu_capabilities();
-	return DebugFlags().cpu.sse2 && caps.sse && caps.sse2;
+	return caps.sse && caps.sse2;
 }
 
 bool system_cpu_support_sse3()
 {
 	CPUCapabilities& caps = system_cpu_capabilities();
-	return DebugFlags().cpu.sse3 &&
-	       caps.sse && caps.sse2 && caps.sse3 && caps.ssse3;
+	return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3;
 }
 
 bool system_cpu_support_sse41()
 {
 	CPUCapabilities& caps = system_cpu_capabilities();
-	return DebugFlags().cpu.sse41 &&
-	       caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41;
+	return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41;
 }
 
 bool system_cpu_support_avx()
 {
 	CPUCapabilities& caps = system_cpu_capabilities();
-	return DebugFlags().cpu.avx &&
-	       caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx;
+	return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 &&
+	       caps.sse41 && caps.avx;
 }
 
 bool system_cpu_support_avx2()
 {
 	CPUCapabilities& caps = system_cpu_capabilities();
-	return DebugFlags().cpu.avx2 &&
-	       caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx && caps.f16c && caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2;
+	return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 &&
+	       caps.avx && caps.f16c && caps.avx2 && caps.fma3 && caps.bmi1 &&
+	       caps.bmi2;
 }
 #else
 
@@ -292,5 +290,35 @@ bool system_cpu_support_avx2()
 
 #endif
 
+/* Return the total amount of physical RAM of the system, in bytes.
+ * Returns 0 when the amount can not be queried. */
+size_t system_physical_ram()
+{
+#ifdef _WIN32
+	MEMORYSTATUSEX ram;
+	ram.dwLength = sizeof (ram);
+	if(!GlobalMemoryStatusEx(&ram)) {
+		return 0;
+	}
+	/* ullTotalPhys is already expressed in bytes, no scaling is needed. */
+	return ram.ullTotalPhys;
+#elif defined(__APPLE__)
+	uint64_t ram = 0;
+	size_t len = sizeof(ram);
+	if (sysctlbyname("hw.memsize", &ram, &len, NULL, 0) == 0) {
+		return ram;
+	}
+	return 0;
+#else
+	/* sysconf() returns -1 on failure, do not turn that into a huge size. */
+	const long page_size = sysconf(_SC_PAGESIZE);
+	const long num_pages = sysconf(_SC_PHYS_PAGES);
+	if(page_size <= 0 || num_pages <= 0) {
+		return 0;
+	}
+	return (size_t)page_size * (size_t)num_pages;
+#endif
+}
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h
index ff61b260bed..e55dd6dd136 100644
--- a/intern/cycles/util/util_system.h
+++ b/intern/cycles/util/util_system.h
@@ -17,7 +17,7 @@
 #ifndef __UTIL_SYSTEM_H__
 #define __UTIL_SYSTEM_H__
 
-#include "util_string.h"
+#include "util/util_string.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -42,6 +42,8 @@ bool system_cpu_support_sse41();
 bool system_cpu_support_avx();
 bool system_cpu_support_avx2();
 
+size_t system_physical_ram();
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_SYSTEM_H__ */
diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
index 0d1fed3ebbf..9df1096de8a 100644
--- a/intern/cycles/util/util_task.cpp
+++ b/intern/cycles/util/util_task.cpp
@@ -14,12 +14,11 @@
  * limitations under the License.
  */
 
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_system.h"
-#include "util_task.h"
-#include "util_time.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_system.h"
+#include "util/util_task.h"
+#include "util/util_time.h"
 
 //#define THREADING_DEBUG_ENABLED
 
@@ -206,9 +205,9 @@ void TaskScheduler::init(int num_threads)
 		threads.resize(num_threads);
 
 		const int num_groups = system_cpu_group_count();
-		unsigned short num_process_groups;
+		unsigned short num_process_groups = 0;
 		vector<unsigned short> process_groups;
-		int current_group_threads;
+		int current_group_threads = 0;
 		if(num_groups > 1) {
 			process_groups.resize(num_groups);
 			num_process_groups = system_cpu_process_groups(num_groups, 
diff --git a/intern/cycles/util/util_task.h b/intern/cycles/util/util_task.h
index 0b82f14f66f..3ebfb007e40 100644
--- a/intern/cycles/util/util_task.h
+++ b/intern/cycles/util/util_task.h
@@ -17,10 +17,10 @@
 #ifndef __UTIL_TASK_H__
 #define __UTIL_TASK_H__
 
-#include "util_list.h"
-#include "util_string.h"
-#include "util_thread.h"
-#include "util_vector.h"
+#include "util/util_list.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h
index aff928ea2ee..4b5f630427d 100644
--- a/intern/cycles/util/util_texture.h
+++ b/intern/cycles/util/util_texture.h
@@ -20,63 +20,7 @@
 CCL_NAMESPACE_BEGIN
 
 /* Texture limits on devices. */
-
-/* CPU */
-#define TEX_NUM_FLOAT4_CPU		1024
-#define TEX_NUM_BYTE4_CPU		1024
-#define TEX_NUM_HALF4_CPU		1024
-#define TEX_NUM_FLOAT_CPU		1024
-#define TEX_NUM_BYTE_CPU		1024
-#define TEX_NUM_HALF_CPU		1024
-#define TEX_START_FLOAT4_CPU	0
-#define TEX_START_BYTE4_CPU		TEX_NUM_FLOAT4_CPU
-#define TEX_START_HALF4_CPU		(TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU)
-#define TEX_START_FLOAT_CPU		(TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU + TEX_NUM_HALF4_CPU)
-#define TEX_START_BYTE_CPU		(TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU + TEX_NUM_HALF4_CPU + TEX_NUM_FLOAT_CPU)
-#define TEX_START_HALF_CPU		(TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU + TEX_NUM_HALF4_CPU + TEX_NUM_FLOAT_CPU + TEX_NUM_BYTE_CPU)
-
-/* CUDA (Geforce 4xx and 5xx) */
-#define TEX_NUM_FLOAT4_CUDA		5
-#define TEX_NUM_BYTE4_CUDA		85
-#define TEX_NUM_HALF4_CUDA		0
-#define TEX_NUM_FLOAT_CUDA		0
-#define TEX_NUM_BYTE_CUDA		0
-#define TEX_NUM_HALF_CUDA		0
-#define TEX_START_FLOAT4_CUDA	0
-#define TEX_START_BYTE4_CUDA	TEX_NUM_FLOAT4_CUDA
-#define TEX_START_HALF4_CUDA	(TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA)
-#define TEX_START_FLOAT_CUDA	(TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA)
-#define TEX_START_BYTE_CUDA		(TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA + TEX_NUM_FLOAT_CUDA)
-#define TEX_START_HALF_CUDA		(TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA + TEX_NUM_FLOAT_CUDA + TEX_NUM_BYTE_CUDA)
-
-/* CUDA (Kepler, Geforce 6xx and above) */
-#define TEX_NUM_FLOAT4_CUDA_KEPLER		1024
-#define TEX_NUM_BYTE4_CUDA_KEPLER		1024
-#define TEX_NUM_HALF4_CUDA_KEPLER		1024
-#define TEX_NUM_FLOAT_CUDA_KEPLER		1024
-#define TEX_NUM_BYTE_CUDA_KEPLER		1024
-#define TEX_NUM_HALF_CUDA_KEPLER		1024
-#define TEX_START_FLOAT4_CUDA_KEPLER	0
-#define TEX_START_BYTE4_CUDA_KEPLER		TEX_NUM_FLOAT4_CUDA_KEPLER
-#define TEX_START_HALF4_CUDA_KEPLER		(TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER)
-#define TEX_START_FLOAT_CUDA_KEPLER		(TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER + TEX_NUM_HALF4_CUDA_KEPLER)
-#define TEX_START_BYTE_CUDA_KEPLER		(TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER + TEX_NUM_HALF4_CUDA_KEPLER + TEX_NUM_FLOAT_CUDA_KEPLER)
-#define TEX_START_HALF_CUDA_KEPLER		(TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER + TEX_NUM_HALF4_CUDA_KEPLER + TEX_NUM_FLOAT_CUDA_KEPLER + TEX_NUM_BYTE_CUDA_KEPLER)
-
-/* OpenCL */
-#define TEX_NUM_FLOAT4_OPENCL	1024
-#define TEX_NUM_BYTE4_OPENCL	1024
-#define TEX_NUM_HALF4_OPENCL	0
-#define TEX_NUM_FLOAT_OPENCL	1024
-#define TEX_NUM_BYTE_OPENCL		1024
-#define TEX_NUM_HALF_OPENCL		0
-#define TEX_START_FLOAT4_OPENCL	0
-#define TEX_START_BYTE4_OPENCL	TEX_NUM_FLOAT4_OPENCL
-#define TEX_START_HALF4_OPENCL	(TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL)
-#define TEX_START_FLOAT_OPENCL	(TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL + TEX_NUM_HALF4_OPENCL)
-#define TEX_START_BYTE_OPENCL	(TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL + TEX_NUM_HALF4_OPENCL + TEX_NUM_FLOAT_OPENCL)
-#define TEX_START_HALF_OPENCL	(TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL + TEX_NUM_HALF4_OPENCL + TEX_NUM_FLOAT_OPENCL + TEX_NUM_BYTE_OPENCL)
-
+#define TEX_NUM_MAX (INT_MAX >> 4)
 
 /* Color to use when textures are not found. */
 #define TEX_IMAGE_MISSING_R 1
@@ -84,6 +28,63 @@ CCL_NAMESPACE_BEGIN
 #define TEX_IMAGE_MISSING_B 1
 #define TEX_IMAGE_MISSING_A 1
 
+/* Texture type, stored in the lower bits of a flat index (argument parenthesized for safe expansion). */
+#define kernel_tex_type(tex) ((tex) & IMAGE_DATA_TYPE_MASK)
+
+/* Interpolation types for textures.
+ * CUDA also uses texture space to store other objects. */
+typedef enum InterpolationType {
+	INTERPOLATION_NONE = -1,
+	INTERPOLATION_LINEAR = 0,
+	INTERPOLATION_CLOSEST = 1,
+	INTERPOLATION_CUBIC = 2,
+	INTERPOLATION_SMART = 3,
+
+	INTERPOLATION_NUM_TYPES,
+} InterpolationType;
+
+/* Texture types.
+ * Since we store the type in the lower bits of a flat index,
+ * the shift and bit mask constants below need to be kept in sync. */
+typedef enum ImageDataType {
+	IMAGE_DATA_TYPE_FLOAT4 = 0,
+	IMAGE_DATA_TYPE_BYTE4 = 1,
+	IMAGE_DATA_TYPE_HALF4 = 2,
+	IMAGE_DATA_TYPE_FLOAT = 3,
+	IMAGE_DATA_TYPE_BYTE = 4,
+	IMAGE_DATA_TYPE_HALF = 5,
+
+	IMAGE_DATA_NUM_TYPES
+} ImageDataType;
+
+#define IMAGE_DATA_TYPE_SHIFT 3
+#define IMAGE_DATA_TYPE_MASK 0x7
+
+/* Extension types for textures.
+ *
+ * Defines how the image is extrapolated past its original bounds. */
+typedef enum ExtensionType {
+	/* Cause the image to repeat horizontally and vertically. */
+	EXTENSION_REPEAT = 0,
+	/* Extend by repeating edge pixels of the image. */
+	EXTENSION_EXTEND = 1,
+	/* Clip to image size and set exterior pixels as transparent. */
+	EXTENSION_CLIP = 2,
+
+	EXTENSION_NUM_TYPES,
+} ExtensionType;
+
+typedef struct TextureInfo {
+	/* Pointer, offset or texture depending on device. */
+	uint64_t data;
+	/* Buffer number for OpenCL. */
+	uint cl_buffer;
+	/* Interpolation and extension type. */
+	uint interpolation, extension;
+	/* Dimensions. */
+	uint width, height, depth;
+} TextureInfo;
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_TEXTURE_H__ */
diff --git a/intern/cycles/util/util_thread.cpp b/intern/cycles/util/util_thread.cpp
index 3db8b4bd197..c66aa484264 100644
--- a/intern/cycles/util/util_thread.cpp
+++ b/intern/cycles/util/util_thread.cpp
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#include "util_thread.h"
+#include "util/util_thread.h"
 
-#include "util_system.h"
-#include "util_windows.h"
+#include "util/util_system.h"
+#include "util/util_windows.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -26,7 +26,11 @@ thread::thread(function<void(void)> run_cb, int group)
     joined_(false),
 	group_(group)
 {
+#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+	thread_ = std::thread(&thread::run, this);
+#else
 	pthread_create(&pthread_id_, NULL, run, (void*)this);
+#endif
 }
 
 thread::~thread()
@@ -60,7 +64,17 @@ void *thread::run(void *arg)
 bool thread::join()
 {
 	joined_ = true;
+#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+	try {
+		thread_.join();
+		return true;
+	}
+	catch (const std::system_error&) {
+		return false;
+	}
+#else
 	return pthread_join(pthread_id_, NULL) == 0;
+#endif
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_thread.h b/intern/cycles/util/util_thread.h
index 427c633d2ce..4d8f464359c 100644
--- a/intern/cycles/util/util_thread.h
+++ b/intern/cycles/util/util_thread.h
@@ -24,15 +24,21 @@
 #  include <functional>
 #else
 #  include <boost/thread.hpp>
+#  include <pthread.h>
 #endif
-#include <pthread.h>
 #include <queue>
 
+#ifdef _WIN32
+#  include "util/util_windows.h"
+#else
+#  include <pthread.h>
+#endif
+
 #ifdef __APPLE__
 #  include <libkern/OSAtomic.h>
 #endif
 
-#include "util_function.h"
+#include "util/util_function.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -60,7 +66,11 @@ public:
 
 protected:
 	function<void(void)> run_cb_;
+#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+	std::thread thread_;
+#else
 	pthread_t pthread_id_;
+#endif
 	bool joined_;
 	int group_;
 };
@@ -81,7 +91,24 @@ public:
 	inline void unlock() {
 		OSSpinLockUnlock(&spin_);
 	}
-#else  /* __APPLE__ */
+#elif defined(_WIN32)
+	inline thread_spin_lock() {
+		const DWORD SPIN_COUNT = 50000;
+		InitializeCriticalSectionAndSpinCount(&cs_, SPIN_COUNT);
+	}
+
+	inline ~thread_spin_lock() {
+		DeleteCriticalSection(&cs_);
+	}
+
+	inline void lock() {
+		EnterCriticalSection(&cs_);
+	}
+
+	inline void unlock() {
+		LeaveCriticalSection(&cs_);
+	}
+#else
 	inline thread_spin_lock() {
 		pthread_spin_init(&spin_, 0);
 	}
@@ -97,15 +124,34 @@ public:
 	inline void unlock() {
 		pthread_spin_unlock(&spin_);
 	}
-#endif  /* __APPLE__ */
+#endif
 protected:
 #ifdef __APPLE__
 	OSSpinLock spin_;
+#elif defined(_WIN32)
+	CRITICAL_SECTION cs_;
 #else
 	pthread_spinlock_t spin_;
 #endif
 };
 
+class thread_scoped_spin_lock {
+public:
+	explicit thread_scoped_spin_lock(thread_spin_lock& lock)
+	        : lock_(lock) {
+		lock_.lock();
+	}
+
+	~thread_scoped_spin_lock() {
+		lock_.unlock();
+	}
+
+	/* TODO(sergey): Implement manual control over lock/unlock. */
+
+protected:
+	thread_spin_lock& lock_;
+};
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_THREAD_H__ */
diff --git a/intern/cycles/util/util_time.cpp b/intern/cycles/util/util_time.cpp
index 59c963cfafb..7c39aa294bf 100644
--- a/intern/cycles/util/util_time.cpp
+++ b/intern/cycles/util/util_time.cpp
@@ -16,8 +16,8 @@
 
 #include <stdlib.h>
 
-#include "util_time.h"
-#include "util_windows.h"
+#include "util/util_time.h"
+#include "util/util_windows.h"
 
 #ifdef _WIN32
 
diff --git a/intern/cycles/util/util_time.h b/intern/cycles/util/util_time.h
index 65798244111..f03aa590e9b 100644
--- a/intern/cycles/util/util_time.h
+++ b/intern/cycles/util/util_time.h
@@ -37,7 +37,7 @@ public:
 	~scoped_timer()
 	{
 		if(value_ != NULL) {
-			*value_ = time_dt() - time_start_;
+			*value_ = get_time();
 		}
 	}
 
@@ -46,6 +46,11 @@ public:
 		return time_start_;
 	}
 
+	double get_time() const
+	{
+		return time_dt() - time_start_;
+	}
+
 protected:
 	double *value_;
 	double time_start_;
diff --git a/intern/cycles/util/util_transform.cpp b/intern/cycles/util/util_transform.cpp
index 2f10540c94e..206c3da23eb 100644
--- a/intern/cycles/util/util_transform.cpp
+++ b/intern/cycles/util/util_transform.cpp
@@ -46,10 +46,11 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "util_transform.h"
+#include "util/util_projection.h"
+#include "util/util_transform.h"
 
-#include "util_boundbox.h"
-#include "util_math.h"
+#include "util/util_boundbox.h"
+#include "util/util_math.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -129,9 +130,9 @@ static bool transform_matrix4_gj_inverse(float R[][4], float M[][4])
 	return true;
 }
 
-Transform transform_inverse(const Transform& tfm)
+ProjectionTransform projection_inverse(const ProjectionTransform& tfm)
 {
-	Transform tfmR = transform_identity();
+	ProjectionTransform tfmR = projection_identity();
 	float M[4][4], R[4][4];
 
 	memcpy(R, &tfmR, sizeof(R));
@@ -145,7 +146,7 @@ Transform transform_inverse(const Transform& tfm)
 		M[2][2] += 1e-8f;
 
 		if(UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
-			return transform_identity();
+			return projection_identity();
 		}
 	}
 
@@ -154,6 +155,19 @@ Transform transform_inverse(const Transform& tfm)
 	return tfmR;
 }
 
+Transform transform_inverse(const Transform& tfm)
+{
+	ProjectionTransform projection(tfm);
+	return projection_to_transform(projection_inverse(projection));
+}
+
+Transform transform_transposed_inverse(const Transform& tfm)
+{
+	ProjectionTransform projection(tfm);
+	ProjectionTransform iprojection = projection_inverse(projection);
+	return projection_to_transform(projection_transpose(iprojection));
+}
+
 /* Motion Transform */
 
 float4 transform_to_quat(const Transform& tfm)
@@ -202,14 +216,14 @@ float4 transform_to_quat(const Transform& tfm)
 	return qt;
 }
 
-static void transform_decompose(Transform *decomp, const Transform *tfm)
+static void transform_decompose(DecomposedTransform *decomp, const Transform *tfm)
 {
 	/* extract translation */
 	decomp->y = make_float4(tfm->x.w, tfm->y.w, tfm->z.w, 0.0f);
 
 	/* extract rotation */
 	Transform M = *tfm;
-	M.x.w = 0.0f; M.y.w = 0.0f; M.z.w = 0.0f; M.w.w = 1.0f;
+	M.x.w = 0.0f; M.y.w = 0.0f; M.z.w = 0.0f;
 
 	Transform R = M;
 	float norm;
@@ -217,9 +231,9 @@ static void transform_decompose(Transform *decomp, const Transform *tfm)
 
 	do {
 		Transform Rnext;
-		Transform Rit = transform_inverse(transform_transpose(R));
+		Transform Rit = transform_transposed_inverse(R);
 
-		for(int i = 0; i < 4; i++)
+		for(int i = 0; i < 3; i++)
 			for(int j = 0; j < 4; j++)
 				Rnext[i][j] = 0.5f * (R[i][j] + Rit[i][j]);
 		
@@ -247,30 +261,18 @@ static void transform_decompose(Transform *decomp, const Transform *tfm)
 	decomp->w = make_float4(scale.y.z, scale.z.x, scale.z.y, scale.z.z);
 }
 
-void transform_motion_decompose(DecompMotionTransform *decomp, const MotionTransform *motion, const Transform *mid)
+void transform_motion_decompose(DecomposedTransform *decomp, const Transform *motion, size_t size)
 {
-	Transform pre, post;
-
-	transform_decompose(&pre, &motion->pre);
-	transform_decompose(&decomp->mid, mid);
-	transform_decompose(&post, &motion->post);
-
-	/* ensure rotation around shortest angle, negated quaternions are the same
-	 * but this means we don't have to do the check in quat_interpolate */
-	if(dot(decomp->mid.x, post.x) < 0.0f)
-		decomp->mid.x = -decomp->mid.x;
-	if(dot(pre.x, decomp->mid.x) < 0.0f)
-		pre.x = -pre.x;
-	
-	/* drop scale of pre/post */
-	pre.y.w = decomp->mid.y.w;
-	post.y.w = decomp->mid.y.w;
-
-	/* store translation/rotation part of pre/post */
-	decomp->pre_x = pre.x;
-	decomp->pre_y = pre.y;
-	decomp->post_x = post.x;
-	decomp->post_y = post.y;
+	for(size_t i = 0; i < size; i++) {
+		transform_decompose(decomp + i, motion + i);
+
+		if(i > 0) {
+			/* Negated quaternions encode the same rotation; flip this step (not the
+			 * previous, already matched one) so quat_interpolate needs no sign check. */
+			if(dot(decomp[i-1].x, decomp[i].x) < 0.0f)
+				decomp[i].x = -decomp[i].x;
+		}
+	}
 }
 
 Transform transform_from_viewplane(BoundBox2D& viewplane)
diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h
index a0695f20488..987f4dac777 100644
--- a/intern/cycles/util/util_transform.h
+++ b/intern/cycles/util/util_transform.h
@@ -21,15 +21,15 @@
 #include <string.h>
 #endif
 
-#include "util_math.h"
-#include "util_types.h"
+#include "util/util_math.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
-/* Data Types */
+/* Affine transformation, stored as 4x3 matrix. */
 
 typedef struct Transform {
-	float4 x, y, z, w; /* rows */
+	float4 x, y, z;
 
 #ifndef __KERNEL_GPU__
 	float4 operator[](int i) const { return *(&x + i); }
@@ -37,40 +37,16 @@ typedef struct Transform {
 #endif
 } Transform;
 
-/* transform decomposed in rotation/translation/scale. we use the same data
+/* Transform decomposed in rotation/translation/scale. we use the same data
  * structure as Transform, and tightly pack decomposition into it. first the
- * rotation (4), then translation (3), then 3x3 scale matrix (9).
- *
- * For the DecompMotionTransform we drop scale from pre/post. */
-
-typedef struct ccl_may_alias MotionTransform {
-	Transform pre;
-	Transform mid;
-	Transform post;
-} MotionTransform;
-
-typedef struct DecompMotionTransform {
-	Transform mid;
-	float4 pre_x, pre_y;
-	float4 post_x, post_y;
-} DecompMotionTransform;
+ * rotation (4), then translation (3), then 3x3 scale matrix (9). */
 
-typedef struct PerspectiveMotionTransform {
-	Transform pre;
-	Transform post;
-} PerspectiveMotionTransform;
+typedef struct DecomposedTransform {
+	float4 x, y, z, w;
+} DecomposedTransform;
 
 /* Functions */
 
-ccl_device_inline float3 transform_perspective(const Transform *t, const float3 a)
-{
-	float4 b = make_float4(a.x, a.y, a.z, 1.0f);
-	float3 c = make_float3(dot(t->x, b), dot(t->y, b), dot(t->z, b));
-	float w = dot(t->w, b);
-
-	return (w != 0.0f)? c/w: make_float3(0.0f, 0.0f, 0.0f);
-}
-
 ccl_device_inline float3 transform_point(const Transform *t, const float3 a)
 {
 	/* TODO(sergey): Disabled for now, causes crashes in certain cases. */
@@ -81,7 +57,7 @@ ccl_device_inline float3 transform_point(const Transform *t, const float3 a)
 	x = _mm_loadu_ps(&t->x.x);
 	y = _mm_loadu_ps(&t->y.x);
 	z = _mm_loadu_ps(&t->z.x);
-	w = _mm_loadu_ps(&t->w.x);
+	w = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
 
 	_MM_TRANSPOSE4_PS(x, y, z, w);
 
@@ -137,29 +113,15 @@ ccl_device_inline float3 transform_direction_transposed(const Transform *t, cons
 	return make_float3(dot(x, a), dot(y, a), dot(z, a));
 }
 
-ccl_device_inline Transform transform_transpose(const Transform a)
-{
-	Transform t;
-
-	t.x.x = a.x.x; t.x.y = a.y.x; t.x.z = a.z.x; t.x.w = a.w.x;
-	t.y.x = a.x.y; t.y.y = a.y.y; t.y.z = a.z.y; t.y.w = a.w.y;
-	t.z.x = a.x.z; t.z.y = a.y.z; t.z.z = a.z.z; t.z.w = a.w.z;
-	t.w.x = a.x.w; t.w.y = a.y.w; t.w.z = a.z.w; t.w.w = a.w.w;
-
-	return t;
-}
-
 ccl_device_inline Transform make_transform(float a, float b, float c, float d,
                                            float e, float f, float g, float h,
-                                           float i, float j, float k, float l,
-                                           float m, float n, float o, float p)
+                                           float i, float j, float k, float l)
 {
 	Transform t;
 
 	t.x.x = a; t.x.y = b; t.x.z = c; t.x.w = d;
 	t.y.x = e; t.y.y = f; t.y.z = g; t.y.w = h;
 	t.z.x = i; t.z.y = j; t.z.z = k; t.z.w = l;
-	t.w.x = m; t.w.y = n; t.w.z = o; t.w.w = p;
 
 	return t;
 }
@@ -173,21 +135,22 @@ ccl_device_inline Transform make_transform_frame(float3 N)
 	const float3 dy = normalize(cross(N, dx));
 	return make_transform(dx.x, dx.y, dx.z, 0.0f,
 	                      dy.x, dy.y, dy.z, 0.0f,
-	                      N.x , N.y,  N.z,  0.0f,
-	                      0.0f, 0.0f, 0.0f, 1.0f);
+	                      N.x , N.y,  N.z,  0.0f);
 }
 
 #ifndef __KERNEL_GPU__
 
 ccl_device_inline Transform operator*(const Transform a, const Transform b)
 {
-	Transform c = transform_transpose(b);
-	Transform t;
+	float4 c_x = make_float4(b.x.x, b.y.x, b.z.x, 0.0f);
+	float4 c_y = make_float4(b.x.y, b.y.y, b.z.y, 0.0f);
+	float4 c_z = make_float4(b.x.z, b.y.z, b.z.z, 0.0f);
+	float4 c_w = make_float4(b.x.w, b.y.w, b.z.w, 1.0f);
 
-	t.x = make_float4(dot(a.x, c.x), dot(a.x, c.y), dot(a.x, c.z), dot(a.x, c.w));
-	t.y = make_float4(dot(a.y, c.x), dot(a.y, c.y), dot(a.y, c.z), dot(a.y, c.w));
-	t.z = make_float4(dot(a.z, c.x), dot(a.z, c.y), dot(a.z, c.z), dot(a.z, c.w));
-	t.w = make_float4(dot(a.w, c.x), dot(a.w, c.y), dot(a.w, c.z), dot(a.w, c.w));
+	Transform t;
+	t.x = make_float4(dot(a.x, c_x), dot(a.x, c_y), dot(a.x, c_z), dot(a.x, c_w));
+	t.y = make_float4(dot(a.y, c_x), dot(a.y, c_y), dot(a.y, c_z), dot(a.y, c_w));
+	t.z = make_float4(dot(a.z, c_x), dot(a.z, c_y), dot(a.z, c_z), dot(a.z, c_w));
 
 	return t;
 }
@@ -197,7 +160,6 @@ ccl_device_inline void print_transform(const char *label, const Transform& t)
 	print_float4(label, t.x);
 	print_float4(label, t.y);
 	print_float4(label, t.z);
-	print_float4(label, t.w);
 	printf("\n");
 }
 
@@ -206,8 +168,7 @@ ccl_device_inline Transform transform_translate(float3 t)
 	return make_transform(
 		1, 0, 0, t.x,
 		0, 1, 0, t.y,
-		0, 0, 1, t.z,
-		0, 0, 0, 1);
+		0, 0, 1, t.z);
 }
 
 ccl_device_inline Transform transform_translate(float x, float y, float z)
@@ -220,8 +181,7 @@ ccl_device_inline Transform transform_scale(float3 s)
 	return make_transform(
 		s.x, 0, 0, 0,
 		0, s.y, 0, 0,
-		0, 0, s.z, 0,
-		0, 0, 0, 1);
+		0, 0, s.z, 0);
 }
 
 ccl_device_inline Transform transform_scale(float x, float y, float z)
@@ -229,21 +189,6 @@ ccl_device_inline Transform transform_scale(float x, float y, float z)
 	return transform_scale(make_float3(x, y, z));
 }
 
-ccl_device_inline Transform transform_perspective(float fov, float n, float f)
-{
-	Transform persp = make_transform(
-		1, 0, 0, 0,
-		0, 1, 0, 0,
-		0, 0, f / (f - n), -f*n / (f - n),
-		0, 0, 1, 0);
-
-	float inv_angle = 1.0f/tanf(0.5f*fov);
-
-	Transform scale = transform_scale(inv_angle, inv_angle, 1);
-
-	return scale * persp;
-}
-
 ccl_device_inline Transform transform_rotate(float angle, float3 axis)
 {
 	float s = sinf(angle);
@@ -266,9 +211,7 @@ ccl_device_inline Transform transform_rotate(float angle, float3 axis)
 		axis.z*axis.x*t - s*axis.y,
 		axis.z*axis.y*t + s*axis.x,
 		axis.z*axis.z*t + c,
-		0.0f,
-
-		0.0f, 0.0f, 0.0f, 1.0f);
+		0.0f);
 }
 
 /* Euler is assumed to be in XYZ order. */
@@ -280,12 +223,6 @@ ccl_device_inline Transform transform_euler(float3 euler)
 		transform_rotate(euler.x, make_float3(1.0f, 0.0f, 0.0f));
 }
 
-ccl_device_inline Transform transform_orthographic(float znear, float zfar)
-{
-	return transform_scale(1.0f, 1.0f, 1.0f / (zfar-znear)) *
-		transform_translate(0.0f, 0.0f, -znear);
-}
-
 ccl_device_inline Transform transform_identity()
 {
 	return transform_scale(1.0f, 1.0f, 1.0f);
@@ -314,20 +251,20 @@ ccl_device_inline void transform_set_column(Transform *t, int column, float3 val
 }
 
 Transform transform_inverse(const Transform& a);
+Transform transform_transposed_inverse(const Transform& a);
 
 ccl_device_inline bool transform_uniform_scale(const Transform& tfm, float& scale)
 {
 	/* the epsilon here is quite arbitrary, but this function is only used for
-	 * surface area and bump, where we except it to not be so sensitive */
-	Transform ttfm = transform_transpose(tfm);
+	 * surface area and bump, where we expect it to not be so sensitive */
 	float eps = 1e-6f;
 	
 	float sx = len_squared(float4_to_float3(tfm.x));
 	float sy = len_squared(float4_to_float3(tfm.y));
 	float sz = len_squared(float4_to_float3(tfm.z));
-	float stx = len_squared(float4_to_float3(ttfm.x));
-	float sty = len_squared(float4_to_float3(ttfm.y));
-	float stz = len_squared(float4_to_float3(ttfm.z));
+	float stx = len_squared(transform_get_column(&tfm, 0));
+	float sty = len_squared(transform_get_column(&tfm, 1));
+	float stz = len_squared(transform_get_column(&tfm, 2));
 
 	if(fabsf(sx - sy) < eps && fabsf(sx - sz) < eps &&
 	   fabsf(sx - stx) < eps && fabsf(sx - sty) < eps &&
@@ -365,7 +302,6 @@ ccl_device_inline Transform transform_empty()
 	return make_transform(
 		0, 0, 0, 0,
 		0, 0, 0, 0,
-		0, 0, 0, 0,
 		0, 0, 0, 0);
 }
 
@@ -422,12 +358,11 @@ ccl_device_inline Transform transform_quick_inverse(Transform M)
 	R.x = make_float4(Rx.x, Rx.y, Rx.z, dot(Rx, T));
 	R.y = make_float4(Ry.x, Ry.y, Ry.z, dot(Ry, T));
 	R.z = make_float4(Rz.x, Rz.y, Rz.z, dot(Rz, T));
-	R.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
 
 	return R;
 }
 
-ccl_device_inline void transform_compose(Transform *tfm, const Transform *decomp)
+ccl_device_inline void transform_compose(Transform *tfm, const DecomposedTransform *decomp)
 {
 	/* rotation */
 	float q0, q1, q2, q3, qda, qdb, qdc, qaa, qab, qac, qbb, qbc, qcc;
@@ -460,59 +395,30 @@ ccl_device_inline void transform_compose(Transform *tfm, const Transform *decomp
 	tfm->x = make_float4(dot(rotation_x, scale_x), dot(rotation_x, scale_y), dot(rotation_x, scale_z), decomp->y.x);
 	tfm->y = make_float4(dot(rotation_y, scale_x), dot(rotation_y, scale_y), dot(rotation_y, scale_z), decomp->y.y);
 	tfm->z = make_float4(dot(rotation_z, scale_x), dot(rotation_z, scale_y), dot(rotation_z, scale_z), decomp->y.z);
-	tfm->w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
 }
 
-/* Disabled for now, need arc-length parametrization for constant speed motion.
- * #define CURVED_MOTION_INTERPOLATE */
-
-ccl_device void transform_motion_interpolate(Transform *tfm, const DecompMotionTransform *motion, float t)
+/* Interpolate from array of decomposed transforms. Expects numsteps >= 2 and time >= 0, as motion[] is indexed unchecked. */
+ccl_device void transform_motion_array_interpolate(Transform *tfm,
+                                                   const ccl_global DecomposedTransform *motion,
+                                                   uint numsteps,
+                                                   float time)
 {
-	/* possible optimization: is it worth it adding a check to skip scaling?
-	 * it's probably quite uncommon to have scaling objects. or can we skip
-	 * just shearing perhaps? */
-	Transform decomp;
-
-#ifdef CURVED_MOTION_INTERPOLATE
-	/* 3 point bezier curve interpolation for position */
-	float3 Ppre = float4_to_float3(motion->pre_y);
-	float3 Pmid = float4_to_float3(motion->mid.y);
-	float3 Ppost = float4_to_float3(motion->post_y);
-
-	float3 Pcontrol = 2.0f*Pmid - 0.5f*(Ppre + Ppost);
-	float3 P = Ppre*t*t + Pcontrol*2.0f*t*(1.0f - t) + Ppost*(1.0f - t)*(1.0f - t);
-
-	decomp.y.x = P.x;
-	decomp.y.y = P.y;
-	decomp.y.z = P.z;
-#endif
-
-	/* linear interpolation for rotation and scale */
-	if(t < 0.5f) {
-		t *= 2.0f;
-
-		decomp.x = quat_interpolate(motion->pre_x, motion->mid.x, t);
-#ifdef CURVED_MOTION_INTERPOLATE
-		decomp.y.w = (1.0f - t)*motion->pre_y.w + t*motion->mid.y.w;
-#else
-		decomp.y = (1.0f - t)*motion->pre_y + t*motion->mid.y;
-#endif
-	}
-	else {
-		t = (t - 0.5f)*2.0f;
-
-		decomp.x = quat_interpolate(motion->mid.x, motion->post_x, t);
-#ifdef CURVED_MOTION_INTERPOLATE
-		decomp.y.w = (1.0f - t)*motion->mid.y.w + t*motion->post_y.w;
-#else
-		decomp.y = (1.0f - t)*motion->mid.y + t*motion->post_y;
-#endif
-	}
-
-	decomp.z = motion->mid.z;
-	decomp.w = motion->mid.w;
-
-	/* compose rotation, translation, scale into matrix */
+	/* Figure out which steps we need to interpolate. */
+	int maxstep = numsteps-1;
+	int step = min((int)(time*maxstep), maxstep-1);
+	float t = time*maxstep - step;
+
+	const ccl_global DecomposedTransform *a = motion + step;
+	const ccl_global DecomposedTransform *b = motion + step + 1;
+
+	/* Interpolate rotation, translation and scale. */
+	DecomposedTransform decomp;
+	decomp.x = quat_interpolate(a->x, b->x, t);
+	decomp.y = (1.0f - t)*a->y + t*b->y;
+	decomp.z = (1.0f - t)*a->z + t*b->z;
+	decomp.w = (1.0f - t)*a->w + t*b->w;
+
+	/* Compose rotation, translation, scale into matrix. */
 	transform_compose(tfm, &decomp);
 }
 
@@ -520,13 +426,13 @@ ccl_device void transform_motion_interpolate(Transform *tfm, const DecompMotionT
 
 class BoundBox2D;
 
-ccl_device_inline bool operator==(const MotionTransform& A, const MotionTransform& B)
+ccl_device_inline bool operator==(const DecomposedTransform& A, const DecomposedTransform& B)
 {
-	return (A.pre == B.pre && A.post == B.post);
+	return memcmp(&A, &B, sizeof(DecomposedTransform)) == 0;
 }
 
 float4 transform_to_quat(const Transform& tfm);
-void transform_motion_decompose(DecompMotionTransform *decomp, const MotionTransform *motion, const Transform *mid);
+void transform_motion_decompose(DecomposedTransform *decomp, const Transform *motion, size_t size);
 Transform transform_from_viewplane(BoundBox2D& viewplane);
 
 #endif
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index a000fae4bd6..84206a7ba5a 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -18,72 +18,20 @@
 #define __UTIL_TYPES_H__
 
 #ifndef __KERNEL_OPENCL__
-
-#include <stdlib.h>
-
-#endif
-
-/* Bitness */
-
-#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
-#define __KERNEL_64_BIT__
-#endif
-
-/* Qualifiers for kernel code shared by CPU and GPU */
-
-#ifndef __KERNEL_GPU__
-
-#define ccl_device static inline
-#define ccl_device_noinline static
-#define ccl_global
-#define ccl_constant
-#define ccl_restrict __restrict
-#define __KERNEL_WITH_SSE_ALIGN__
-
-#if defined(_WIN32) && !defined(FREE_WINDOWS)
-#define ccl_device_inline static __forceinline
-#define ccl_device_forceinline static __forceinline
-#define ccl_align(...) __declspec(align(__VA_ARGS__))
-#ifdef __KERNEL_64_BIT__
-#define ccl_try_align(...) __declspec(align(__VA_ARGS__))
-#else
-#undef __KERNEL_WITH_SSE_ALIGN__
-#define ccl_try_align(...) /* not support for function arguments (error C2719) */
-#endif
-#define ccl_may_alias
-#define ccl_always_inline __forceinline
-#define ccl_maybe_unused
-
-#else
-
-#define ccl_device_inline static inline __attribute__((always_inline))
-#define ccl_device_forceinline static inline __attribute__((always_inline))
-#define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
-#ifndef FREE_WINDOWS64
-#define __forceinline inline __attribute__((always_inline))
-#endif
-#define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
-#define ccl_may_alias __attribute__((__may_alias__))
-#define ccl_always_inline __attribute__((always_inline))
-#define ccl_maybe_unused __attribute__((used))
-
-#endif
-
+#  include <stdlib.h>
 #endif
 
 /* Standard Integer Types */
 
-#ifndef __KERNEL_GPU__
-
-/* int8_t, uint16_t, and friends */
-#ifndef _WIN32
-#include <stdint.h>
+#if !defined(__KERNEL_GPU__) && !defined(_WIN32)
+#  include <stdint.h>
 #endif
 
-/* SIMD Types */
-
-#include "util_optimization.h"
+#include "util/util_defines.h"
 
+#ifndef __KERNEL_GPU__
+#  include "util/util_optimization.h"
+#  include "util/util_simd.h"
 #endif
 
 CCL_NAMESPACE_BEGIN
@@ -97,18 +45,19 @@ CCL_NAMESPACE_BEGIN
 /* Shorter Unsigned Names */
 
 #ifndef __KERNEL_OPENCL__
-
 typedef unsigned char uchar;
 typedef unsigned int uint;
-
+typedef unsigned short ushort;
 #endif
 
-#ifndef __KERNEL_GPU__
-
 /* Fixed Bits Types */
 
-#ifdef _WIN32
+#ifdef __KERNEL_OPENCL__
+typedef ulong uint64_t;
+#endif
 
+#ifndef __KERNEL_GPU__
+#  ifdef _WIN32
 typedef signed char int8_t;
 typedef unsigned char uint8_t;
 
@@ -120,440 +69,85 @@ typedef unsigned int uint32_t;
 
 typedef long long int64_t;
 typedef unsigned long long uint64_t;
-
-#ifdef __KERNEL_64_BIT__
+#    ifdef __KERNEL_64_BIT__
 typedef int64_t ssize_t;
-#else
+#    else
 typedef int32_t ssize_t;
-#endif
-
-#endif
+#    endif
+#  endif  /* _WIN32 */
 
 /* Generic Memory Pointer */
 
 typedef uint64_t device_ptr;
+#endif  /* __KERNEL_GPU__ */
 
-/* Vector Types */
-
-struct uchar2 {
-	uchar x, y;
-
-	__forceinline uchar operator[](int i) const { return *(&x + i); }
-	__forceinline uchar& operator[](int i) { return *(&x + i); }
-};
-
-struct uchar3 {
-	uchar x, y, z;
-
-	__forceinline uchar operator[](int i) const { return *(&x + i); }
-	__forceinline uchar& operator[](int i) { return *(&x + i); }
-};
-
-struct uchar4 {
-	uchar x, y, z, w;
-
-	__forceinline uchar operator[](int i) const { return *(&x + i); }
-	__forceinline uchar& operator[](int i) { return *(&x + i); }
-};
-
-struct int2 {
-	int x, y;
-
-	__forceinline int operator[](int i) const { return *(&x + i); }
-	__forceinline int& operator[](int i) { return *(&x + i); }
-};
-
-struct ccl_try_align(16) int3 {
-#ifdef __KERNEL_SSE__
-	union {
-		__m128i m128;
-		struct { int x, y, z, w; };
-	};
-
-	__forceinline int3() {}
-	__forceinline int3(const __m128i a) : m128(a) {}
-	__forceinline operator const __m128i&(void) const { return m128; }
-	__forceinline operator __m128i&(void) { return m128; }
-
-	int3(const int3& a) { m128 = a.m128; }
-	int3& operator =(const int3& a) { m128 = a.m128; return *this; }
-#else
-	int x, y, z, w;
-#endif
-
-	__forceinline int operator[](int i) const { return *(&x + i); }
-	__forceinline int& operator[](int i) { return *(&x + i); }
-};
-
-struct ccl_try_align(16) int4 {
-#ifdef __KERNEL_SSE__
-	union {
-		__m128i m128;
-		struct { int x, y, z, w; };
-	};
-
-	__forceinline int4() {}
-	__forceinline int4(const __m128i a) : m128(a) {}
-	__forceinline operator const __m128i&(void) const { return m128; }
-	__forceinline operator __m128i&(void) { return m128; }
-
-	int4(const int4& a) : m128(a.m128) {}
-	int4& operator=(const int4& a) { m128 = a.m128; return *this; }
-#else
-	int x, y, z, w;
-#endif
-
-	__forceinline int operator[](int i) const { return *(&x + i); }
-	__forceinline int& operator[](int i) { return *(&x + i); }
-};
-
-struct uint2 {
-	uint x, y;
-
-	__forceinline uint operator[](uint i) const { return *(&x + i); }
-	__forceinline uint& operator[](uint i) { return *(&x + i); }
-};
-
-struct uint3 {
-	uint x, y, z;
-
-	__forceinline uint operator[](uint i) const { return *(&x + i); }
-	__forceinline uint& operator[](uint i) { return *(&x + i); }
-};
-
-struct uint4 {
-	uint x, y, z, w;
-
-	__forceinline uint operator[](uint i) const { return *(&x + i); }
-	__forceinline uint& operator[](uint i) { return *(&x + i); }
-};
-
-struct float2 {
-	float x, y;
-
-	__forceinline float operator[](int i) const { return *(&x + i); }
-	__forceinline float& operator[](int i) { return *(&x + i); }
-};
-
-struct ccl_try_align(16) float3 {
-#ifdef __KERNEL_SSE__
-	union {
-		__m128 m128;
-		struct { float x, y, z, w; };
-	};
-
-	__forceinline float3() {}
-	__forceinline float3(const __m128& a) : m128(a) {}
-	__forceinline operator const __m128&(void) const { return m128; }
-	__forceinline operator __m128&(void) { return m128; }
-
-	__forceinline float3(const float3& a) : m128(a.m128) {}
-	__forceinline float3& operator =(const float3& a) { m128 = a.m128; return *this; }
-#else
-	float x, y, z, w;
-#endif
-
-	__forceinline float operator[](int i) const { return *(&x + i); }
-	__forceinline float& operator[](int i) { return *(&x + i); }
-};
-
-struct ccl_try_align(16) float4 {
-#ifdef __KERNEL_SSE__
-	union {
-		__m128 m128;
-		struct { float x, y, z, w; };
-	};
-
-	__forceinline float4() {}
-	__forceinline float4(const __m128 a) : m128(a) {}
-	__forceinline operator const __m128&(void) const { return m128; }
-	__forceinline operator __m128&(void) { return m128; }
-
-	__forceinline float4(const float4& a) : m128(a.m128) {}
-	__forceinline float4& operator =(const float4& a) { m128 = a.m128; return *this; }
-
-#else
-	float x, y, z, w;
-#endif
-
-	__forceinline float operator[](int i) const { return *(&x + i); }
-	__forceinline float& operator[](int i) { return *(&x + i); }
-};
-
-template<typename T>
-class vector3
-{
-public:
-	T x, y, z;
-
-	ccl_always_inline vector3() {}
-	ccl_always_inline vector3(const T& a)
-	  : x(a), y(a), z(a) {}
-	ccl_always_inline vector3(const T& x, const T& y, const T& z)
-	  : x(x), y(y), z(z) {}
-};
-
-#endif
-
-#ifndef __KERNEL_GPU__
-
-/* Vector Type Constructors
- * 
- * OpenCL does not support C++ class, so we use these instead. */
-
-ccl_device_inline uchar2 make_uchar2(uchar x, uchar y)
-{
-	uchar2 a = {x, y};
-	return a;
-}
-
-ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z)
-{
-	uchar3 a = {x, y, z};
-	return a;
-}
-
-ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w)
-{
-	uchar4 a = {x, y, z, w};
-	return a;
-}
-
-ccl_device_inline int2 make_int2(int x, int y)
-{
-	int2 a = {x, y};
-	return a;
-}
-
-ccl_device_inline int3 make_int3(int x, int y, int z)
-{
-#ifdef __KERNEL_SSE__
-	int3 a;
-	a.m128 = _mm_set_epi32(0, z, y, x);
-#else
-	int3 a = {x, y, z, 0};
-#endif
-
-	return a;
-}
-
-ccl_device_inline int4 make_int4(int x, int y, int z, int w)
-{
-#ifdef __KERNEL_SSE__
-	int4 a;
-	a.m128 = _mm_set_epi32(w, z, y, x);
-#else
-	int4 a = {x, y, z, w};
-#endif
-
-	return a;
-}
-
-ccl_device_inline uint2 make_uint2(uint x, uint y)
-{
-	uint2 a = {x, y};
-	return a;
-}
-
-ccl_device_inline uint3 make_uint3(uint x, uint y, uint z)
-{
-	uint3 a = {x, y, z};
-	return a;
-}
-
-ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w)
-{
-	uint4 a = {x, y, z, w};
-	return a;
-}
-
-ccl_device_inline float2 make_float2(float x, float y)
-{
-	float2 a = {x, y};
-	return a;
-}
-
-ccl_device_inline float3 make_float3(float x, float y, float z)
-{
-#ifdef __KERNEL_SSE__
-	float3 a;
-	a.m128 = _mm_set_ps(0.0f, z, y, x);
-#else
-	float3 a = {x, y, z, 0.0f};
-#endif
-
-	return a;
-}
-
-ccl_device_inline float4 make_float4(float x, float y, float z, float w)
-{
-#ifdef __KERNEL_SSE__
-	float4 a;
-	a.m128 = _mm_set_ps(w, z, y, x);
-#else
-	float4 a = {x, y, z, w};
-#endif
-
-	return a;
-}
-
-ccl_device_inline int align_up(int offset, int alignment)
+ccl_device_inline size_t align_up(size_t offset, size_t alignment)  /* Rounds offset up to a multiple of alignment; the mask arithmetic is only correct when alignment is a power of two. */
 {
 	return (offset + alignment - 1) & ~(alignment - 1);
 }
 
-ccl_device_inline int3 make_int3(int i)
+ccl_device_inline size_t divide_up(size_t x, size_t y)  /* Integer division of x by y, rounded up. */
 {
-#ifdef __KERNEL_SSE__
-	int3 a;
-	a.m128 = _mm_set1_epi32(i);
-#else
-	int3 a = {i, i, i, i};
-#endif
-
-	return a;
+	return (x + y - 1) / y;  /* y is the divisor and is not checked for zero here. */
 }
 
-ccl_device_inline int4 make_int4(int i)
+ccl_device_inline size_t round_up(size_t x, size_t multiple)  /* Smallest multiple of `multiple` that is >= x; works for any non-zero multiple. */
 {
-#ifdef __KERNEL_SSE__
-	int4 a;
-	a.m128 = _mm_set1_epi32(i);
-#else
-	int4 a = {i, i, i, i};
-#endif
-
-	return a;
+	return ((x + multiple - 1) / multiple) * multiple;  /* Ceiling division, then scale back up. */
 }
 
-ccl_device_inline float3 make_float3(float f)
+ccl_device_inline size_t round_down(size_t x, size_t multiple)  /* Largest multiple of `multiple` that is <= x. */
 {
-#ifdef __KERNEL_SSE__
-	float3 a;
-	a.m128 = _mm_set1_ps(f);
-#else
-	float3 a = {f, f, f, f};
-#endif
-
-	return a;
+	return (x / multiple) * multiple;  /* Truncating division drops the remainder. */
 }
 
-ccl_device_inline float4 make_float4(float f)
-{
-#ifdef __KERNEL_SSE__
-	float4 a;
-	a.m128 = _mm_set1_ps(f);
-#else
-	float4 a = {f, f, f, f};
-#endif
-
-	return a;
-}
+CCL_NAMESPACE_END
 
-ccl_device_inline float4 make_float4(const int4& i)
-{
-#ifdef __KERNEL_SSE__
-	float4 a;
-	a.m128 = _mm_cvtepi32_ps(i.m128);
-#else
-	float4 a = {(float)i.x, (float)i.y, (float)i.z, (float)i.w};
-#endif
+/* Vectorized types declaration. */
+#include "util/util_types_uchar2.h"
+#include "util/util_types_uchar3.h"
+#include "util/util_types_uchar4.h"
 
-	return a;
-}
+#include "util/util_types_int2.h"
+#include "util/util_types_int3.h"
+#include "util/util_types_int4.h"
 
-ccl_device_inline int4 make_int4(const float3& f)
-{
-#ifdef __KERNEL_SSE__
-	int4 a;
-	a.m128 = _mm_cvtps_epi32(f.m128);
-#else
-	int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w};
-#endif
+#include "util/util_types_uint2.h"
+#include "util/util_types_uint3.h"
+#include "util/util_types_uint4.h"
 
-	return a;
-}
+#include "util/util_types_float2.h"
+#include "util/util_types_float3.h"
+#include "util/util_types_float4.h"
 
-#endif
+#include "util/util_types_vector3.h"
 
-/* Interpolation types for textures
- * cuda also use texture space to store other objects */
-enum InterpolationType {
-	INTERPOLATION_NONE = -1,
-	INTERPOLATION_LINEAR = 0,
-	INTERPOLATION_CLOSEST = 1,
-	INTERPOLATION_CUBIC = 2,
-	INTERPOLATION_SMART = 3,
+/* Vectorized types implementation. */
+#include "util/util_types_uchar2_impl.h"
+#include "util/util_types_uchar3_impl.h"
+#include "util/util_types_uchar4_impl.h"
 
-	INTERPOLATION_NUM_TYPES,
-};
+#include "util/util_types_int2_impl.h"
+#include "util/util_types_int3_impl.h"
+#include "util/util_types_int4_impl.h"
 
-/* Extension types for textures.
- *
- * Defines how the image is extrapolated past its original bounds.
- */
-enum ExtensionType {
-	/* Cause the image to repeat horizontally and vertically. */
-	EXTENSION_REPEAT = 0,
-	/* Extend by repeating edge pixels of the image. */
-	EXTENSION_EXTEND = 1,
-	/* Clip to image size and set exterior pixels as transparent. */
-	EXTENSION_CLIP = 2,
-
-	EXTENSION_NUM_TYPES,
-};
-
-/* macros */
-
-/* hints for branch prediction, only use in code that runs a _lot_ */
-#if defined(__GNUC__) && defined(__KERNEL_CPU__)
-#  define LIKELY(x)       __builtin_expect(!!(x), 1)
-#  define UNLIKELY(x)     __builtin_expect(!!(x), 0)
-#else
-#  define LIKELY(x)       (x)
-#  define UNLIKELY(x)     (x)
-#endif
+#include "util/util_types_uint2_impl.h"
+#include "util/util_types_uint3_impl.h"
+#include "util/util_types_uint4_impl.h"
 
-#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800))
-#  define HAS_CPP11_FEATURES
-#endif
+#include "util/util_types_float2_impl.h"
+#include "util/util_types_float3_impl.h"
+#include "util/util_types_float4_impl.h"
 
-#if defined(__GNUC__) || defined(__clang__)
-#  if defined(HAS_CPP11_FEATURES)
-/* Some magic to be sure we don't have reference in the type. */
-template<typename T> static inline T decltype_helper(T x) { return x; }
-#    define TYPEOF(x) decltype(decltype_helper(x))
-#  else
-#    define TYPEOF(x) typeof(x)
-#  endif
-#endif
+#include "util/util_types_vector3_impl.h"
 
-/* Causes warning:
- * incompatible types when assigning to type 'Foo' from type 'Bar'
- * ... the compiler optimizes away the temp var */
-#ifdef __GNUC__
-#define CHECK_TYPE(var, type)  {  \
-	TYPEOF(var) *__tmp;         \
-	__tmp = (type *)NULL;         \
-	(void)__tmp;                  \
-} (void)0
-
-#define CHECK_TYPE_PAIR(var_a, var_b)  {  \
-	TYPEOF(var_a) *__tmp;                 \
-	__tmp = (typeof(var_b) *)NULL;        \
-	(void)__tmp;                          \
-} (void)0
-#else
-#  define CHECK_TYPE(var, type)
-#  define CHECK_TYPE_PAIR(var_a, var_b)
+/* SSE types. */
+#ifndef __KERNEL_GPU__
+#  include "util/util_sseb.h"
+#  include "util/util_ssei.h"
+#  include "util/util_ssef.h"
+#  include "util/util_avxf.h"
 #endif
 
-/* can be used in simple macros */
-#define CHECK_TYPE_INLINE(val, type) \
-	((void)(((type)0) != (val)))
-
-
-CCL_NAMESPACE_END
-
 #endif /* __UTIL_TYPES_H__ */
 
diff --git a/intern/cycles/util/util_types_float2.h b/intern/cycles/util/util_types_float2.h
new file mode 100644
index 00000000000..ec7a1f717a1
--- /dev/null
+++ b/intern/cycles/util/util_types_float2.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_FLOAT2_H__
+#define __UTIL_TYPES_FLOAT2_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct float2 {  /* Two-component float vector; only declared when __KERNEL_GPU__ is not defined. */
+	float x, y;
+
+	__forceinline float operator[](int i) const;  /* Component read by index; defined in util_types_float2_impl.h. */
+	__forceinline float& operator[](int i);  /* Component reference by index; defined in util_types_float2_impl.h. */
+};
+
+ccl_device_inline float2 make_float2(float x, float y);  /* Builds a float2 from its two components. */
+ccl_device_inline void print_float2(const char *label, const float2& a);  /* Prints label followed by both components. */
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_FLOAT2_H__ */
diff --git a/intern/cycles/util/util_types_float2_impl.h b/intern/cycles/util/util_types_float2_impl.h
new file mode 100644
index 00000000000..782dda195eb
--- /dev/null
+++ b/intern/cycles/util/util_types_float2_impl.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_FLOAT2_IMPL_H__
+#define __UTIL_TYPES_FLOAT2_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+#ifndef __KERNEL_GPU__
+#  include <cstdio>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+__forceinline float float2::operator[](int i) const  /* Component i by value: 0 reads x, 1 reads y. */
+{
+	util_assert(i >= 0);  /* The index range is checked only through util_assert. */
+	util_assert(i < 2);
+	return *(&x + i);  /* Pointer offset from x; relies on y being stored right after x. */
+}
+
+__forceinline float& float2::operator[](int i)  /* Writable reference to component i: 0 is x, 1 is y. */
+{
+	util_assert(i >= 0);  /* The index range is checked only through util_assert. */
+	util_assert(i < 2);
+	return *(&x + i);  /* Pointer offset from x; relies on y being stored right after x. */
+}
+
+ccl_device_inline float2 make_float2(float x, float y)  /* Returns a float2 holding x and y. */
+{
+	float2 a = {x, y};  /* Aggregate initialization in member order. */
+	return a;
+}
+
+ccl_device_inline void print_float2(const char *label, const float2& a)  /* Prints "label: x y" and a newline with printf. */
+{
+	printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y);  /* Eight decimals per component; values are passed as double for %f. */
+}
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_FLOAT2_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_float3.h b/intern/cycles/util/util_types_float3.h
new file mode 100644
index 00000000000..28146ad04f7
--- /dev/null
+++ b/intern/cycles/util/util_types_float3.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_FLOAT3_H__
+#define __UTIL_TYPES_FLOAT3_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct ccl_try_align(16) float3 {  /* Three-component float vector that is stored with a fourth slot, w. */
+#ifdef __KERNEL_SSE__
+	union {  /* m128 and the named x, y, z, w fields share the same storage. */
+		__m128 m128;
+		struct { float x, y, z, w; };
+	};
+
+	__forceinline float3();
+	__forceinline float3(const float3& a);
+	__forceinline explicit float3(const __m128& a);  /* explicit: a raw __m128 is never converted to float3 implicitly. */
+
+	__forceinline operator const __m128&(void) const;  /* Implicit access to the underlying __m128. */
+	__forceinline operator __m128&(void);
+
+	__forceinline float3& operator =(const float3& a);
+#else  /* __KERNEL_SSE__ */
+	float x, y, z, w;  /* Same four fields as the SSE layout, without the union. */
+#endif  /* __KERNEL_SSE__ */
+
+	__forceinline float operator[](int i) const;  /* Indexed access; definitions are in util_types_float3_impl.h. */
+	__forceinline float& operator[](int i);
+};
+
+ccl_device_inline float3 make_float3(float f);  /* Every slot, w included, is set to f. */
+ccl_device_inline float3 make_float3(float x, float y, float z);  /* Component-wise constructor; w is set to zero. */
+ccl_device_inline void print_float3(const char *label, const float3& a);  /* Prints label followed by x, y and z. */
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_FLOAT3_H__ */
diff --git a/intern/cycles/util/util_types_float3_impl.h b/intern/cycles/util/util_types_float3_impl.h
new file mode 100644
index 00000000000..45f61767d3f
--- /dev/null
+++ b/intern/cycles/util/util_types_float3_impl.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_FLOAT3_IMPL_H__
+#define __UTIL_TYPES_FLOAT3_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+#ifndef __KERNEL_GPU__
+#  include <cstdio>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+#ifdef __KERNEL_SSE__
+__forceinline float3::float3()  /* Empty body: no member is initialized here. */
+{
+}
+
+__forceinline float3::float3(const float3& a)  /* Copies the whole m128, w slot included. */
+        : m128(a.m128)
+{
+}
+
+__forceinline float3::float3(const __m128& a)  /* Wraps an existing __m128 value. */
+        : m128(a)
+{
+}
+
+__forceinline float3::operator const __m128&(void) const  /* Read-only access to the underlying m128. */
+{
+	return m128;
+}
+
+__forceinline float3::operator __m128&(void)  /* Mutable access to the underlying m128. */
+{
+	return m128;
+}
+
+__forceinline float3& float3::operator =(const float3& a)  /* Assigns all four slots at once through m128. */
+{
+	m128 = a.m128;
+	return *this;
+}
+#endif  /* __KERNEL_SSE__ */
+
+__forceinline float float3::operator[](int i) const  /* Component i by value: 0, 1, 2 read x, y, z. */
+{
+	util_assert(i >= 0);  /* The index range is checked only through util_assert. */
+	util_assert(i < 3);  /* The w slot is outside the asserted range. */
+	return *(&x + i);  /* Pointer offset from x; relies on the fields being stored consecutively. */
+}
+
+__forceinline float& float3::operator[](int i)  /* Writable reference to component i: 0, 1, 2 are x, y, z. */
+{
+	util_assert(i >= 0);  /* The index range is checked only through util_assert. */
+	util_assert(i < 3);  /* The w slot is outside the asserted range. */
+	return *(&x + i);  /* Pointer offset from x; relies on the fields being stored consecutively. */
+}
+
+ccl_device_inline float3 make_float3(float f)  /* Returns a float3 with f in every slot, w included. */
+{
+#ifdef __KERNEL_SSE__
+	float3 a(_mm_set1_ps(f));  /* Broadcasts f to all four lanes. */
+#else
+	float3 a = {f, f, f, f};  /* Scalar layout: the same value goes into x, y, z and w. */
+#endif
+	return a;
+}
+
+ccl_device_inline float3 make_float3(float x, float y, float z)  /* Returns a float3 with the given components and w set to zero. */
+{
+#ifdef __KERNEL_SSE__
+	float3 a(_mm_set_ps(0.0f, z, y, x));  /* _mm_set_ps takes the highest lane first, so x ends up in lane 0. */
+#else
+	float3 a = {x, y, z, 0.0f};
+#endif
+	return a;
+}
+
+ccl_device_inline void print_float3(const char *label, const float3& a)  /* Prints "label: x y z" and a newline with printf; w is not printed. */
+{
+	printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z);  /* Eight decimals per component; values are passed as double for %f. */
+}
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_FLOAT3_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_float4.h b/intern/cycles/util/util_types_float4.h
new file mode 100644
index 00000000000..154391f6881
--- /dev/null
+++ b/intern/cycles/util/util_types_float4.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_FLOAT4_H__
+#define __UTIL_TYPES_FLOAT4_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct int4;  /* Forward declaration for the make_float4(const int4&) prototype below. */
+
+struct ccl_try_align(16) float4 {  /* Four-component float vector. */
+#ifdef __KERNEL_SSE__
+	union {  /* m128 and the named x, y, z, w fields share the same storage. */
+		__m128 m128;
+		struct { float x, y, z, w; };
+	};
+
+	__forceinline float4();
+	__forceinline explicit float4(const __m128& a);  /* explicit: a raw __m128 is never converted to float4 implicitly. */
+
+	__forceinline operator const __m128&(void) const;  /* Implicit access to the underlying __m128. */
+	__forceinline operator __m128&(void);
+
+	__forceinline float4& operator =(const float4& a);  /* Note: unlike float3, no copy constructor is declared alongside this. */
+
+#else  /* __KERNEL_SSE__ */
+	float x, y, z, w;  /* Same four fields as the SSE layout, without the union. */
+#endif  /* __KERNEL_SSE__ */
+
+	__forceinline float operator[](int i) const;  /* Indexed access; definitions are in util_types_float4_impl.h. */
+	__forceinline float& operator[](int i);
+};
+
+ccl_device_inline float4 make_float4(float f);  /* All four components are set to f. */
+ccl_device_inline float4 make_float4(float x, float y, float z, float w);  /* Component-wise constructor. */
+ccl_device_inline float4 make_float4(const int4& i);  /* Converts each integer component to float. */
+ccl_device_inline void print_float4(const char *label, const float4& a);  /* Prints label followed by all four components. */
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_FLOAT4_H__ */
diff --git a/intern/cycles/util/util_types_float4_impl.h b/intern/cycles/util/util_types_float4_impl.h
new file mode 100644
index 00000000000..09f45f47d38
--- /dev/null
+++ b/intern/cycles/util/util_types_float4_impl.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_FLOAT4_IMPL_H__
+#define __UTIL_TYPES_FLOAT4_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+#ifndef __KERNEL_GPU__
+#  include <cstdio>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+#ifdef __KERNEL_SSE__
+__forceinline float4::float4()  /* Empty body: no member is initialized here. */
+{
+}
+
+__forceinline float4::float4(const __m128& a)  /* Wraps an existing __m128 value. */
+        : m128(a)
+{
+}
+
+__forceinline float4::operator const __m128&(void) const  /* Read-only access to the underlying m128. */
+{
+	return m128;
+}
+
+__forceinline float4::operator __m128&(void)  /* Mutable access to the underlying m128. */
+{
+	return m128;
+}
+
+__forceinline float4& float4::operator =(const float4& a)  /* Assigns all four components at once through m128. */
+{
+	m128 = a.m128;
+	return *this;
+}
+#endif  /* __KERNEL_SSE__ */
+
+__forceinline float float4::operator[](int i) const  /* Component i by value: 0..3 read x, y, z, w. */
+{
+	util_assert(i >= 0);  /* The index range is checked only through util_assert. */
+	util_assert(i < 4);
+	return *(&x + i);  /* Pointer offset from x; relies on the fields being stored consecutively. */
+}
+
+__forceinline float& float4::operator[](int i)  /* Writable reference to component i: 0..3 are x, y, z, w. */
+{
+	util_assert(i >= 0);  /* The index range is checked only through util_assert. */
+	util_assert(i < 4);
+	return *(&x + i);  /* Pointer offset from x; relies on the fields being stored consecutively. */
+}
+
+ccl_device_inline float4 make_float4(float f)  /* Returns a float4 with f in all four components. */
+{
+#ifdef __KERNEL_SSE__
+	float4 a(_mm_set1_ps(f));  /* Broadcasts f to all four lanes. */
+#else
+	float4 a = {f, f, f, f};
+#endif
+	return a;
+}
+
+ccl_device_inline float4 make_float4(float x, float y, float z, float w)  /* Returns a float4 with the given components. */
+{
+#ifdef __KERNEL_SSE__
+	float4 a(_mm_set_ps(w, z, y, x));  /* _mm_set_ps takes the highest lane first, so x ends up in lane 0. */
+#else
+	float4 a = {x, y, z, w};
+#endif
+	return a;
+}
+
+ccl_device_inline float4 make_float4(const int4& i)  /* Converts the four integer components of i to float. */
+{
+#ifdef __KERNEL_SSE__
+	float4 a(_mm_cvtepi32_ps(i.m128));  /* Converts all four 32-bit integer lanes in one instruction. */
+#else
+	float4 a = {(float)i.x, (float)i.y, (float)i.z, (float)i.w};  /* Scalar path: one cast per component. */
+#endif
+	return a;
+}
+
+ccl_device_inline void print_float4(const char *label, const float4& a)  /* Prints "label: x y z w" and a newline with printf. */
+{
+	printf("%s: %.8f %.8f %.8f %.8f\n",
+	       label, 
+	       (double)a.x, (double)a.y, (double)a.z, (double)a.w);  /* Eight decimals per component; values are passed as double for %f. */
+}
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_FLOAT4_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_int2.h b/intern/cycles/util/util_types_int2.h
new file mode 100644
index 00000000000..82e860f89eb
--- /dev/null
+++ b/intern/cycles/util/util_types_int2.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_INT2_H__
+#define __UTIL_TYPES_INT2_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct int2 {  /* Two-component int vector; only declared when __KERNEL_GPU__ is not defined. */
+	int x, y;
+
+	__forceinline int operator[](int i) const;  /* Component read by index; defined in util_types_int2_impl.h. */
+	__forceinline int& operator[](int i);  /* Component reference by index; defined in util_types_int2_impl.h. */
+};
+
+ccl_device_inline int2 make_int2(int x, int y);  /* Builds an int2 from its two components. */
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_INT2_H__ */
diff --git a/intern/cycles/util/util_types_int2_impl.h b/intern/cycles/util/util_types_int2_impl.h
new file mode 100644
index 00000000000..c7d3942e723
--- /dev/null
+++ b/intern/cycles/util/util_types_int2_impl.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_INT2_IMPL_H__
+#define __UTIL_TYPES_INT2_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+__forceinline int int2::operator[](int i) const  /* Component i by value: 0 reads x, 1 reads y. */
+{
+	util_assert(i >= 0);  /* The index range is checked only through util_assert. */
+	util_assert(i < 2);
+	return *(&x + i);  /* Pointer offset from x; relies on y being stored right after x. */
+}
+
+__forceinline int& int2::operator[](int i)  /* Writable reference to component i: 0 is x, 1 is y. */
+{
+	util_assert(i >= 0);  /* The index range is checked only through util_assert. */
+	util_assert(i < 2);
+	return *(&x + i);  /* Pointer offset from x; relies on y being stored right after x. */
+}
+
+ccl_device_inline int2 make_int2(int x, int y)  /* Returns an int2 holding x and y. */
+{
+	int2 a = {x, y};  /* Aggregate initialization in member order. */
+	return a;
+}
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_INT2_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_int3.h b/intern/cycles/util/util_types_int3.h
new file mode 100644
index 00000000000..9d43b201c02
--- /dev/null
+++ b/intern/cycles/util/util_types_int3.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_INT3_H__
+#define __UTIL_TYPES_INT3_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct ccl_try_align(16) int3 {  /* Three-component int vector that is stored with a fourth slot, w. */
+#ifdef __KERNEL_SSE__
+	union {  /* m128 and the named x, y, z, w fields share the same storage. */
+		__m128i m128;
+		struct { int x, y, z, w; };
+	};
+
+	__forceinline int3();
+	__forceinline int3(const int3& a);
+	__forceinline explicit int3(const __m128i& a);  /* explicit: a raw __m128i is never converted to int3 implicitly. */
+
+	__forceinline operator const __m128i&(void) const;  /* Implicit access to the underlying __m128i. */
+	__forceinline operator __m128i&(void);
+
+	__forceinline int3& operator =(const int3& a);
+#else  /* __KERNEL_SSE__ */
+	int x, y, z, w;  /* Same four fields as the SSE layout, without the union. */
+#endif  /* __KERNEL_SSE__ */
+
+	__forceinline int operator[](int i) const;  /* Indexed access; definitions are in util_types_int3_impl.h. */
+	__forceinline int& operator[](int i);
+};
+
+ccl_device_inline int3 make_int3(int i);  /* Every slot, w included, is set to i. */
+ccl_device_inline int3 make_int3(int x, int y, int z);  /* Component-wise constructor; w is set to zero. */
+ccl_device_inline void print_int3(const char *label, const int3& a);  /* Prints label followed by x, y and z. */
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_INT3_H__ */
diff --git a/intern/cycles/util/util_types_int3_impl.h b/intern/cycles/util/util_types_int3_impl.h
new file mode 100644
index 00000000000..ada50c4812c
--- /dev/null
+++ b/intern/cycles/util/util_types_int3_impl.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_INT3_IMPL_H__
+#define __UTIL_TYPES_INT3_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+#ifndef __KERNEL_GPU__
+#  include <cstdio>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+#ifdef __KERNEL_SSE__
+__forceinline int3::int3()  /* Empty body: no member is initialized here. */
+{
+}
+
+__forceinline int3::int3(const __m128i& a)  /* Wraps an existing __m128i value. */
+        : m128(a)
+{
+}
+
+__forceinline int3::int3(const int3& a)  /* Copies the whole m128, w slot included. */
+        : m128(a.m128)
+{
+}
+
+__forceinline int3::operator const __m128i&(void) const  /* Read-only access to the underlying m128. */
+{
+	return m128;
+}
+
+__forceinline int3::operator __m128i&(void)  /* Mutable access to the underlying m128. */
+{
+	return m128;
+}
+
+__forceinline int3& int3::operator =(const int3& a)  /* Assigns all four slots at once through m128. */
+{
+	m128 = a.m128;
+	return *this;
+}
+#endif  /* __KERNEL_SSE__ */
+
+__forceinline int int3::operator[](int i) const  /* Component i by value: 0, 1, 2 read x, y, z. */
+{
+	util_assert(i >= 0);  /* The index range is checked only through util_assert. */
+	util_assert(i < 3);  /* The w slot is outside the asserted range. */
+	return *(&x + i);  /* Pointer offset from x; relies on the fields being stored consecutively. */
+}
+
+__forceinline int& int3::operator[](int i)  /* Writable reference to component i: 0, 1, 2 are x, y, z. */
+{
+	util_assert(i >= 0);  /* The index range is checked only through util_assert. */
+	util_assert(i < 3);  /* The w slot is outside the asserted range. */
+	return *(&x + i);  /* Pointer offset from x; relies on the fields being stored consecutively. */
+}
+
+ccl_device_inline int3 make_int3(int i)  /* Returns an int3 with i in every slot, w included. */
+{
+#ifdef __KERNEL_SSE__
+	int3 a(_mm_set1_epi32(i));  /* Broadcasts i to all four 32-bit lanes. */
+#else
+	int3 a = {i, i, i, i};  /* Scalar layout: the same value goes into x, y, z and w. */
+#endif
+	return a;
+}
+
+ccl_device_inline int3 make_int3(int x, int y, int z)  /* Returns an int3 with the given components and w set to zero. */
+{
+#ifdef __KERNEL_SSE__
+	int3 a(_mm_set_epi32(0, z, y, x));  /* _mm_set_epi32 takes the highest lane first, so x ends up in lane 0. */
+#else
+	int3 a = {x, y, z, 0};
+#endif
+
+	return a;
+}
+
+ccl_device_inline void print_int3(const char *label, const int3& a)  /* Prints "label: x y z" and a newline with printf; w is not printed. */
+{
+	printf("%s: %d %d %d\n", label, a.x, a.y, a.z);
+}
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_INT3_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_int4.h b/intern/cycles/util/util_types_int4.h
new file mode 100644
index 00000000000..cdd0ecbdae5
--- /dev/null
+++ b/intern/cycles/util/util_types_int4.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_INT4_H__
+#define __UTIL_TYPES_INT4_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+
+struct float3;  /* Forward declaration for the make_int4(const float3&) prototype below. */
+
+struct ccl_try_align(16) int4 {  /* Four-component int vector. */
+#ifdef __KERNEL_SSE__
+	union {  /* m128 and the named x, y, z, w fields share the same storage. */
+		__m128i m128;
+		struct { int x, y, z, w; };
+	};
+
+	__forceinline int4();
+	__forceinline int4(const int4& a);
+	__forceinline explicit int4(const __m128i& a);  /* explicit: a raw __m128i is never converted to int4 implicitly. */
+
+	__forceinline operator const __m128i&(void) const;  /* Implicit access to the underlying __m128i. */
+	__forceinline operator __m128i&(void);
+
+	__forceinline int4& operator=(const int4& a);
+#else  /* __KERNEL_SSE__ */
+	int x, y, z, w;  /* Same four fields as the SSE layout, without the union. */
+#endif  /* __KERNEL_SSE__ */
+
+	__forceinline int operator[](int i) const;  /* Indexed access; definitions are in util_types_int4_impl.h. */
+	__forceinline int& operator[](int i);
+};
+
+ccl_device_inline int4 make_int4(int i);  /* All four components are set to i. */
+ccl_device_inline int4 make_int4(int x, int y, int z, int w);  /* Component-wise constructor. */
+ccl_device_inline int4 make_int4(const float3& f);  /* Builds an int4 from a float3; NOTE(review): conversion details live in the impl header, confirm rounding there. */
+ccl_device_inline void print_int4(const char *label, const int4& a);  /* Presumably prints label and components like print_int3; TODO confirm against the definition. */
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_INT4_H__ */
diff --git a/intern/cycles/util/util_types_int4_impl.h b/intern/cycles/util/util_types_int4_impl.h
new file mode 100644
index 00000000000..07cdc88f2dc
--- /dev/null
+++ b/intern/cycles/util/util_types_int4_impl.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_INT4_IMPL_H__
+#define __UTIL_TYPES_INT4_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+#ifndef __KERNEL_GPU__
+#  include <cstdio>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+#ifdef __KERNEL_SSE__
+__forceinline int4::int4()  /* Default constructor: no member is initialized. */
+{
+}
+
+__forceinline int4::int4(const int4& a)  /* Copy constructor: copies the whole register at once. */
+        : m128(a.m128)
+{
+}
+
+__forceinline int4::int4(const __m128i& a)  /* Wraps an existing SSE integer register. */
+        : m128(a)
+{
+}
+
+__forceinline int4::operator const __m128i&(void) const  /* Read-only view of the register. */
+{
+	return m128;
+}
+
+__forceinline int4::operator __m128i&(void)  /* Mutable view of the register. */
+{
+	return m128;
+}
+
+__forceinline int4& int4::operator=(const int4& a)  /* Assignment copies the register, not the individual components. */
+{
+	m128 = a.m128;
+	return *this;
+}
+#endif  /* __KERNEL_SSE__ */
+
+__forceinline int int4::operator[](int i) const  /* Read component i: 0 = x, 1 = y, 2 = z, 3 = w. */
+{
+	util_assert(i >= 0);
+	util_assert(i < 4);
+	return (&x)[i];  /* Components are addressed as an array starting at x. */
+}
+
+__forceinline int& int4::operator[](int i)  /* Writable reference to component i. */
+{
+	util_assert(i >= 0);
+	util_assert(i < 4);
+	return (&x)[i];  /* Components are addressed as an array starting at x. */
+}
+
+ccl_device_inline int4 make_int4(int i)  /* Returns an int4 with every component equal to i. */
+{
+#ifdef __KERNEL_SSE__
+	int4 result(_mm_set1_epi32(i));  /* Broadcast i into all four 32-bit lanes. */
+#else
+	int4 result = {i, i, i, i};
+#endif
+	return result;
+}
+
+ccl_device_inline int4 make_int4(int x, int y, int z, int w)  /* Returns the int4 (x, y, z, w). */
+{
+#ifdef __KERNEL_SSE__
+	int4 result(_mm_set_epi32(w, z, y, x));  /* _mm_set_epi32 takes the highest lane first, hence the reversed order. */
+#else
+	int4 result = {x, y, z, w};
+#endif
+	return result;
+}
+
+ccl_device_inline int4 make_int4(const float3& f)  /* Converts the components of f to int. */
+{
+#ifdef __KERNEL_SSE__
+	int4 result(_mm_cvtps_epi32(f.m128));  /* NOTE(review): converts using the current SSE rounding mode, while the scalar branch truncates - confirm this difference is intended. */
+#else
+	int4 result = {(int)f.x, (int)f.y, (int)f.z, (int)f.w};  /* C-style casts truncate toward zero; also reads f.w. */
+#endif
+	return result;
+}
+
+ccl_device_inline void print_int4(const char *label, const int4& a)  /* Debug helper: writes one line to stdout. */
+{
+	printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w);  /* Output order is label, then x y z w. */
+}
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_INT4_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_uchar2.h b/intern/cycles/util/util_types_uchar2.h
new file mode 100644
index 00000000000..f618a2234ca
--- /dev/null
+++ b/intern/cycles/util/util_types_uchar2.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UCHAR2_H__
+#define __UTIL_TYPES_UCHAR2_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct uchar2 {  /* Two uchar components, declared for non-GPU builds only. */
+	uchar x, y;
+
+	__forceinline uchar operator[](int i) const;  /* Component by index: 0 = x, 1 = y. */
+	__forceinline uchar& operator[](int i);
+};
+
+ccl_device_inline uchar2 make_uchar2(uchar x, uchar y);  /* Builds a uchar2 from its components. */
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_UCHAR2_H__ */
diff --git a/intern/cycles/util/util_types_uchar2_impl.h b/intern/cycles/util/util_types_uchar2_impl.h
new file mode 100644
index 00000000000..d5f196d0ce0
--- /dev/null
+++ b/intern/cycles/util/util_types_uchar2_impl.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UCHAR2_IMPL_H__
+#define __UTIL_TYPES_UCHAR2_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+__forceinline uchar uchar2::operator[](int i) const  /* Read component i: 0 = x, 1 = y. */
+{
+	util_assert(i >= 0);
+	util_assert(i < 2);
+	return *(&x + i);  /* Components are addressed as an array starting at x. */
+}
+
+__forceinline uchar& uchar2::operator[](int i)  /* Writable reference to component i; specifier now matches the in-class declaration. */
+{
+	util_assert(i >= 0);
+	util_assert(i < 2);
+	return *(&x + i);
+}
+
+ccl_device_inline uchar2 make_uchar2(uchar x, uchar y)  /* Returns the uchar2 (x, y). */
+{
+	uchar2 result = {x, y};
+	return result;
+}
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_UCHAR2_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_uchar3.h b/intern/cycles/util/util_types_uchar3.h
new file mode 100644
index 00000000000..1e3644e6fd6
--- /dev/null
+++ b/intern/cycles/util/util_types_uchar3.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UCHAR3_H__
+#define __UTIL_TYPES_UCHAR3_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct uchar3 {  /* Three uchar components, declared for non-GPU builds only. */
+	uchar x, y, z;
+
+	__forceinline uchar operator[](int i) const;  /* Component by index: 0 = x, 1 = y, 2 = z. */
+	__forceinline uchar& operator[](int i);
+};
+
+ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z);  /* Builds a uchar3 from its components. */
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_UCHAR3_H__ */
diff --git a/intern/cycles/util/util_types_uchar3_impl.h b/intern/cycles/util/util_types_uchar3_impl.h
new file mode 100644
index 00000000000..611021efb7f
--- /dev/null
+++ b/intern/cycles/util/util_types_uchar3_impl.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UCHAR3_IMPL_H__
+#define __UTIL_TYPES_UCHAR3_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+__forceinline uchar uchar3::operator[](int i) const  /* Read component i: 0 = x, 1 = y, 2 = z. */
+{
+	util_assert(i >= 0);
+	util_assert(i < 3);
+	return *(&x + i);  /* Components are addressed as an array starting at x. */
+}
+
+__forceinline uchar& uchar3::operator[](int i)  /* Writable reference to component i; specifier now matches the in-class declaration. */
+{
+	util_assert(i >= 0);
+	util_assert(i < 3);
+	return *(&x + i);
+}
+
+ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z)  /* Returns the uchar3 (x, y, z). */
+{
+	uchar3 result = {x, y, z};
+	return result;
+}
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_UCHAR3_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_uchar4.h b/intern/cycles/util/util_types_uchar4.h
new file mode 100644
index 00000000000..3802cebbfb9
--- /dev/null
+++ b/intern/cycles/util/util_types_uchar4.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UCHAR4_H__
+#define __UTIL_TYPES_UCHAR4_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct uchar4 {  /* Four uchar components, declared for non-GPU builds only. */
+	uchar x, y, z, w;
+
+	__forceinline uchar operator[](int i) const;  /* Component by index: 0 = x ... 3 = w. */
+	__forceinline uchar& operator[](int i);
+};
+
+ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w);  /* Builds a uchar4 from its components. */
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_UCHAR4_H__ */
diff --git a/intern/cycles/util/util_types_uchar4_impl.h b/intern/cycles/util/util_types_uchar4_impl.h
new file mode 100644
index 00000000000..03039f60c54
--- /dev/null
+++ b/intern/cycles/util/util_types_uchar4_impl.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UCHAR4_IMPL_H__
+#define __UTIL_TYPES_UCHAR4_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+__forceinline uchar uchar4::operator[](int i) const  /* Read component i: 0 = x, 1 = y, 2 = z, 3 = w. */
+{
+	util_assert(i >= 0);
+	util_assert(i < 4);
+	return *(&x + i);  /* Components are addressed as an array starting at x. */
+}
+
+__forceinline uchar& uchar4::operator[](int i)  /* Writable reference to component i; specifier now matches the in-class declaration. */
+{
+	util_assert(i >= 0);
+	util_assert(i < 4);
+	return *(&x + i);
+}
+
+ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w)  /* Returns the uchar4 (x, y, z, w). */
+{
+	uchar4 result = {x, y, z, w};
+	return result;
+}
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_UCHAR4_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_uint2.h b/intern/cycles/util/util_types_uint2.h
new file mode 100644
index 00000000000..c4a31899614
--- /dev/null
+++ b/intern/cycles/util/util_types_uint2.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UINT2_H__
+#define __UTIL_TYPES_UINT2_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct uint2 {  /* Two uint components, declared for non-GPU builds only. */
+	uint x, y;
+
+	__forceinline uint operator[](uint i) const;  /* Component by index: 0 = x, 1 = y. */
+	__forceinline uint& operator[](uint i);
+};
+
+ccl_device_inline uint2 make_uint2(uint x, uint y);  /* Builds a uint2 from its components. */
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_UINT2_H__ */
diff --git a/intern/cycles/util/util_types_uint2_impl.h b/intern/cycles/util/util_types_uint2_impl.h
new file mode 100644
index 00000000000..b50ffa2667f
--- /dev/null
+++ b/intern/cycles/util/util_types_uint2_impl.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UINT2_IMPL_H__
+#define __UTIL_TYPES_UINT2_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+__forceinline uint uint2::operator[](uint i) const  /* Read component i: 0 = x, 1 = y. */
+{
+	util_assert(i < 2);  /* The index is unsigned, so only the upper bound needs checking. */
+	return (&x)[i];
+}
+
+__forceinline uint& uint2::operator[](uint i)  /* Writable reference to component i. */
+{
+	util_assert(i < 2);
+	return (&x)[i];
+}
+
+ccl_device_inline uint2 make_uint2(uint x, uint y)  /* Returns the uint2 (x, y). */
+{
+	uint2 result = {x, y};
+	return result;
+}
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_UINT2_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_uint3.h b/intern/cycles/util/util_types_uint3.h
new file mode 100644
index 00000000000..aeeecd2df06
--- /dev/null
+++ b/intern/cycles/util/util_types_uint3.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UINT3_H__
+#define __UTIL_TYPES_UINT3_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct uint3 {  /* Three uint components, declared for non-GPU builds only. */
+	uint x, y, z;
+
+	__forceinline uint operator[](uint i) const;  /* Component by index: 0 = x, 1 = y, 2 = z. */
+	__forceinline uint& operator[](uint i);
+};
+
+ccl_device_inline uint3 make_uint3(uint x, uint y, uint z);  /* Builds a uint3 from its components. */
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_UINT3_H__ */
diff --git a/intern/cycles/util/util_types_uint3_impl.h b/intern/cycles/util/util_types_uint3_impl.h
new file mode 100644
index 00000000000..26005d5baff
--- /dev/null
+++ b/intern/cycles/util/util_types_uint3_impl.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UINT3_IMPL_H__
+#define __UTIL_TYPES_UINT3_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+__forceinline uint uint3::operator[](uint i) const  /* Read component i: 0 = x, 1 = y, 2 = z. */
+{
+	util_assert(i < 3);  /* The index is unsigned, so only the upper bound needs checking. */
+	return (&x)[i];
+}
+
+__forceinline uint& uint3::operator[](uint i)  /* Writable reference to component i. */
+{
+	util_assert(i < 3);
+	return (&x)[i];
+}
+
+ccl_device_inline uint3 make_uint3(uint x, uint y, uint z)  /* Returns the uint3 (x, y, z). */
+{
+	uint3 result = {x, y, z};
+	return result;
+}
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_UINT3_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_uint4.h b/intern/cycles/util/util_types_uint4.h
new file mode 100644
index 00000000000..2d3a7bb85e4
--- /dev/null
+++ b/intern/cycles/util/util_types_uint4.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UINT4_H__
+#define __UTIL_TYPES_UINT4_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+struct uint4 {  /* Four uint components, declared for non-GPU builds only. */
+	uint x, y, z, w;
+
+	__forceinline uint operator[](uint i) const;  /* Component by index: 0 = x ... 3 = w. */
+	__forceinline uint& operator[](uint i);
+};
+
+ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w);  /* Builds a uint4 from its components. */
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_UINT4_H__ */
diff --git a/intern/cycles/util/util_types_uint4_impl.h b/intern/cycles/util/util_types_uint4_impl.h
new file mode 100644
index 00000000000..6d48131a446
--- /dev/null
+++ b/intern/cycles/util/util_types_uint4_impl.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_UINT4_IMPL_H__
+#define __UTIL_TYPES_UINT4_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+__forceinline uint uint4::operator[](uint i) const  /* Read component i: 0 = x, 1 = y, 2 = z, 3 = w. */
+{
+	util_assert(i < 4);  /* Four components: index 3 (w) is valid and must not trip the assert. */
+	return *(&x + i);
+}
+
+__forceinline uint& uint4::operator[](uint i)  /* Writable reference to component i. */
+{
+	util_assert(i < 4);  /* Four components: index 3 (w) is valid and must not trip the assert. */
+	return *(&x + i);
+}
+
+ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w)  /* Returns the uint4 (x, y, z, w). */
+{
+	uint4 result = {x, y, z, w};
+	return result;
+}
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_UINT4_IMPL_H__ */
diff --git a/intern/cycles/util/util_types_vector3.h b/intern/cycles/util/util_types_vector3.h
new file mode 100644
index 00000000000..12acf9dc959
--- /dev/null
+++ b/intern/cycles/util/util_types_vector3.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_VECTOR3_H__
+#define __UTIL_TYPES_VECTOR3_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+template<typename T>  /* Generic three-component container, declared for non-GPU builds only. */
+class vector3
+{
+public:
+	T x, y, z;
+
+	__forceinline vector3();  /* Does not explicitly initialize the members. */
+	__forceinline vector3(const T& a);  /* Sets x, y and z to the same value. */
+	__forceinline vector3(const T& x, const T& y, const T& z);  /* Component-wise construction. */
+};
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_VECTOR3_H__ */
diff --git a/intern/cycles/util/util_types_vector3_impl.h b/intern/cycles/util/util_types_vector3_impl.h
new file mode 100644
index 00000000000..2f6b8368540
--- /dev/null
+++ b/intern/cycles/util/util_types_vector3_impl.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TYPES_VECTOR3_IMPL_H__
+#define __UTIL_TYPES_VECTOR3_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+template<typename T>  /* NOTE(review): the in-class declarations use __forceinline, these definitions use ccl_always_inline - confirm the two are interchangeable. */
+ccl_always_inline vector3<T>::vector3()  /* x, y and z are not explicitly initialized. */
+{
+}
+
+template<typename T>
+ccl_always_inline vector3<T>::vector3(const T& a)  /* Broadcast: every component becomes a copy of a. */
+        : x(a), y(a), z(a)
+{
+}
+
+template<typename T>
+ccl_always_inline vector3<T>::vector3(const T& x, const T& y, const T& z)  /* Parameters shadow the members; the initializer list resolves each pair correctly. */
+        : x(x), y(y), z(z)
+{
+}
+#endif  /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_TYPES_VECTOR3_IMPL_H__ */
diff --git a/intern/cycles/util/util_vector.h b/intern/cycles/util/util_vector.h
index 546b17570bb..e98e4e34181 100644
--- a/intern/cycles/util/util_vector.h
+++ b/intern/cycles/util/util_vector.h
@@ -23,9 +23,9 @@
 #include <cstring>
 #include <vector>
 
-#include "util_aligned_malloc.h"
-#include "util_guarded_allocator.h"
-#include "util_types.h"
+#include "util/util_aligned_malloc.h"
+#include "util/util_guarded_allocator.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -86,9 +86,9 @@ public:
  *   this was actually showing up in profiles quite significantly. it
  *   also does not run any constructors/destructors
  * - if this is used, we are not tempted to use inefficient operations
- * - aligned allocation for SSE data types */
+ * - aligned allocation for CPU native data types */
 
-template<typename T, size_t alignment = 16>
+template<typename T, size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES>
 class array
 {
 public:
@@ -162,6 +162,11 @@ public:
 		return memcmp(data_, other.data_, datasize_*sizeof(T)) == 0;
 	}
 
+	bool operator!=(const array<T>& other) const  /* Exact negation of operator==. */
+	{
+		return !this->operator==(other);
+	}
+
 	void steal_data(array& from)
 	{
 		if(this != &from) {
@@ -177,6 +182,14 @@ public:
 		}
 	}
 
+	T *steal_pointer()  /* Returns the current buffer and detaches it from this array. */
+	{
+		T *stolen = data_;
+		data_ = NULL;  /* Detach before clear(): clear() checks data_ != NULL before acting on the buffer. */
+		clear();
+		return stolen;  /* NOTE(review): the caller presumably becomes responsible for freeing this - confirm. */
+	}
+
 	T* resize(size_t newsize)
 	{
 		if(newsize == 0) {
@@ -202,6 +215,18 @@ public:
 		return data_;
 	}
 
+	T* resize(size_t newsize, const T& value)
+	{
+		const size_t first_new = size();  /* Where the previous contents end. */
+		resize(newsize);
+		/* Only elements added by the resize receive value; shrinking assigns nothing. */
+		for(size_t index = first_new; index < size(); ++index) {
+			data_[index] = value;
+		}
+
+		return data_;
+	}
+
 	void clear()
 	{
 		if(data_ != NULL) {
@@ -273,6 +298,15 @@ public:
 		push_back_slow(t);
 	}
 
+	void append(const array<T>& from)  /* Appends a raw byte copy of the elements of from; no-op when from is empty. */
+	{
+		const size_t old_size = size(), from_size = from.size();  /* Read both before resize(): from may alias *this, whose size is about to change. */
+		if(from_size) {
+			resize(old_size + from_size);
+			memcpy(data_ + old_size, from.data(), sizeof(T) * from_size);  /* from.data() is read after resize(), so it is valid even when aliased. */
+		}
+	}
+
 protected:
 	inline T* mem_allocate(size_t N)
 	{
diff --git a/intern/cycles/util/util_version.h b/intern/cycles/util/util_version.h
index d609c739ac7..112255f447b 100644
--- a/intern/cycles/util/util_version.h
+++ b/intern/cycles/util/util_version.h
@@ -22,8 +22,8 @@
 CCL_NAMESPACE_BEGIN
 
 #define CYCLES_VERSION_MAJOR    1
-#define CYCLES_VERSION_MINOR    8
-#define CYCLES_VERSION_PATCH    1
+#define CYCLES_VERSION_MINOR    9
+#define CYCLES_VERSION_PATCH    0
 
 #define CYCLES_MAKE_VERSION_STRING2(a,b,c) #a "." #b "." #c
 #define CYCLES_MAKE_VERSION_STRING(a,b,c) CYCLES_MAKE_VERSION_STRING2(a,b,c)
diff --git a/intern/cycles/util/util_view.cpp b/intern/cycles/util/util_view.cpp
index 9796a5f896d..10d86167921 100644
--- a/intern/cycles/util/util_view.cpp
+++ b/intern/cycles/util/util_view.cpp
@@ -17,11 +17,11 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#include "util_opengl.h"
-#include "util_string.h"
-#include "util_time.h"
-#include "util_version.h"
-#include "util_view.h"
+#include "util/util_opengl.h"
+#include "util/util_string.h"
+#include "util/util_time.h"
+#include "util/util_version.h"
+#include "util/util_view.h"
 
 #ifdef __APPLE__
 #include <GLUT/glut.h>
diff --git a/intern/cycles/util/util_windows.cpp b/intern/cycles/util/util_windows.cpp
index 4de8483564b..073db2a27db 100644
--- a/intern/cycles/util/util_windows.cpp
+++ b/intern/cycles/util/util_windows.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "util_windows.h"
+#include "util/util_windows.h"
 
 #ifdef _WIN32
 
diff --git a/intern/cycles/util/util_xml.h b/intern/cycles/util/util_xml.h
index cfd0afc95f7..6f06f17937b 100644
--- a/intern/cycles/util/util_xml.h
+++ b/intern/cycles/util/util_xml.h
@@ -25,7 +25,17 @@ CCL_NAMESPACE_BEGIN
 
 OIIO_NAMESPACE_USING
 
+#ifdef WITH_SYSTEM_PUGIXML  /* Selects the namespace the pugi xml_* types are taken from. */
+#  define PUGIXML_NAMESPACE pugi  /* Top-level pugi namespace. */
+#else
+#  define PUGIXML_NAMESPACE OIIO_NAMESPACE::pugi  /* pugi nested inside the OIIO namespace. */
+#endif
+
+using PUGIXML_NAMESPACE::xml_attribute;  /* Import the xml types by name so code in this namespace works with either choice. */
+using PUGIXML_NAMESPACE::xml_document;
+using PUGIXML_NAMESPACE::xml_node;
+using PUGIXML_NAMESPACE::xml_parse_result;
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_XML_H__ */
-