410 files changed, 23025 insertions, 7739 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index ae38fbb2934..5b68da79623 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -14,8 +14,19 @@ include(cmake/external_libs.cmake)
 # todo: refactor this code to match scons
 # note: CXX_HAS_SSE is needed in case passing SSE flags fails altogether (gcc-arm)
 
-if(WIN32 AND MSVC)
+if(WITH_CYCLES_WERROR)  # Opt-in: promote compiler warnings to errors for C and C++.
+	ADD_CHECK_C_COMPILER_FLAG(CMAKE_C_FLAGS C_WERROR -Werror)
+	ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS CXX_WERROR -Werror)  # own result variable, so the C++ probe does not share the C one
+endif()
+
+if(NOT WITH_CPU_SSE)
+	set(CXX_HAS_SSE FALSE)
+	set(CXX_HAS_AVX FALSE)
+	set(CXX_HAS_AVX2 FALSE)
+elseif(WIN32 AND MSVC)
 	set(CXX_HAS_SSE TRUE)
+	set(CXX_HAS_AVX TRUE)
+	set(CXX_HAS_AVX2 TRUE)
 
 	# /arch:AVX for VC2012 and above
 	if(NOT MSVC_VERSION LESS 1700)
@@ -47,22 +58,34 @@ if(WIN32 AND MSVC)
 	set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /Ox")
 elseif(CMAKE_COMPILER_IS_GNUCC)
 	check_cxx_compiler_flag(-msse CXX_HAS_SSE)
+	check_cxx_compiler_flag(-mavx CXX_HAS_AVX)
+	check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2)
 	if(CXX_HAS_SSE)
 		set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse")
 		set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse")
 		set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse")
+	endif()
+	if(CXX_HAS_AVX)
 		set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse")
-		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mfpmath=sse")
+	endif()
+	if(CXX_HAS_AVX2)
+		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c -mfpmath=sse")
 	endif()
 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
 elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 	check_cxx_compiler_flag(-msse CXX_HAS_SSE)
+	check_cxx_compiler_flag(-mavx CXX_HAS_AVX)
+	check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2)
 	if(CXX_HAS_SSE)
 		set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2")
 		set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3")
 		set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1")
+	endif()
+	if(CXX_HAS_AVX)
 		set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx")
-		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2")
+	endif()
+	if(CXX_HAS_AVX2)
+		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
 	endif()
 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
 endif()
@@ -72,11 +95,17 @@ if(CXX_HAS_SSE)
 		-DWITH_KERNEL_SSE2
 		-DWITH_KERNEL_SSE3
 		-DWITH_KERNEL_SSE41
-		-DWITH_KERNEL_AVX
-		-DWITH_KERNEL_AVX2
 	)
 endif()
 
+if(CXX_HAS_AVX)
+	add_definitions(-DWITH_KERNEL_AVX)
+endif()
+
+if(CXX_HAS_AVX2)
+	add_definitions(-DWITH_KERNEL_AVX2)
+endif()
+
 if(WITH_CYCLES_OSL)
 	if(WIN32 AND MSVC)
 		set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
@@ -118,7 +147,7 @@ if(WITH_CYCLES_OSL)
 	add_definitions(-DOSL_STATIC_LIBRARY)
 	include_directories(
 		SYSTEM
-		${OSL_INCLUDES}
+		${OSL_INCLUDE_DIR}
 	)
 endif()
 
@@ -128,22 +157,42 @@ add_definitions(
 	-DWITH_MULTI
 )
 
+TEST_UNORDERED_MAP_SUPPORT()
+# Translate the probe results into exactly one CYCLES_*_UNORDERED_MAP define.
+# The HAVE_* variables are not set in this file; presumably the call above
+# provides them.
+# Branches are tried in order, so each one implies the earlier ones failed.
+if(HAVE_STD_UNORDERED_MAP_HEADER AND HAVE_UNORDERED_MAP_IN_STD_NAMESPACE)
+	# Standard header check and std namespace check both passed.
+	add_definitions(-DCYCLES_STD_UNORDERED_MAP)
+elseif(HAVE_STD_UNORDERED_MAP_HEADER AND HAVE_UNORDERED_MAP_IN_TR1_NAMESPACE)
+	# Standard header check passed, std namespace check failed, TR1 one passed.
+	add_definitions(-DCYCLES_STD_UNORDERED_MAP_IN_TR1_NAMESPACE)
+elseif(HAVE_UNORDERED_MAP_IN_TR1_NAMESPACE)
+	# Only reachable without the standard header: with it, the branch above
+	# already consumed the TR1 case.
+	add_definitions(-DCYCLES_TR1_UNORDERED_MAP)
+else()
+	# No usable combination was detected; fall back to the ordered containers.
+	add_definitions(-DCYCLES_NO_UNORDERED_MAP)
+	message(STATUS "Replacing unordered_map/set with map/set (warning: slower!)")
+endif()
+
+# Logging capabilities using GLog library.
 if(WITH_CYCLES_LOGGING)
 	add_definitions(-DWITH_CYCLES_LOGGING)
 	add_definitions(-DGOOGLE_GLOG_DLL_DECL=)
-	if(WIN32)
-		include_directories(
-			SYSTEM
-			../../extern/libmv/third_party/glog/src/windows
-			../../extern/libmv/third_party/gflags
-		)
-	else()
-		include_directories(
-			SYSTEM
-			../../extern/libmv/third_party/glog/src
-			../../extern/libmv/third_party/gflags
-		)
-	endif()
+	add_definitions(-DCYCLES_GFLAGS_NAMESPACE=${GFLAGS_NAMESPACE})
+	include_directories(
+		SYSTEM
+		${GLOG_INCLUDE_DIRS}
+		${GFLAGS_INCLUDE_DIRS}
+	)
+endif()
+
+# Debugging capabilities (debug passes etc).
+if(WITH_CYCLES_DEBUG)
+	add_definitions(-DWITH_CYCLES_DEBUG)
 endif()
 
 if(WITH_LIBMV)
@@ -160,13 +209,21 @@ include_directories(
 	${OPENIMAGEIO_INCLUDE_DIRS}/OpenImageIO
 	${OPENEXR_INCLUDE_DIR}
 	${OPENEXR_INCLUDE_DIRS}
+	${PUGIXML_INCLUDE_DIR}
 )
 
+if(CYCLES_STANDALONE_REPOSITORY)
+	include_directories(../third_party/atomic)
+else()
+	include_directories(../atomic)
+endif()
 
 # Warnings
 if(CMAKE_COMPILER_IS_GNUCXX)
 	ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_cxxflag_float_conversion "-Werror=float-conversion")
+	ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_cxxflag_double_promotion "-Werror=double-promotion")
 	unset(_has_cxxflag_float_conversion)
+	unset(_has_cxxflag_double_promotion)
 endif()
 
 
diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript
index 58685f599a5..11af440dfe8 100644
--- a/intern/cycles/SConscript
+++ b/intern/cycles/SConscript
@@ -34,18 +34,26 @@ cycles.Depends('../../source/blender/makesrna/intern/RNA_blender_cpp.h', 'makesr
 
 sources = cycles.Glob('bvh/*.cpp') + cycles.Glob('device/*.cpp') + cycles.Glob('kernel/*.cpp') + cycles.Glob('render/*.cpp') + cycles.Glob('subd/*.cpp') + cycles.Glob('util/*.cpp') + cycles.Glob('blender/*.cpp')
 
+sources.append(path.join('kernel', 'kernels', 'cpu', 'kernel.cpp'))
 sources.remove(path.join('util', 'util_view.cpp'))
-sources.remove(path.join('kernel', 'kernel_sse2.cpp'))
-sources.remove(path.join('kernel', 'kernel_sse3.cpp'))
-sources.remove(path.join('kernel', 'kernel_sse41.cpp'))
-sources.remove(path.join('kernel', 'kernel_avx.cpp'))
-sources.remove(path.join('kernel', 'kernel_avx2.cpp'))
 
 incs = [] 
 defs = []
 cxxflags = Split(env['CXXFLAGS'])
 
-defs.append('GLEW_STATIC')
+defs += env['BF_GL_DEFINITIONS']
+
+if env['WITH_UNORDERED_MAP_SUPPORT']:
+    if env['UNORDERED_MAP_HEADER'] == 'unordered_map':
+        if env['UNORDERED_MAP_NAMESPACE'] == 'std':
+            defs.append('CYCLES_STD_UNORDERED_MAP')
+        elif env['UNORDERED_MAP_NAMESPACE'] == 'std::tr1':
+            defs.append('CYCLES_STD_UNORDERED_MAP_IN_TR1_NAMESPACE')
+    elif env['UNORDERED_MAP_NAMESPACE'] == 'std::tr1':
+        defs.append('CYCLES_TR1_UNORDERED_MAP')
+else:
+    print("-- Replacing unordered_map/set with map/set (warning: slower!)")
+    defs.append('CYCLES_NO_UNORDERED_MAP')
 
 defs.append('CCL_NAMESPACE_BEGIN=namespace ccl {')
 defs.append('CCL_NAMESPACE_END=}')
@@ -63,10 +71,30 @@ if env['WITH_BF_LIBMV']:
     defs.append('WITH_CYCLES_DISTORTION')
     incs.append('#extern/libmv')
 
+if env['WITH_BF_CYCLES_DEBUG']:
+    defs.append('WITH_CYCLES_DEBUG')
+
+if env['WITH_BF_CYCLES_LOGGING']:
+    defs.extend(['WITH_CYCLES_LOGGING', 'GOOGLE_GLOG_DLL_DECL=', 'CYCLES_GFLAGS_NAMESPACE=gflags'])
+    # These platforms take the glog headers from the 'windows' subdirectory.
+    windows_like = ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', 'win64-mingw')
+    if env['OURPLATFORM'] in windows_like:
+        glog_inc = '#extern/libmv/third_party/glog/src/windows'
+    else:
+        glog_inc = '#extern/libmv/third_party/glog/src'
+    # The glog include path goes first, then gflags.
+    incs.extend([glog_inc, '#extern/libmv/third_party/gflags'])
+
 incs.extend('. bvh render device kernel kernel/osl kernel/svm util subd'.split())
 incs.extend('#intern/guardedalloc #source/blender/makesrna #source/blender/makesdna #source/blender/blenlib'.split())
 incs.extend('#source/blender/blenloader ../../source/blender/makesrna/intern'.split())
+
+incs.append(env['BF_GLEW_INC'])
+incs.append('#/intern/glew-mx')
+incs.append('#/intern/atomic')
+incs.append('#intern/mikktspace')
 incs.extend('#extern/glew/include #extern/clew/include #extern/cuew/include #intern/mikktspace'.split())
+
 incs.append(cycles['BF_OIIO_INC'])
 incs.append(cycles['BF_BOOST_INC'])
 incs.append(cycles['BF_OPENEXR_INC'].split())
@@ -112,13 +140,13 @@ else:
 
     if (env['C_COMPILER_ID'] == 'gcc' and env['CCVERSION'] >= '4.6') or (env['C_COMPILER_ID'] == 'clang' and env['CCVERSION'] >= '3.1'):
         kernel_flags['avx'] = kernel_flags['sse41'] + ' -mavx'
-        kernel_flags['avx2'] = kernel_flags['avx'] + ' -mavx2 -mfma -mlzcnt -mbmi -mbmi2'
+        kernel_flags['avx2'] = kernel_flags['avx'] + ' -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c'
 
 for kernel_type in kernel_flags.keys():
     defs.append('WITH_KERNEL_' + kernel_type.upper())
 
 for kernel_type in kernel_flags.keys():
-    kernel_source = path.join('kernel', 'kernel_' + kernel_type + '.cpp')
+    kernel_source = path.join('kernel', 'kernels', 'cpu', 'kernel_' + kernel_type + '.cpp')
     kernel_cxxflags = Split(env['CXXFLAGS'])
     kernel_cxxflags.append(kernel_flags[kernel_type].split())
     kernel_defs = defs[:]
diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt
index 5876ac3779c..b000266cac2 100644
--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@@ -12,6 +12,8 @@ set(INC
 set(INC_SYS
 )
 
+# NOTE: LIBRARIES contains all the libraries which are common
+# to the release and debug build types, listed in link order.
 set(LIBRARIES
 	cycles_device
 	cycles_kernel
@@ -19,36 +21,85 @@ set(LIBRARIES
 	cycles_bvh
 	cycles_subd
 	cycles_util
-	${BOOST_LIBRARIES}
-	${OPENEXR_LIBRARIES}
-	${OPENGL_LIBRARIES}
-	${CYCLES_GLEW_LIBRARY}
-	${OPENIMAGEIO_LIBRARIES}
+	${BLENDER_GL_LIBRARIES}
+	${CYCLES_APP_GLEW_LIBRARY}
 	${PNG_LIBRARIES}
 	${JPEG_LIBRARIES}
 	${ZLIB_LIBRARIES}
 	${TIFF_LIBRARY}
+	${PTHREADS_LIBRARIES}
 	extern_clew
 	extern_cuew
 )
 
-if(WIN32)
-	list(APPEND LIBRARIES ${PTHREADS_LIBRARIES})
+if(WITH_CYCLES_OSL)
+	list(APPEND LIBRARIES cycles_kernel_osl)
 endif()
 
-link_directories(${OPENIMAGEIO_LIBPATH} ${BOOST_LIBPATH} ${PNG_LIBPATH} ${JPEG_LIBPATH} ${ZLIB_LIBPATH} ${TIFF_LIBPATH})
+if(CYCLES_STANDALONE_REPOSITORY)
+	if(WITH_CYCLES_LOGGING)
+		list(APPEND LIBRARIES
+			${GLOG_LIBRARIES}
+			${GFLAGS_LIBRARIES}
+		)
+	endif()
+else()
+	list(APPEND LIBRARIES bf_intern_glew_mx)
+	if(WITH_CYCLES_LOGGING)
+		list(APPEND LIBRARIES extern_glog)
+	endif()
+endif()
 
 if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI)
 	list(APPEND LIBRARIES ${GLUT_LIBRARIES})
 endif()
 
-if(WITH_CYCLES_OSL)
-	list(APPEND LIBRARIES cycles_kernel_osl ${OSL_LIBRARIES} ${LLVM_LIBRARY})
-endif()
+# Common configuration.
+
+link_directories(${OPENIMAGEIO_LIBPATH}
+                 ${BOOST_LIBPATH}
+                 ${PNG_LIBPATH}
+                 ${JPEG_LIBPATH}
+                 ${ZLIB_LIBPATH}
+                 ${TIFF_LIBPATH}
+                 ${OPENEXR_LIBPATH})
+
+add_definitions(${GL_DEFINITIONS})
 
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
+# Make sure the given target is linked against the proper libraries,
+# which vary across debug and release build types.
+#
+# This also makes sure the dependencies of those libraries
+# are passed to the linker after them.
+#
+# TODO(sergey): Think of a better place for this?
+macro(cycles_target_link_libraries target)
+	target_link_libraries(${target} ${LIBRARIES})
+	if(WITH_CYCLES_OSL)
+		target_link_libraries_decoupled(${target} OSL_LIBRARIES)
+		if(MSVC)
+			target_link_libraries_debug(${target} "${LLVM_LIBRARIES_DEBUG}")
+			target_link_libraries_optimized(${target} "${LLVM_LIBRARIES}")
+		else()
+			target_link_libraries(${target} ${LLVM_LIBRARIES})
+		endif()
+	endif()
+	target_link_libraries_decoupled(${target} OPENIMAGEIO_LIBRARIES)
+	target_link_libraries_decoupled(${target} OPENEXR_LIBRARIES)
+	target_link_libraries(
+		${target}
+		${PUGIXML_LIBRARIES}
+		${BOOST_LIBRARIES}
+		${CMAKE_DL_LIBS}
+		${PLATFORM_LINKLIBS}
+	)
+endmacro()
+
+# Application build targets
+
 if(WITH_CYCLES_STANDALONE)
 	set(SRC
 		cycles_standalone.cpp
@@ -56,8 +107,7 @@ if(WITH_CYCLES_STANDALONE)
 		cycles_xml.h
 	)
 	add_executable(cycles ${SRC})
-	list(APPEND LIBRARIES ${PLATFORM_LINKLIBS})
-	target_link_libraries(cycles ${LIBRARIES} ${CMAKE_DL_LIBS})
+	cycles_target_link_libraries(cycles)
 
 	if(UNIX AND NOT APPLE)
 		set_target_properties(cycles PROPERTIES INSTALL_RPATH $ORIGIN/lib)
@@ -70,11 +120,10 @@ if(WITH_CYCLES_NETWORK)
 		cycles_server.cpp
 	)
 	add_executable(cycles_server ${SRC})
-	target_link_libraries(cycles_server ${LIBRARIES} ${CMAKE_DL_LIBS})
+	cycles_target_link_libraries(cycles_server)
 
 	if(UNIX AND NOT APPLE)
 		set_target_properties(cycles_server PROPERTIES INSTALL_RPATH $ORIGIN/lib)
 	endif()
 	unset(SRC)
 endif()
-
diff --git a/intern/cycles/app/cycles_server.cpp b/intern/cycles/app/cycles_server.cpp
index f4cacb2d001..4ef9cd070bb 100644
--- a/intern/cycles/app/cycles_server.cpp
+++ b/intern/cycles/app/cycles_server.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include <stdio.h>
@@ -24,18 +24,20 @@
 #include "util_stats.h"
 #include "util_string.h"
 #include "util_task.h"
+#include "util_logging.h"
 
 using namespace ccl;
 
 int main(int argc, const char **argv)
 {
+	util_logging_init(argv[0]);
 	path_init();
 
 	/* device types */
 	string devicelist = "";
 	string devicename = "cpu";
-	bool list = false;
-	int threads = 0;
+	bool list = false, debug = false;
+	int threads = 0, verbosity = 1;
 
 	vector<DeviceType>& types = Device::available_types();
 
@@ -53,6 +55,10 @@ int main(int argc, const char **argv)
 		"--device %s", &devicename, ("Devices to use: " + devicelist).c_str(),
 		"--list-devices", &list, "List information about all available devices",
 		"--threads %d", &threads, "Number of threads to use for CPU device",
+#ifdef WITH_CYCLES_LOGGING
+		"--debug", &debug, "Enable debug logging",
+		"--verbose %d", &verbosity, "Set verbosity of the logger",
+#endif
 		NULL);
 
 	if(ap.parse(argc, argv) < 0) {
@@ -60,7 +66,13 @@ int main(int argc, const char **argv)
 		ap.usage();
 		exit(EXIT_FAILURE);
 	}
-	else if(list) {
+
+	if(debug) {
+		util_logging_start();
+		util_logging_verbosity_set(verbosity);
+	}
+
+	if(list) {
 		vector<DeviceInfo>& devices = Device::available_devices();
 
 		printf("Devices:\n");
diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp
index 90333eb3fc5..b0d49d6ee72 100644
--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include <stdio.h>
@@ -21,10 +21,12 @@
 #include "device.h"
 #include "scene.h"
 #include "session.h"
+#include "integrator.h"
 
 #include "util_args.h"
 #include "util_foreach.h"
 #include "util_function.h"
+#include "util_logging.h"
 #include "util_path.h"
 #include "util_progress.h"
 #include "util_string.h"
@@ -70,12 +72,12 @@ static void session_print(const string& str)
 static void session_print_status()
 {
 	int sample, tile;
-	double total_time, sample_time;
+	double total_time, sample_time, render_time;
 	string status, substatus;
 
 	/* get status */
 	sample = options.session->progress.get_sample();
-	options.session->progress.get_tile(tile, total_time, sample_time);
+	options.session->progress.get_tile(tile, total_time, sample_time, render_time);
 	options.session->progress.get_status(status, substatus);
 
 	if(substatus != "")
@@ -123,7 +125,7 @@ static void scene_init()
 	xml_read_file(options.scene, options.filepath.c_str());
 
 	/* Camera width/height override? */
-	if (!(options.width == 0 || options.height == 0)) {
+	if(!(options.width == 0 || options.height == 0)) {
 		options.scene->camera->width = options.width;
 		options.scene->camera->height = options.height;
 	}
@@ -165,11 +167,11 @@ static void display_info(Progress& progress)
 	last = elapsed;
 
 	int sample, tile;
-	double total_time, sample_time;
+	double total_time, sample_time, render_time;
 	string status, substatus;
 
 	sample = progress.get_sample();
-	progress.get_tile(tile, total_time, sample_time);
+	progress.get_tile(tile, total_time, sample_time, render_time);
 	progress.get_status(status, substatus);
 
 	if(substatus != "")
@@ -271,6 +273,7 @@ static void keyboard(unsigned char key)
 	else if(key == 'i')
 		options.interactive = !(options.interactive);
 
+	/* Navigation */
 	else if(options.interactive && (key == 'w' || key == 'a' || key == 's' || key == 'd')) {
 		Transform matrix = options.session->scene->camera->matrix;
 		float3 translate;
@@ -293,6 +296,25 @@ static void keyboard(unsigned char key)
 
 		options.session->reset(session_buffer_params(), options.session_params.samples);
 	}
+
+	/* Set Max Bounces */
+	else if(options.interactive && (key == '0' || key == '1' || key == '2' || key == '3')) {
+		int bounce;
+		switch(key) {
+			case '0': bounce = 0; break;
+			case '1': bounce = 1; break;
+			case '2': bounce = 2; break;
+			case '3': bounce = 3; break;
+			default: bounce = 0; break;
+		}
+
+		options.session->scene->integrator->max_bounce = bounce;
+
+		/* Update and Reset */
+		options.session->scene->integrator->need_update = true;
+
+		options.session->reset(session_buffer_params(), options.session_params.samples);
+	}
 }
 #endif
 
@@ -319,6 +341,11 @@ static void options_parse(int argc, const char **argv)
 
 	vector<DeviceType>& types = Device::available_types();
 
+	/* TODO(sergey): There is a chicken-and-egg problem here: on the one hand we
+	 * want the device list to be printed in the help message, on the other hand
+	 * logging is not initialized yet, so no debug log is produced during
+	 * device initialization.
+	 */
 	foreach(DeviceType type, types) {
 		if(device_names != "")
 			device_names += ", ";
@@ -331,7 +358,8 @@ static void options_parse(int argc, const char **argv)
 
 	/* parse options */
 	ArgParse ap;
-	bool help = false;
+	bool help = false, debug = false;
+	int verbosity = 1;
 
 	ap.options ("Usage: cycles [options] file.xml",
 		"%*", files_parse, "",
@@ -347,6 +375,10 @@ static void options_parse(int argc, const char **argv)
 		"--width  %d", &options.width, "Window width in pixel",
 		"--height %d", &options.height, "Window height in pixel",
 		"--list-devices", &list, "List information about all available devices",
+#ifdef WITH_CYCLES_LOGGING
+		"--debug", &debug, "Enable debug logging",
+		"--verbose %d", &verbosity, "Set verbosity of the logger",
+#endif
 		"--help", &help, "Print help message",
 		NULL);
 
@@ -355,12 +387,19 @@ static void options_parse(int argc, const char **argv)
 		ap.usage();
 		exit(EXIT_FAILURE);
 	}
-	else if(list) {
+
+	if(debug) {
+		util_logging_start();
+		util_logging_verbosity_set(verbosity);
+	}
+
+	if(list) {
 		vector<DeviceInfo>& devices = Device::available_devices();
 		printf("Devices:\n");
 
 		foreach(DeviceInfo& info, devices) {
-			printf("    %s%s\n",
+			printf("    %-10s%s%s\n",
+				Device::string_from_type(info.type).c_str(),
 				info.description.c_str(),
 				(info.display_device)? " (display)": "");
 		}
@@ -435,6 +474,7 @@ using namespace ccl;
 
 int main(int argc, const char **argv)
 {
+	util_logging_init(argv[0]);
 	path_init();
 	options_parse(argc, argv);
 
diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp
index 431796e106b..edea8cd0ec4 100644
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include <stdio.h>
@@ -20,6 +20,7 @@
 #include <algorithm>
 #include <iterator>
 
+#include "background.h"
 #include "camera.h"
 #include "film.h"
 #include "graph.h"
@@ -55,6 +56,16 @@ struct XMLReadState {
 	string base;		/* base path to current file*/
 	float dicing_rate;	/* current dicing rate */
 	Mesh::DisplacementMethod displacement_method;
+
+	XMLReadState()
+	  : scene(NULL),
+	    smooth(false),
+	    shader(0),
+	    dicing_rate(0.0f),
+	    displacement_method(Mesh::DISPLACE_BUMP)
+	{
+		tfm = transform_identity();
+	}
 };
 
 /* Attribute Reading */
@@ -225,21 +236,21 @@ static ShaderSocketType xml_read_socket_type(pugi::xml_node node, const char *na
 
 	if(attr) {
 		string value = attr.value();
-		if (string_iequals(value, "float"))
+		if(string_iequals(value, "float"))
 			return SHADER_SOCKET_FLOAT;
-		else if (string_iequals(value, "int"))
+		else if(string_iequals(value, "int"))
 			return SHADER_SOCKET_INT;
-		else if (string_iequals(value, "color"))
+		else if(string_iequals(value, "color"))
 			return SHADER_SOCKET_COLOR;
-		else if (string_iequals(value, "vector"))
+		else if(string_iequals(value, "vector"))
 			return SHADER_SOCKET_VECTOR;
-		else if (string_iequals(value, "point"))
+		else if(string_iequals(value, "point"))
 			return SHADER_SOCKET_POINT;
-		else if (string_iequals(value, "normal"))
+		else if(string_iequals(value, "normal"))
 			return SHADER_SOCKET_NORMAL;
-		else if (string_iequals(value, "closure color"))
+		else if(string_iequals(value, "closure color"))
 			return SHADER_SOCKET_CLOSURE;
-		else if (string_iequals(value, "string"))
+		else if(string_iequals(value, "string"))
 			return SHADER_SOCKET_STRING;
 		else
 			fprintf(stderr, "Unknown shader socket type \"%s\" for attribute \"%s\".\n", value.c_str(), name);
@@ -299,7 +310,6 @@ static void xml_read_integrator(const XMLReadState& state, pugi::xml_node node)
 	xml_read_bool(&integrator->transparent_shadows, node, "transparent_shadows");
 	
 	/* Volume */
-	xml_read_int(&integrator->volume_homogeneous_sampling, node, "volume_homogeneous_sampling");
 	xml_read_float(&integrator->volume_step_size, node, "volume_step_size");
 	xml_read_int(&integrator->volume_max_steps, node, "volume_max_steps");
 	
@@ -382,6 +392,10 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 	for(pugi::xml_node node = graph_node.first_child(); node; node = node.next_sibling()) {
 		ShaderNode *snode = NULL;
 
+		/* ToDo: Add missing nodes
+		 * RGBCurvesNode, VectorCurvesNode, RGBRampNode and ConvertNode (RGB -> BW).
+		 */
+
 		if(string_iequals(node.name(), "image_texture")) {
 			ImageTextureNode *img = new ImageTextureNode();
 
@@ -392,6 +406,8 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 			xml_read_enum(&img->projection, ImageTextureNode::projection_enum, node, "projection");
 			xml_read_float(&img->projection_blend, node, "projection_blend");
 
+			/* ToDo: Interpolation */
+
 			snode = img;
 		}
 		else if(string_iequals(node.name(), "environment_texture")) {
@@ -420,25 +436,25 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 			 * Socket names must be stored in the extra lists instead. */
 			/* read input values */
 			for(pugi::xml_node param = node.first_child(); param; param = param.next_sibling()) {
-				if (string_iequals(param.name(), "input")) {
+				if(string_iequals(param.name(), "input")) {
 					string name;
-					if (!xml_read_string(&name, param, "name"))
+					if(!xml_read_string(&name, param, "name"))
 						continue;
 					
 					ShaderSocketType type = xml_read_socket_type(param, "type");
-					if (type == SHADER_SOCKET_UNDEFINED)
+					if(type == SHADER_SOCKET_UNDEFINED)
 						continue;
 					
 					osl->input_names.push_back(ustring(name));
 					osl->add_input(osl->input_names.back().c_str(), type);
 				}
-				else if (string_iequals(param.name(), "output")) {
+				else if(string_iequals(param.name(), "output")) {
 					string name;
-					if (!xml_read_string(&name, param, "name"))
+					if(!xml_read_string(&name, param, "name"))
 						continue;
 					
 					ShaderSocketType type = xml_read_socket_type(param, "type");
-					if (type == SHADER_SOCKET_UNDEFINED)
+					if(type == SHADER_SOCKET_UNDEFINED)
 						continue;
 					
 					osl->output_names.push_back(ustring(name));
@@ -494,10 +510,6 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 			xml_read_int(&magic->depth, node, "depth");
 			snode = magic;
 		}
-		else if(string_iequals(node.name(), "noise_texture")) {
-			NoiseTextureNode *dist = new NoiseTextureNode();
-			snode = dist;
-		}
 		else if(string_iequals(node.name(), "wave_texture")) {
 			WaveTextureNode *wave = new WaveTextureNode();
 			xml_read_enum(&wave->type, WaveTextureNode::type_enum, node, "type");
@@ -508,6 +520,11 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 			xml_read_float3(&normal->direction, node, "direction");
 			snode = normal;
 		}
+		else if(string_iequals(node.name(), "bump")) {
+			BumpNode *bump = new BumpNode();
+			xml_read_bool(&bump->invert, node, "invert");
+			snode = bump;
+		}
 		else if(string_iequals(node.name(), "mapping")) {
 			snode = new MappingNode();
 		}
@@ -562,6 +579,9 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 		else if(string_iequals(node.name(), "background")) {
 			snode = new BackgroundNode();
 		}
+		else if(string_iequals(node.name(), "holdout")) {
+			snode = new HoldoutNode();
+		}
 		else if(string_iequals(node.name(), "absorption_volume")) {
 			snode = new AbsorptionVolumeNode();
 		}
@@ -570,7 +590,14 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 		}
 		else if(string_iequals(node.name(), "subsurface_scattering")) {
 			SubsurfaceScatteringNode *sss = new SubsurfaceScatteringNode();
-			//xml_read_enum(&sss->falloff, SubsurfaceScatteringNode::falloff_enum, node, "falloff");
+
+			string falloff;
+			xml_read_string(&falloff, node, "falloff");
+			if(falloff == "cubic")
+				sss->closure = CLOSURE_BSSRDF_CUBIC_ID;
+			else
+				sss->closure = CLOSURE_BSSRDF_GAUSSIAN_ID;
+
 			snode = sss;
 		}
 		else if(string_iequals(node.name(), "geometry")) {
@@ -614,6 +641,7 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 			snode = new InvertNode();
 		}
 		else if(string_iequals(node.name(), "mix")) {
+			/* ToDo: Tag Mix case for optimization */
 			MixNode *mix = new MixNode();
 			xml_read_enum(&mix->type, MixNode::type_enum, node, "type");
 			xml_read_bool(&mix->use_clamp, node, "use_clamp");
@@ -638,10 +666,10 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 			snode = new SeparateHSVNode();
 		}
 		else if(string_iequals(node.name(), "combine_xyz")) {
-			snode = new CombineHSVNode();
+			snode = new CombineXYZNode();
 		}
 		else if(string_iequals(node.name(), "separate_xyz")) {
-			snode = new SeparateHSVNode();
+			snode = new SeparateXYZNode();
 		}
 		else if(string_iequals(node.name(), "hsv")) {
 			snode = new HSVNode();
@@ -803,7 +831,17 @@ static void xml_read_shader(const XMLReadState& state, pugi::xml_node node)
 	xml_read_string(&shader->name, node, "name");
 	xml_read_bool(&shader->use_mis, node, "use_mis");
 	xml_read_bool(&shader->use_transparent_shadow, node, "use_transparent_shadow");
+
+	/* Volume */
 	xml_read_bool(&shader->heterogeneous_volume, node, "heterogeneous_volume");
+	xml_read_int(&shader->volume_interpolation_method, node, "volume_interpolation_method");
+
+	if(xml_equal_string(node, "volume_sampling_method", "distance"))
+		shader->volume_sampling_method = VOLUME_SAMPLING_DISTANCE;
+	else if(xml_equal_string(node, "volume_sampling_method", "equiangular"))
+		shader->volume_sampling_method = VOLUME_SAMPLING_EQUIANGULAR;
+	else if(xml_equal_string(node, "volume_sampling_method", "multiple_importance"))
+		shader->volume_sampling_method = VOLUME_SAMPLING_MULTIPLE_IMPORTANCE;
 
 	xml_read_shader_graph(state, shader, node);
 	state.scene->shaders.push_back(shader);
@@ -813,9 +851,26 @@ static void xml_read_shader(const XMLReadState& state, pugi::xml_node node)
 
 static void xml_read_background(const XMLReadState& state, pugi::xml_node node)
 {
+	/* Background Settings */
+	Background *bg = state.scene->background;
+
+	xml_read_float(&bg->ao_distance, node, "ao_distance");
+	xml_read_float(&bg->ao_factor, node, "ao_factor");
+
+	xml_read_bool(&bg->transparent, node, "transparent");
+
+	/* Background Shader */
 	Shader *shader = state.scene->shaders[state.scene->default_background];
 	
 	xml_read_bool(&shader->heterogeneous_volume, node, "heterogeneous_volume");
+	xml_read_int(&shader->volume_interpolation_method, node, "volume_interpolation_method");
+
+	if(xml_equal_string(node, "volume_sampling_method", "distance"))
+		shader->volume_sampling_method = VOLUME_SAMPLING_DISTANCE;
+	else if(xml_equal_string(node, "volume_sampling_method", "equiangular"))
+		shader->volume_sampling_method = VOLUME_SAMPLING_EQUIANGULAR;
+	else if(xml_equal_string(node, "volume_sampling_method", "multiple_importance"))
+		shader->volume_sampling_method = VOLUME_SAMPLING_MULTIPLE_IMPORTANCE;
 
 	xml_read_shader_graph(state, shader, node);
 }
@@ -851,6 +906,7 @@ static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node)
 
 	/* read vertices and polygons, RIB style */
 	vector<float3> P;
+	vector<float> UV;
 	vector<int> verts, nverts;
 
 	xml_read_float3_array(P, node, "P");
@@ -922,6 +978,31 @@ static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node)
 
 			index_offset += nverts[i];
 		}
+
+		if(xml_read_float_array(UV, node, "UV")) {
+			ustring name = ustring("UVMap");
+			Attribute *attr = mesh->attributes.add(ATTR_STD_UV, name);
+			float3 *fdata = attr->data_float3();
+
+			/* loop over the triangles */
+			index_offset = 0;
+			for(size_t i = 0; i < nverts.size(); i++) {
+				for(int j = 0; j < nverts[i]-2; j++) {
+					int v0 = verts[index_offset];
+					int v1 = verts[index_offset + j + 1];
+					int v2 = verts[index_offset + j + 2];
+
+					assert(v0*2+1 < (int)UV.size());
+					assert(v1*2+1 < (int)UV.size());
+					assert(v2*2+1 < (int)UV.size());
+
+					fdata[0] = make_float3(UV[v0*2], UV[v0*2+1], 0.0);
+					fdata[1] = make_float3(UV[v1*2], UV[v1*2+1], 0.0);
+					fdata[2] = make_float3(UV[v2*2], UV[v2*2+1], 0.0);
+					fdata += 3;
+				}
+			}
+		}
 	}
 
 	/* temporary for test compatibility */
@@ -1011,13 +1092,28 @@ static void xml_read_light(const XMLReadState& state, pugi::xml_node node)
 	xml_read_float(&light->sizev, node, "sizev");
 	xml_read_float3(&light->axisu, node, "axisu");
 	xml_read_float3(&light->axisv, node, "axisv");
-	
+
+	/* Portal? (Area light only) */
+	xml_read_bool(&light->is_portal, node, "is_portal");
+
 	/* Generic */
 	xml_read_float(&light->size, node, "size");
 	xml_read_float3(&light->dir, node, "dir");
 	xml_read_float3(&light->co, node, "P");
 	light->co = transform_point(&state.tfm, light->co);
 
+	/* Settings */
+	xml_read_bool(&light->cast_shadow, node, "cast_shadow");
+	xml_read_bool(&light->use_mis, node, "use_mis");
+	xml_read_int(&light->samples, node, "samples");
+	xml_read_int(&light->max_bounces, node, "max_bounces");
+
+	/* Ray Visibility */
+	xml_read_bool(&light->use_diffuse, node, "use_diffuse");
+	xml_read_bool(&light->use_glossy, node, "use_glossy");
+	xml_read_bool(&light->use_transmission, node, "use_transmission");
+	xml_read_bool(&light->use_scatter, node, "use_scatter");
+
 	state.scene->lights.push_back(light);
 }
 
@@ -1161,7 +1257,8 @@ static void xml_read_include(const XMLReadState& state, const string& src)
 		XMLReadState substate = state;
 		substate.base = path_dirname(path);
 
-		xml_read_scene(substate, doc);
+		pugi::xml_node cycles = doc.child("cycles");
+		xml_read_scene(substate, cycles);
 	}
 	else {
 		fprintf(stderr, "%s read error: %s\n", src.c_str(), parse_result.description());
diff --git a/intern/cycles/app/cycles_xml.h b/intern/cycles/app/cycles_xml.h
index 96bc79c35d8..6a48980d8ea 100644
--- a/intern/cycles/app/cycles_xml.h
+++ b/intern/cycles/app/cycles_xml.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __CYCLES_XML_H__
diff --git a/intern/cycles/app/io_export_cycles_xml.py b/intern/cycles/app/io_export_cycles_xml.py
index ad8fb9d3dd3..7d6d85f88af 100644
--- a/intern/cycles/app/io_export_cycles_xml.py
+++ b/intern/cycles/app/io_export_cycles_xml.py
@@ -11,7 +11,7 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License
+# limitations under the License.
 #
 
 # XML exporter for generating test files, not intended for end users
diff --git a/intern/cycles/blender/CCL_api.h b/intern/cycles/blender/CCL_api.h
index cfd0c3ef264..d3a68c4db4f 100644
--- a/intern/cycles/blender/CCL_api.h
+++ b/intern/cycles/blender/CCL_api.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __CCL_API_H__
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index 6529d8186cc..7ca1277904e 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -6,6 +6,7 @@ set(INC
 	../kernel/svm
 	../util
 	../subd
+	../../glew-mx
 	../../guardedalloc
 	../../mikktspace
 	../../../source/blender/makesdna
@@ -16,7 +17,7 @@ set(INC
 
 set(INC_SYS
 	${PYTHON_INCLUDE_DIRS}
-	${GLEW_INCLUDE_PATH}
+	${GLEW_INCLUDE_DIR}
 )
 
 set(SRC
@@ -31,10 +32,12 @@ set(SRC
 	blender_session.cpp
 	blender_shader.cpp
 	blender_sync.cpp
+	blender_texture.cpp
 
 	CCL_api.h
 	blender_sync.h
 	blender_session.h
+	blender_texture.h
 	blender_util.h
 )
 
@@ -49,7 +52,7 @@ set(ADDON_FILES
 	addon/version_update.py
 )
 
-add_definitions(-DGLEW_STATIC)
+add_definitions(${GL_DEFINITIONS})
 
 blender_add_lib(bf_intern_cycles "${SRC}" "${INC}" "${INC_SYS}")
 
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py
index 415f6e81be8..7a9caa7b06b 100644
--- a/intern/cycles/blender/addon/__init__.py
+++ b/intern/cycles/blender/addon/__init__.py
@@ -11,7 +11,7 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License
+# limitations under the License.
 #
 
 # <pep8 compliant>
@@ -49,26 +49,23 @@ class CyclesRender(bpy.types.RenderEngine):
 
     # final render
     def update(self, data, scene):
-        if self.is_preview:
-            if not self.session:
+        if not self.session:
+            if self.is_preview:
                 cscene = bpy.context.scene.cycles
                 use_osl = cscene.shading_system and cscene.device == 'CPU'
 
                 engine.create(self, data, scene,
                               None, None, None, use_osl)
-        else:
-            if not self.session:
-                engine.create(self, data, scene)
             else:
-                engine.reset(self, data, scene)
-
-        engine.update(self, data, scene)
+                engine.create(self, data, scene)
+        else:
+            engine.reset(self, data, scene)
 
     def render(self, scene):
         engine.render(self)
 
-    def bake(self, scene, obj, pass_type, pixel_array, num_pixels, depth, result):
-        engine.bake(self, obj, pass_type, pixel_array, num_pixels, depth, result)
+    def bake(self, scene, obj, pass_type, object_id, pixel_array, num_pixels, depth, result):
+        engine.bake(self, obj, pass_type, object_id, pixel_array, num_pixels, depth, result)
 
     # viewport render
     def view_update(self, context):
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index 18235eca790..030f0dbbf14 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -11,21 +11,67 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License
+# limitations under the License.
 #
 
 # <pep8 compliant>
 
 
+def _is_using_buggy_driver():
+    """Return True when the current OpenGL driver is on the known-bad list.
+
+    Be conservative: on multi-GPU systems the display card may be old while
+    the others are fine, so possibly good dedicated cards must not be ruled out
+    just because the display card looks weak; only proven-bad setups match.
+    """
+    import bgl
+    if bgl.glGetString(bgl.GL_VENDOR) == "ATI Technologies Inc.":
+        import re
+        version = bgl.glGetString(bgl.GL_VERSION)
+        if version.endswith("Compatibility Profile Context"):
+            # Old HD 4xxx and 5xxx series drivers did not put the driver
+            # version in the version string; those cards do not quite work
+            # and cause crashes.
+            return True
+        regex = re.compile(r".*Compatibility Profile Context ([0-9]+(\.[0-9]+)+)$")
+        if not regex.match(version):
+            # Skip cards like FireGL
+            return False
+        version = regex.sub("\\1", version).split('.')
+        return int(version[0]) == 8
+    return False
+
+
+def _workaround_buggy_drivers():
+    if _is_using_buggy_driver():
+        import _cycles
+        if hasattr(_cycles, "opencl_disable"):
+            print("Cycles: OpenGL driver known to be buggy, disabling OpenCL platform.")
+            _cycles.opencl_disable()
+
+
 def init():
     import bpy
     import _cycles
     import os.path
 
+    # Work around possibly buggy legacy drivers which crash on the OpenCL
+    # device enumeration.
+    #
+    # These checks are not really correct because they might still fail
+    # in the case of multiple GPUs. However, the currently known buggy
+    # drivers are really old and likely to be used in single-GPU systems
+    # only anyway.
+    #
+    # Can't do it in background mode, so we hope OpenCL is not enabled
+    # in the user preferences.
+    if not bpy.app.background:
+        _workaround_buggy_drivers()
+
     path = os.path.dirname(__file__)
     user_path = os.path.dirname(os.path.abspath(bpy.utils.user_resource('CONFIG', '')))
 
-    _cycles.init(path, user_path)
+    _cycles.init(path, user_path, bpy.app.background)
 
 
 def create(engine, data, scene, region=None, v3d=None, rv3d=None, preview_osl=False):
@@ -59,11 +105,11 @@ def render(engine):
         _cycles.render(engine.session)
 
 
-def bake(engine, obj, pass_type, pixel_array, num_pixels, depth, result):
+def bake(engine, obj, pass_type, object_id, pixel_array, num_pixels, depth, result):
     import _cycles
     session = getattr(engine, "session", None)
     if session is not None:
-        _cycles.bake(engine.session, obj.as_pointer(), pass_type, pixel_array.as_pointer(), num_pixels, depth, result.as_pointer())
+        _cycles.bake(engine.session, obj.as_pointer(), pass_type, object_id, pixel_array.as_pointer(), num_pixels, depth, result.as_pointer())
 
 
 def reset(engine, data, scene):
@@ -100,3 +146,8 @@ def with_osl():
 def with_network():
     import _cycles
     return _cycles.with_network
+
+
+def system_info():
+    import _cycles
+    return _cycles.system_info()
diff --git a/intern/cycles/blender/addon/osl.py b/intern/cycles/blender/addon/osl.py
index c5f9d93013e..f4aaaab5eab 100644
--- a/intern/cycles/blender/addon/osl.py
+++ b/intern/cycles/blender/addon/osl.py
@@ -11,7 +11,7 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License
+# limitations under the License.
 #
 
 # <pep8 compliant>
diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py
index 2ec65d7183a..f97b51b629d 100644
--- a/intern/cycles/blender/addon/presets.py
+++ b/intern/cycles/blender/addon/presets.py
@@ -11,7 +11,7 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License
+# limitations under the License.
 #
 
 # <pep8 compliant>
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 12c1cd820be..c130594dbf7 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -11,7 +11,7 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License
+# limitations under the License.
 #
 
 # <pep8 compliant>
@@ -59,7 +59,7 @@ enum_filter_types = (
 
 enum_aperture_types = (
     ('RADIUS', "Radius", "Directly change the size of the aperture"),
-    ('FSTOP', "F/stop", "Change the size of the aperture by f/stops"),
+    ('FSTOP', "F-stop", "Change the size of the aperture by f-stop"),
     )
 
 enum_panorama_types = (
@@ -67,6 +67,7 @@ enum_panorama_types = (
     ('FISHEYE_EQUIDISTANT', "Fisheye Equidistant", "Ideal for fulldomes, ignore the sensor dimensions"),
     ('FISHEYE_EQUISOLID', "Fisheye Equisolid",
                           "Similar to most fisheye modern lens, takes sensor dimensions into consideration"),
+    ('MIRRORBALL', "Mirror Ball", "Uses the mirror ball mapping"),
     )
 
 enum_curve_primitives = (
@@ -115,6 +116,11 @@ enum_volume_sampling = (
     ('MULTIPLE_IMPORTANCE', "Multiple Importance", "Combine distance and equi-angular sampling for volumes where neither method is ideal"),
     )
 
+enum_volume_interpolation = (
+    ('LINEAR', "Linear", "Good smoothness and speed"),
+    ('CUBIC', "Cubic", "Smoothed high quality interpolation, but slower")
+    )
+
 
 class CyclesRenderSettings(bpy.types.PropertyGroup):
     @classmethod
@@ -346,7 +352,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 description="Distance between volume shader samples when rendering the volume "
                             "(lower values give more accurate and detailed results, but also increased render time)",
                 default=0.1,
-                min=0.0000001, max=100000.0
+                min=0.0000001, max=100000.0, soft_min=0.01, soft_max=1.0
                 )
 
         cls.volume_max_steps = IntProperty(
@@ -389,6 +395,12 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 default=0,
                 )
 
+        cls.use_animated_seed = BoolProperty(
+                name="Use Animated Seed",
+                description="Use different seed values (and hence noise patterns) at different frames",
+                default=False,
+                )
+
         cls.sample_clamp_direct = FloatProperty(
                 name="Clamp Direct",
                 description="If non-zero, the maximum value for a direct sample, "
@@ -452,11 +464,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 description="Use BVH spatial splits: longer builder time, faster render",
                 default=False,
                 )
-        cls.use_cache = BoolProperty(
-                name="Cache BVH",
-                description="Cache last built BVH to disk for faster re-render if no geometry changed",
-                default=False,
-                )
         cls.tile_order = EnumProperty(
                 name="Tile Order",
                 description="Tile order for rendering",
@@ -477,7 +484,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
             name="Bake Type",
             default='COMBINED',
             description="Type of pass to bake",
-            items = (
+            items=(
                 ('COMBINED', "Combined", ""),
                 ('AO', "Ambient Occlusion", ""),
                 ('SHADOW', "Shadow", ""),
@@ -500,6 +507,19 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 ),
             )
 
+        cls.use_camera_cull = BoolProperty(
+                name="Use Camera Cull",
+                description="Allow objects to be culled based on the camera frustum",
+                default=False,
+                )
+
+        cls.camera_cull_margin = FloatProperty(
+                name="Camera Cull Margin",
+                description="Margin for the camera space culling",
+                default=0.1,
+                min=0.0, max=5.0
+                )
+
     @classmethod
     def unregister(cls):
         del bpy.types.Scene.cycles
@@ -518,13 +538,13 @@ class CyclesCameraSettings(bpy.types.PropertyGroup):
 
         cls.aperture_type = EnumProperty(
                 name="Aperture Type",
-                description="Use F/stop number or aperture radius",
+                description="Use f-stop number or aperture radius",
                 items=enum_aperture_types,
                 default='RADIUS',
                 )
         cls.aperture_fstop = FloatProperty(
-                name="Aperture F/stop",
-                description="F/stop ratio (lower numbers give more defocus, higher numbers give a sharper image)",
+                name="Aperture f-stop",
+                description="F-stop ratio (lower numbers give more defocus, higher numbers give a sharper image)",
                 min=0.0, soft_min=0.1, soft_max=64.0,
                 default=5.6,
                 step=10,
@@ -578,6 +598,34 @@ class CyclesCameraSettings(bpy.types.PropertyGroup):
                 min=0.01, soft_max=15.0, max=100.0,
                 default=10.5,
                 )
+        cls.latitude_min = FloatProperty(
+                name="Min Latitude",
+                description="Minimum latitude (vertical angle) for the equirectangular lens",
+                min=-0.5 * math.pi, max=0.5 * math.pi,
+                subtype='ANGLE',
+                default=-0.5 * math.pi,
+                )
+        cls.latitude_max = FloatProperty(
+                name="Max Latitude",
+                description="Maximum latitude (vertical angle) for the equirectangular lens",
+                min=-0.5 * math.pi, max=0.5 * math.pi,
+                subtype='ANGLE',
+                default=0.5 * math.pi,
+                )
+        cls.longitude_min = FloatProperty(
+                name="Min Longitude",
+                description="Minimum longitude (horizontal angle) for the equirectangular lens",
+                min=-math.pi, max=math.pi,
+                subtype='ANGLE',
+                default=-math.pi,
+                )
+        cls.longitude_max = FloatProperty(
+                name="Max Longitude",
+                description="Maximum longitude (horizontal angle) for the equirectangular lens",
+                min=-math.pi, max=math.pi,
+                subtype='ANGLE',
+                default=math.pi,
+                )
         cls.nodes = StringProperty(
                 name="nodes",
                 description="Camera ray nodes")
@@ -621,6 +669,13 @@ class CyclesMaterialSettings(bpy.types.PropertyGroup):
                 default='DISTANCE',
                 )
 
+        cls.volume_interpolation = EnumProperty(
+                name="Volume Interpolation",
+                description="Interpolation method to use for smoke/fire volumes",
+                items=enum_volume_interpolation,
+                default='LINEAR',
+                )
+
     @classmethod
     def unregister(cls):
         del bpy.types.Material.cycles
@@ -645,12 +700,24 @@ class CyclesLampSettings(bpy.types.PropertyGroup):
                 min=1, max=10000,
                 default=1,
                 )
+        cls.max_bounces = IntProperty(
+                name="Max Bounces",
+                description="Maximum number of bounces the light will contribute to the render",
+                min=0, max=1024,
+                default=1024,
+                )
         cls.use_multiple_importance_sampling = BoolProperty(
                 name="Multiple Importance Sample",
                 description="Use multiple importance sampling for the lamp, "
                             "reduces noise for area lamps and sharp glossy materials",
                 default=False,
                 )
+        cls.is_portal = BoolProperty(
+                name="Is Portal",
+                description="Use this area lamp to guide sampling of the background, "
+                            "note that this will make the lamp invisible",
+                default=False,
+                )
 
     @classmethod
     def unregister(cls):
@@ -675,7 +742,7 @@ class CyclesWorldSettings(bpy.types.PropertyGroup):
                 name="Map Resolution",
                 description="Importance map size is resolution x resolution; "
                             "higher values potentially produce less noise, at the cost of memory and speed",
-                min=4, max=8096,
+                min=4, max=8192,
                 default=256,
                 )
         cls.samples = IntProperty(
@@ -684,6 +751,12 @@ class CyclesWorldSettings(bpy.types.PropertyGroup):
                 min=1, max=10000,
                 default=4,
                 )
+        cls.max_bounces = IntProperty(
+                name="Max Bounces",
+                description="Maximum number of bounces the background light will contribute to the render",
+                min=0, max=1024,
+                default=1024,
+                )
         cls.homogeneous_volume = BoolProperty(
                 name="Homogeneous Volume",
                 description="When using volume rendering, assume volume has the same density everywhere"
@@ -697,6 +770,13 @@ class CyclesWorldSettings(bpy.types.PropertyGroup):
                 default='EQUIANGULAR',
                 )
 
+        cls.volume_interpolation = EnumProperty(
+                name="Volume Interpolation",
+                description="Interpolation method to use for volumes",
+                items=enum_volume_interpolation,
+                default='LINEAR',
+                )
+
     @classmethod
     def unregister(cls):
         del bpy.types.World.cycles
@@ -828,6 +908,12 @@ class CyclesObjectBlurSettings(bpy.types.PropertyGroup):
                 default=1,
                 )
 
+        cls.use_camera_cull = BoolProperty(
+                name="Use Camera Cull",
+                description="Allow this object and its duplicators to be culled by camera space culling",
+                default=False,
+                )
+
     @classmethod
     def unregister(cls):
         del bpy.types.Object.cycles
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index b544c476c11..a44a0b46960 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -11,14 +11,18 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License
+# limitations under the License.
 #
 
 # <pep8 compliant>
 
 import bpy
 
-from bpy.types import Panel, Menu, Operator
+from bpy.types import (
+        Panel,
+        Menu,
+        Operator,
+        )
 
 
 class CYCLES_MT_sampling_presets(Menu):
@@ -37,7 +41,7 @@ class CYCLES_MT_integrator_presets(Menu):
     draw = Menu.draw_preset
 
 
-class CyclesButtonsPanel():
+class CyclesButtonsPanel:
     bl_space_type = "PROPERTIES"
     bl_region_type = "WINDOW"
     bl_context = "render"
@@ -56,7 +60,15 @@ def use_cpu(context):
     return (device_type == 'NONE' or cscene.device == 'CPU')
 
 
-def draw_samples_info(layout, cscene):
+def use_branched_path(context):
+    cscene = context.scene.cycles
+    device_type = context.user_preferences.system.compute_device_type
+
+    return (cscene.progressive == 'BRANCHED_PATH' and device_type != 'OPENCL')
+
+
+def draw_samples_info(layout, context):
+    cscene = context.scene.cycles
     integrator = cscene.progressive
 
     # Calculate sample values
@@ -86,7 +98,7 @@ def draw_samples_info(layout, cscene):
 
     # Draw interface
     # Do not draw for progressive, when Square Samples are disabled
-    if (integrator == 'BRANCHED_PATH') or (cscene.use_square_samples and integrator == 'PATH'):
+    if use_branched_path(context) or (cscene.use_square_samples and integrator == 'PATH'):
         col = layout.column(align=True)
         col.scale_y = 0.6
         col.label("Total Samples:")
@@ -110,6 +122,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
 
         scene = context.scene
         cscene = scene.cycles
+        device_type = context.user_preferences.system.compute_device_type
 
         row = layout.row(align=True)
         row.menu("CYCLES_MT_sampling_presets", text=bpy.types.CYCLES_MT_sampling_presets.bl_label)
@@ -117,7 +130,9 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
         row.operator("render.cycles_sampling_preset_add", text="", icon="ZOOMOUT").remove_active = True
 
         row = layout.row()
-        row.prop(cscene, "progressive", text="")
+        sub = row.row()
+        sub.active = device_type != 'OPENCL'
+        sub.prop(cscene, "progressive", text="")
         row.prop(cscene, "use_square_samples")
 
         split = layout.split()
@@ -125,11 +140,15 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
         col = split.column()
         sub = col.column(align=True)
         sub.label("Settings:")
-        sub.prop(cscene, "seed")
+
+        seed_sub = sub.row(align=True)
+        seed_sub.prop(cscene, "seed")
+        seed_sub.prop(cscene, "use_animated_seed", text="", icon="TIME")
+
         sub.prop(cscene, "sample_clamp_direct")
         sub.prop(cscene, "sample_clamp_indirect")
 
-        if cscene.progressive == 'PATH':
+        if cscene.progressive == 'PATH' or use_branched_path(context) == False:
             col = split.column()
             sub = col.column(align=True)
             sub.label(text="Samples:")
@@ -163,7 +182,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
                 layout.row().prop(cscene, "use_layer_samples")
                 break
 
-        draw_samples_info(layout, cscene)
+        draw_samples_info(layout, context)
 
 
 class CyclesRender_PT_volume_sampling(CyclesButtonsPanel, Panel):
@@ -313,7 +332,6 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel):
         col.separator()
 
         col.label(text="Final Render:")
-        col.prop(cscene, "use_cache")
         col.prop(rd, "use_persistent_data", text="Persistent Images")
 
         col.separator()
@@ -412,6 +430,52 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel):
         col.prop(rl, "use_pass_emit", text="Emission")
         col.prop(rl, "use_pass_environment")
 
+        if hasattr(rd, "debug_pass_type"):
+            layout.prop(rd, "debug_pass_type")
+
+
+class CyclesRender_PT_views(CyclesButtonsPanel, Panel):
+    bl_label = "Views"
+    bl_context = "render_layer"
+    bl_options = {'DEFAULT_CLOSED'}
+
+    def draw_header(self, context):
+        rd = context.scene.render
+        self.layout.prop(rd, "use_multiview", text="")
+
+    def draw(self, context):
+        layout = self.layout
+
+        scene = context.scene
+        rd = scene.render
+        rv = rd.views.active
+
+        layout.active = rd.use_multiview
+        basic_stereo = (rd.views_format == 'STEREO_3D')
+
+        row = layout.row()
+        row.prop(rd, "views_format", expand=True)
+
+        if basic_stereo:
+            row = layout.row()
+            row.template_list("RENDERLAYER_UL_renderviews", "name", rd, "stereo_views", rd.views, "active_index", rows=2)
+
+            row = layout.row()
+            row.label(text="File Suffix:")
+            row.prop(rv, "file_suffix", text="")
+
+        else:
+            row = layout.row()
+            row.template_list("RENDERLAYER_UL_renderviews", "name", rd, "views", rd.views, "active_index", rows=2)
+
+            col = row.column(align=True)
+            col.operator("scene.render_view_add", icon='ZOOMIN', text="")
+            col.operator("scene.render_view_remove", icon='ZOOMOUT', text="")
+
+            row = layout.row()
+            row.label(text="Camera Suffix:")
+            row.prop(rv, "camera_suffix", text="")
+
 
 class Cycles_PT_post_processing(CyclesButtonsPanel, Panel):
     bl_label = "Post Processing"
@@ -445,6 +509,7 @@ class CyclesCamera_PT_dof(CyclesButtonsPanel, Panel):
 
         cam = context.camera
         ccam = cam.cycles
+        dof_options = cam.gpu_dof
 
         split = layout.split()
 
@@ -456,6 +521,16 @@ class CyclesCamera_PT_dof(CyclesButtonsPanel, Panel):
         sub.active = cam.dof_object is None
         sub.prop(cam, "dof_distance", text="Distance")
 
+        hq_support = dof_options.is_hq_supported
+        sub = col.column(align=True)
+        sub.label("Viewport:")
+        subhq = sub.column()
+        subhq.active = hq_support
+        subhq.prop(dof_options, "use_high_quality")
+        sub.prop(dof_options, "fstop")
+        if dof_options.use_high_quality and hq_support:
+            sub.prop(dof_options, "blades")
+
         col = split.column()
 
         col.label("Aperture:")
@@ -506,11 +581,16 @@ class Cycles_PT_context_material(CyclesButtonsPanel, Panel):
         ob = context.object
         slot = context.material_slot
         space = context.space_data
+        is_sortable = bool(ob) and len(ob.material_slots) > 1  # ob may be unset; it is only guarded below
 
         if ob:
+            rows = 1
+            if (is_sortable):
+                rows = 4
+
             row = layout.row()
 
-            row.template_list("MATERIAL_UL_matslots", "", ob, "material_slots", ob, "active_material_index", rows=1)
+            row.template_list("MATERIAL_UL_matslots", "", ob, "material_slots", ob, "active_material_index", rows=rows)
 
             col = row.column(align=True)
             col.operator("object.material_slot_add", icon='ZOOMIN', text="")
@@ -518,6 +598,12 @@ class Cycles_PT_context_material(CyclesButtonsPanel, Panel):
 
             col.menu("MATERIAL_MT_specials", icon='DOWNARROW_HLT', text="")
 
+            if is_sortable:
+                col.separator()
+
+                col.operator("object.material_slot_move", icon='TRIA_UP', text="").direction = 'UP'
+                col.operator("object.material_slot_move", icon='TRIA_DOWN', text="").direction = 'DOWN'
+
             if ob.mode == 'EDIT':
                 row = layout.row(align=True)
                 row.operator("object.material_slot_assign", text="Assign")
@@ -579,7 +665,13 @@ class CyclesObject_PT_motion_blur(CyclesButtonsPanel, Panel):
     @classmethod
     def poll(cls, context):
         ob = context.object
-        return CyclesButtonsPanel.poll(context) and ob and ob.type in {'MESH', 'CURVE', 'CURVE', 'SURFACE', 'FONT', 'META'}
+        if CyclesButtonsPanel.poll(context) and ob:
+            if ob.type in {'MESH', 'CURVE', 'CURVE', 'SURFACE', 'FONT', 'META'}:
+                return True
+            if ob.dupli_type == 'GROUP' and ob.dupli_group:
+                return True
+            # TODO(sergey): More duplicator types here?
+        return False
 
     def draw_header(self, context):
         layout = self.layout
@@ -613,8 +705,8 @@ class CyclesObject_PT_motion_blur(CyclesButtonsPanel, Panel):
         sub.prop(cob, "motion_steps", text="Steps")
 
 
-class CyclesObject_PT_ray_visibility(CyclesButtonsPanel, Panel):
-    bl_label = "Ray Visibility"
+class CyclesObject_PT_cycles_settings(CyclesButtonsPanel, Panel):
+    bl_label = "Cycles Settings"
     bl_context = "object"
     bl_options = {'DEFAULT_CLOSED'}
 
@@ -622,15 +714,19 @@ class CyclesObject_PT_ray_visibility(CyclesButtonsPanel, Panel):
     def poll(cls, context):
         ob = context.object
         return (CyclesButtonsPanel.poll(context) and
-                ob and ob.type in {'MESH', 'CURVE', 'SURFACE', 'FONT', 'META', 'LAMP'} or
-                ob and ob.dupli_type == 'GROUP' and ob.dupli_group)
+                ob and ((ob.type in {'MESH', 'CURVE', 'SURFACE', 'FONT', 'META', 'LAMP'}) or
+                        (ob.dupli_type == 'GROUP' and ob.dupli_group)))
 
     def draw(self, context):
         layout = self.layout
 
+        scene = context.scene
+        cscene = scene.cycles
         ob = context.object
+        cob = ob.cycles
         visibility = ob.cycles_visibility
 
+        layout.label(text="Ray Visibility:")
         flow = layout.column_flow()
 
         flow.prop(visibility, "camera")
@@ -642,6 +738,12 @@ class CyclesObject_PT_ray_visibility(CyclesButtonsPanel, Panel):
         if ob.type != 'LAMP':
             flow.prop(visibility, "shadow")
 
+        col = layout.column()
+        col.label(text="Performance:")
+        row = col.row()
+        row.active = scene.render.use_simplify and cscene.use_camera_cull
+        row.prop(cob, "use_camera_cull")
+
 
 class CYCLES_OT_use_shading_nodes(Operator):
     """Enable nodes on a material, world or lamp"""
@@ -668,9 +770,14 @@ def find_node(material, nodetype):
     if material and material.node_tree:
         ntree = material.node_tree
 
+        active_output_node = None
         for node in ntree.nodes:
             if getattr(node, "type", None) == nodetype:
-                return node
+                if getattr(node, "is_active_output", True):
+                    return node
+                if not active_output_node:
+                    active_output_node = node
+        return active_output_node
 
     return None
 
@@ -707,7 +814,10 @@ class CyclesLamp_PT_preview(CyclesButtonsPanel, Panel):
 
     @classmethod
     def poll(cls, context):
-        return context.lamp and CyclesButtonsPanel.poll(context)
+        return context.lamp and \
+               not (context.lamp.type == 'AREA' and
+                    context.lamp.cycles.is_portal) \
+               and CyclesButtonsPanel.poll(context)
 
     def draw(self, context):
         self.layout.template_preview(context.lamp)
@@ -745,13 +855,21 @@ class CyclesLamp_PT_lamp(CyclesButtonsPanel, Panel):
                 sub.prop(lamp, "size", text="Size X")
                 sub.prop(lamp, "size_y", text="Size Y")
 
-        if cscene.progressive == 'BRANCHED_PATH':
-            col.prop(clamp, "samples")
+        if not (lamp.type == 'AREA' and clamp.is_portal):
+            sub = col.column(align=True)
+            if use_branched_path(context):
+                sub.prop(clamp, "samples")
+            sub.prop(clamp, "max_bounces")
 
         col = split.column()
-        col.prop(clamp, "cast_shadow")
 
-        layout.prop(clamp, "use_multiple_importance_sampling")
+        sub = col.column(align=True)
+        sub.active = not (lamp.type == 'AREA' and clamp.is_portal)
+        sub.prop(clamp, "cast_shadow")
+        sub.prop(clamp, "use_multiple_importance_sampling", text="Multiple Importance")
+
+        if lamp.type == 'AREA':
+            col.prop(clamp, "is_portal", text="Portal")
 
         if lamp.type == 'HEMI':
             layout.label(text="Not supported, interpreted as sun lamp")
@@ -763,7 +881,9 @@ class CyclesLamp_PT_nodes(CyclesButtonsPanel, Panel):
 
     @classmethod
     def poll(cls, context):
-        return context.lamp and CyclesButtonsPanel.poll(context)
+        lamp = context.lamp
+        is_portal = lamp and lamp.type == 'AREA' and lamp.cycles.is_portal
+        return lamp and not is_portal and CyclesButtonsPanel.poll(context)
 
     def draw(self, context):
         layout = self.layout
@@ -946,14 +1066,16 @@ class CyclesWorld_PT_settings(CyclesButtonsPanel, Panel):
         sub = col.column(align=True)
         sub.active = cworld.sample_as_light
         sub.prop(cworld, "sample_map_resolution")
-        if cscene.progressive == 'BRANCHED_PATH':
+        if use_branched_path(context):
             sub.prop(cworld, "samples")
+        sub.prop(cworld, "max_bounces")
 
         col = split.column()
         col.label(text="Volume:")
         sub = col.column()
         sub.active = use_cpu(context)
         sub.prop(cworld, "volume_sampling", text="")
+        sub.prop(cworld, "volume_interpolation", text="")
         col.prop(cworld, "homogeneous_volume", text="Homogeneous")
 
 
@@ -1037,17 +1159,6 @@ class CyclesMaterial_PT_settings(CyclesButtonsPanel, Panel):
         cmat = mat.cycles
 
         split = layout.split()
-
-        col = split.column(align=True)
-        col.prop(mat, "diffuse_color", text="Viewport Color")
-        col.prop(mat, "alpha")
-
-        col = split.column(align=True)
-        col.label()
-        col.prop(mat, "pass_index")
-
-        split = layout.split()
-
         col = split.column()
         col.label(text="Surface:")
         col.prop(cmat, "sample_as_light", text="Multiple Importance")
@@ -1058,8 +1169,29 @@ class CyclesMaterial_PT_settings(CyclesButtonsPanel, Panel):
         sub = col.column()
         sub.active = use_cpu(context)
         sub.prop(cmat, "volume_sampling", text="")
+        sub.prop(cmat, "volume_interpolation", text="")
         col.prop(cmat, "homogeneous_volume", text="Homogeneous")
 
+        layout.separator()
+        split = layout.split()
+
+        col = split.column(align=True)
+        col.label("Viewport Color:")
+        col.prop(mat, "diffuse_color", text="")
+        col.prop(mat, "alpha")
+
+        col.separator()
+        col.label("Viewport Alpha:")
+        col.prop(mat.game_settings, "alpha_blend", text="")
+
+        col = split.column(align=True)
+        col.label("Viewport Specular:")
+        col.prop(mat, "specular_color", text="")
+        col.prop(mat, "specular_hardness", text="Hardness")
+
+        col.separator()
+        col.prop(mat, "pass_index")
+
 
 class CyclesTexture_PT_context(CyclesButtonsPanel, Panel):
     bl_label = ""
@@ -1126,7 +1258,8 @@ class CyclesTexture_PT_mapping(CyclesButtonsPanel, Panel):
     @classmethod
     def poll(cls, context):
         node = context.texture_node
-        return node and CyclesButtonsPanel.poll(context)
+        # Node must expose texture_mapping. TODO(sergey): faster/nicer check?
+        return node and hasattr(node, 'texture_mapping') and CyclesButtonsPanel.poll(context)
 
     def draw(self, context):
         layout = self.layout
@@ -1350,13 +1483,28 @@ class CyclesScene_PT_simplify(CyclesButtonsPanel, Panel):
     def draw(self, context):
         layout = self.layout
 
-        rd = context.scene.render
+        scene = context.scene
+        rd = scene.render
+        cscene = scene.cycles
 
         layout.active = rd.use_simplify
+        split = layout.split()
 
-        row = layout.row()
-        row.prop(rd, "simplify_subdivision", text="Subdivision")
-        row.prop(rd, "simplify_child_particles", text="Child Particles")
+        col = split.column()
+        col.label(text="Viewport:")
+        col.prop(rd, "simplify_subdivision", text="Subdivision")
+        col.prop(rd, "simplify_child_particles", text="Child Particles")
+
+        col = split.column()
+        col.label(text="Render:")
+        col.prop(rd, "simplify_subdivision_render", text="Subdivision")
+        col.prop(rd, "simplify_child_particles_render", text="Child Particles")
+
+        col = layout.column()
+        col.prop(cscene, "use_camera_cull")
+        subsub = col.column()
+        subsub.active = cscene.use_camera_cull
+        subsub.prop(cscene, "camera_cull_margin")
 
 
 def draw_device(self, context):
@@ -1399,7 +1547,11 @@ def get_panels():
         "RENDER_PT_encoding",
         "RENDER_PT_dimensions",
         "RENDER_PT_stamp",
+        "RENDER_PT_freestyle",
         "RENDERLAYER_PT_layers",
+        "RENDERLAYER_PT_freestyle",
+        "RENDERLAYER_PT_freestyle_lineset",
+        "RENDERLAYER_PT_freestyle_linestyle",
         "SCENE_PT_scene",
         "SCENE_PT_color_management",
         "SCENE_PT_custom_props",
@@ -1424,6 +1576,8 @@ def get_panels():
         "DATA_PT_vertex_colors",
         "DATA_PT_camera",
         "DATA_PT_camera_display",
+        "DATA_PT_camera_stereoscopy",
+        "DATA_PT_camera_safe_areas",
         "DATA_PT_lens",
         "DATA_PT_speaker",
         "DATA_PT_distance",
@@ -1437,6 +1591,7 @@ def get_panels():
         "DATA_PT_custom_props_curve",
         "DATA_PT_custom_props_lattice",
         "DATA_PT_custom_props_metaball",
+        "TEXTURE_PT_preview",
         "TEXTURE_PT_custom_props",
         "TEXTURE_PT_clouds",
         "TEXTURE_PT_wood",
@@ -1454,6 +1609,7 @@ def get_panels():
         "TEXTURE_PT_pointdensity",
         "TEXTURE_PT_pointdensity_turbulence",
         "TEXTURE_PT_mapping",
+        "TEXTURE_PT_ocean",
         "TEXTURE_PT_influence",
         "TEXTURE_PT_colors",
         "PARTICLE_PT_context_particles",
@@ -1475,6 +1631,7 @@ def get_panels():
         "PARTICLE_PT_force_fields",
         "PARTICLE_PT_vertexgroups",
         "MATERIAL_PT_custom_props",
+        "MATERIAL_PT_freestyle_line",
         "BONE_PT_custom_props",
         "OBJECT_PT_custom_props",
         ]
diff --git a/intern/cycles/blender/addon/version_update.py b/intern/cycles/blender/addon/version_update.py
index eaeec703ff5..2fbb01ba5b8 100644
--- a/intern/cycles/blender/addon/version_update.py
+++ b/intern/cycles/blender/addon/version_update.py
@@ -11,7 +11,7 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License
+# limitations under the License.
 #
 
 # <pep8 compliant>
@@ -21,6 +21,85 @@ import bpy
 from bpy.app.handlers import persistent
 
 
+def check_is_new_shading_ntree(node_tree):
+    """
+    Return True when node_tree is considered a Cycles (new shading system)
+    tree, i.e. when the versioning code needs to run on it.
+    """
+    for node in node_tree.nodes:
+        # A node compatible ONLY with the new shading system marks the
+        # tree as a Cycles one. Testing for mere membership of
+        # NEW_SHADING is not enough: nodes compatible with both the old
+        # and the new shading system can't be used for any decision.
+        if node.shading_compatibility == {'NEW_SHADING'}:
+            return True
+
+        # A node compatible only with the old shading system means the
+        # tree can not be a Cycles one, so we can stop iterating nodes
+        # now.
+        if node.shading_compatibility == {'OLD_SHADING'}:
+            return False
+    return False
+
+
+def check_is_new_shading_material(material):
+    """Check the material's node tree; False when it has no node tree."""
+    ntree = material.node_tree
+    return check_is_new_shading_ntree(ntree) if ntree else False
+
+
+def check_is_new_shading_world(world):
+    """Check the world's node tree; False when it has no node tree."""
+    ntree = world.node_tree
+    return check_is_new_shading_ntree(ntree) if ntree else False
+
+
+def check_is_new_shading_lamp(lamp):
+    """Check the lamp's node tree; False when it has no node tree."""
+    ntree = lamp.node_tree
+    return check_is_new_shading_ntree(ntree) if ntree else False
+
+
+def foreach_notree_node(nodetree, callback, traversed):
+    """Run callback on nodes of nodetree and nested groups; None is skipped."""
+    if nodetree is not None and nodetree not in traversed:
+        traversed.add(nodetree)
+        for node in nodetree.nodes:
+            callback(node)
+            if node.bl_idname == 'ShaderNodeGroup':
+                foreach_notree_node(node.node_tree, callback, traversed)
+
+
+def foreach_cycles_node(callback):
+    """
+    Call callback(node) on the nodes of every material, world and lamp
+    tree accepted by the matching check_is_new_shading_*() helper, with
+    nested node groups included. The traversed set is shared so that
+    each tree is only visited once.
+    """
+    traversed = set()
+    for material in bpy.data.materials:
+        if check_is_new_shading_material(material):
+            foreach_notree_node(material.node_tree, callback, traversed)
+    for world in bpy.data.worlds:
+        if check_is_new_shading_world(world):
+            foreach_notree_node(world.node_tree, callback, traversed)
+    for lamp in bpy.data.lamps:
+        if check_is_new_shading_lamp(lamp):
+            foreach_notree_node(lamp.node_tree, callback, traversed)
+
+
+def mapping_node_order_flip(node):
+    """
+    Convert a mapping shader node's rotation from ZYX to XYZ euler order
+    """
+    if node.bl_idname != 'ShaderNodeMapping':
+        return
+    zyx = node.rotation.copy()
+    zyx.order = 'ZYX'
+    node.rotation = zyx.to_quaternion().to_euler('XYZ')
+
+
 @persistent
 def do_versions(self):
     # We don't modify startup file because it assumes to
@@ -57,3 +136,7 @@ def do_versions(self):
 
                 cscene.caustics_reflective = False
                 cscene.caustics_refractive = False
+
+    # Euler order was ZYX in previous versions.
+    if bpy.data.version <= (2, 73, 4):
+        foreach_cycles_node(mapping_node_order_flip)
diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp
index d26317b6e32..d45dc62a1e0 100644
--- a/intern/cycles/blender/blender_camera.cpp
+++ b/intern/cycles/blender/blender_camera.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "camera.h"
@@ -20,6 +20,8 @@
 #include "blender_sync.h"
 #include "blender_util.h"
 
+#include "util_logging.h"
+
 CCL_NAMESPACE_BEGIN
 
 /* Blender Camera Intermediate: we first convert both the offline and 3d view
@@ -51,6 +53,10 @@ struct BlenderCamera {
 	PanoramaType panorama_type;
 	float fisheye_fov;
 	float fisheye_lens;
+	float latitude_min;
+	float latitude_max;
+	float longitude_min;
+	float longitude_max;
 
 	enum { AUTO, HORIZONTAL, VERTICAL } sensor_fit;
 	float sensor_width;
@@ -61,11 +67,12 @@ struct BlenderCamera {
 
 	BoundBox2D border;
 	BoundBox2D pano_viewplane;
+	BoundBox2D viewport_camera_border;
 
 	Transform matrix;
 };
 
-static void blender_camera_init(BlenderCamera *bcam, BL::RenderSettings b_render, BL::Scene b_scene)
+static void blender_camera_init(BlenderCamera *bcam, BL::RenderSettings b_render)
 {
 	memset(bcam, 0, sizeof(BlenderCamera));
 
@@ -80,13 +87,15 @@ static void blender_camera_init(BlenderCamera *bcam, BL::RenderSettings b_render
 	bcam->border.top = 1.0f;
 	bcam->pano_viewplane.right = 1.0f;
 	bcam->pano_viewplane.top = 1.0f;
+	bcam->viewport_camera_border.right = 1.0f;
+	bcam->viewport_camera_border.top = 1.0f;
 
 	/* render resolution */
 	bcam->full_width = render_resolution_x(b_render);
 	bcam->full_height = render_resolution_y(b_render);
 }
 
-static float blender_camera_focal_distance(BL::Object b_ob, BL::Camera b_camera)
+static float blender_camera_focal_distance(BL::RenderEngine b_engine, BL::Object b_ob, BL::Camera b_camera)
 {
 	BL::Object b_dof_object = b_camera.dof_object();
 
@@ -94,14 +103,16 @@ static float blender_camera_focal_distance(BL::Object b_ob, BL::Camera b_camera)
 		return b_camera.dof_distance();
 	
 	/* for dof object, return distance along camera Z direction */
-	Transform obmat = transform_clear_scale(get_transform(b_ob.matrix_world()));
+	BL::Array<float, 16> b_ob_matrix;
+	b_engine.camera_model_matrix(b_ob, b_ob_matrix);
+	Transform obmat = get_transform(b_ob_matrix);
 	Transform dofmat = get_transform(b_dof_object.matrix_world());
-	Transform mat = transform_inverse(obmat) * dofmat;
-
-	return fabsf(transform_get_column(&mat, 3).z);
+	float3 view_dir = normalize(transform_get_column(&obmat, 2));
+	float3 dof_dir = transform_get_column(&obmat, 3) - transform_get_column(&dofmat, 3);
+	return fabsf(dot(view_dir, dof_dir));
 }
 
-static void blender_camera_from_object(BlenderCamera *bcam, BL::Object b_ob, bool skip_panorama = false)
+static void blender_camera_from_object(BlenderCamera *bcam, BL::RenderEngine b_engine, BL::Object b_ob, bool skip_panorama = false)
 {
 	BL::ID b_ob_data = b_ob.data();
 
@@ -137,6 +148,9 @@ static void blender_camera_from_object(BlenderCamera *bcam, BL::Object b_ob, boo
 			case 2:
 				bcam->panorama_type = PANORAMA_FISHEYE_EQUISOLID;
 				break;
+			case 3:
+				bcam->panorama_type = PANORAMA_MIRRORBALL;
+				break;
 			case 0:
 			default:
 				bcam->panorama_type = PANORAMA_EQUIRECTANGULAR;
@@ -145,6 +159,10 @@ static void blender_camera_from_object(BlenderCamera *bcam, BL::Object b_ob, boo
 
 		bcam->fisheye_fov = RNA_float_get(&ccamera, "fisheye_fov");
 		bcam->fisheye_lens = RNA_float_get(&ccamera, "fisheye_lens");
+		bcam->latitude_min = RNA_float_get(&ccamera, "latitude_min");
+		bcam->latitude_max = RNA_float_get(&ccamera, "latitude_max");
+		bcam->longitude_min = RNA_float_get(&ccamera, "longitude_min");
+		bcam->longitude_max = RNA_float_get(&ccamera, "longitude_max");
 
 		bcam->ortho_scale = b_camera.ortho_scale();
 
@@ -168,10 +186,10 @@ static void blender_camera_from_object(BlenderCamera *bcam, BL::Object b_ob, boo
 
 		bcam->apertureblades = RNA_int_get(&ccamera, "aperture_blades");
 		bcam->aperturerotation = RNA_float_get(&ccamera, "aperture_rotation");
-		bcam->focaldistance = blender_camera_focal_distance(b_ob, b_camera);
+		bcam->focaldistance = blender_camera_focal_distance(b_engine, b_ob, b_camera);
 		bcam->aperture_ratio = RNA_float_get(&ccamera, "aperture_ratio");
 
-		bcam->shift.x = b_camera.shift_x();
+		bcam->shift.x = b_engine.camera_shift_x(b_ob);
 		bcam->shift.y = b_camera.shift_y();
 
 		bcam->sensor_width = b_camera.sensor_width();
@@ -189,19 +207,34 @@ static void blender_camera_from_object(BlenderCamera *bcam, BL::Object b_ob, boo
 	}
 }
 
-static Transform blender_camera_matrix(const Transform& tfm, CameraType type)
+static Transform blender_camera_matrix(const Transform& tfm,
+                                       const CameraType type,
+                                       const PanoramaType panorama_type)
 {
 	Transform result;
 
 	if(type == CAMERA_PANORAMA) {
-		/* make it so environment camera needs to be pointed in the direction
-		 * of the positive x-axis to match an environment texture, this way
-		 * it is looking at the center of the texture */
-		result = tfm *
-			make_transform( 0.0f, -1.0f, 0.0f, 0.0f,
-			                0.0f,  0.0f, 1.0f, 0.0f,
-			               -1.0f,  0.0f, 0.0f, 0.0f,
-			                0.0f,  0.0f, 0.0f, 1.0f);
+		if(panorama_type == PANORAMA_MIRRORBALL) {
+			/* Mirror ball camera is looking into the negative Y direction
+			 * which matches texture mirror ball mapping.
+			 */
+			result = tfm *
+				make_transform(1.0f, 0.0f, 0.0f, 0.0f,
+				               0.0f, 0.0f, 1.0f, 0.0f,
+				               0.0f, 1.0f, 0.0f, 0.0f,
+				               0.0f, 0.0f, 0.0f, 1.0f);
+		}
+		else {
+			/* Make it so environment camera needs to be pointed in the direction
+			 * of the positive x-axis to match an environment texture, this way
+			 * it is looking at the center of the texture
+			 */
+			result = tfm *
+				make_transform( 0.0f, -1.0f, 0.0f, 0.0f,
+				                0.0f,  0.0f, 1.0f, 0.0f,
+				               -1.0f,  0.0f, 0.0f, 0.0f,
+				                0.0f,  0.0f, 0.0f, 1.0f);
+		}
 	}
 	else {
 		/* note the blender camera points along the negative z-axis */
@@ -211,8 +244,11 @@ static Transform blender_camera_matrix(const Transform& tfm, CameraType type)
 	return transform_clear_scale(result);
 }
 
-static void blender_camera_viewplane(BlenderCamera *bcam, int width, int height,
-	BoundBox2D *viewplane, float *aspectratio, float *sensor_size)
+static void blender_camera_viewplane(BlenderCamera *bcam,
+                                     int width, int height,
+                                     BoundBox2D *viewplane,
+                                     float *aspectratio,
+                                     float *sensor_size)
 {
 	/* dimensions */
 	float xratio = (float)width*bcam->pixelaspect.x;
@@ -225,24 +261,34 @@ static void blender_camera_viewplane(BlenderCamera *bcam, int width, int height,
 	/* sensor fitting */
 	if(bcam->sensor_fit == BlenderCamera::AUTO) {
 		horizontal_fit = (xratio > yratio);
-		*sensor_size = bcam->sensor_width;
+		if(sensor_size != NULL) {
+			*sensor_size = bcam->sensor_width;
+		}
 	}
 	else if(bcam->sensor_fit == BlenderCamera::HORIZONTAL) {
 		horizontal_fit = true;
-		*sensor_size = bcam->sensor_width;
+		if(sensor_size != NULL) {
+			*sensor_size = bcam->sensor_width;
+		}
 	}
 	else {
 		horizontal_fit = false;
-		*sensor_size = bcam->sensor_height;
+		if(sensor_size != NULL) {
+			*sensor_size = bcam->sensor_height;
+		}
 	}
 
 	if(horizontal_fit) {
-		*aspectratio = xratio/yratio;
+		if(aspectratio != NULL) {
+			*aspectratio = xratio/yratio;
+		}
 		xaspect = *aspectratio;
 		yaspect = 1.0f;
 	}
 	else {
-		*aspectratio = yratio/xratio;
+		if(aspectratio != NULL) {
+			*aspectratio = yratio/xratio;
+		}
 		xaspect = 1.0f;
 		yaspect = *aspectratio;
 	}
@@ -251,31 +297,37 @@ static void blender_camera_viewplane(BlenderCamera *bcam, int width, int height,
 	if(bcam->type == CAMERA_ORTHOGRAPHIC) {
 		xaspect = xaspect*bcam->ortho_scale/(*aspectratio*2.0f);
 		yaspect = yaspect*bcam->ortho_scale/(*aspectratio*2.0f);
-		*aspectratio = bcam->ortho_scale/2.0f;
+		if(aspectratio != NULL) {
+			*aspectratio = bcam->ortho_scale/2.0f;
+		}
 	}
 
 	if(bcam->type == CAMERA_PANORAMA) {
 		/* set viewplane */
-		*viewplane = bcam->pano_viewplane;
+		if(viewplane != NULL) {
+			*viewplane = bcam->pano_viewplane;
+		}
 	}
 	else {
 		/* set viewplane */
-		viewplane->left = -xaspect;
-		viewplane->right = xaspect;
-		viewplane->bottom = -yaspect;
-		viewplane->top = yaspect;
-
-		/* zoom for 3d camera view */
-		*viewplane = (*viewplane) * bcam->zoom;
-
-		/* modify viewplane with camera shift and 3d camera view offset */
-		float dx = 2.0f*(*aspectratio*bcam->shift.x + bcam->offset.x*xaspect*2.0f);
-		float dy = 2.0f*(*aspectratio*bcam->shift.y + bcam->offset.y*yaspect*2.0f);
-
-		viewplane->left += dx;
-		viewplane->right += dx;
-		viewplane->bottom += dy;
-		viewplane->top += dy;
+		if(viewplane != NULL) {
+			viewplane->left = -xaspect;
+			viewplane->right = xaspect;
+			viewplane->bottom = -yaspect;
+			viewplane->top = yaspect;
+
+			/* zoom for 3d camera view */
+			*viewplane = (*viewplane) * bcam->zoom;
+
+			/* modify viewplane with camera shift and 3d camera view offset */
+			float dx = 2.0f*(*aspectratio*bcam->shift.x + bcam->offset.x*xaspect*2.0f);
+			float dy = 2.0f*(*aspectratio*bcam->shift.y + bcam->offset.y*yaspect*2.0f);
+
+			viewplane->left += dx;
+			viewplane->right += dx;
+			viewplane->bottom += dy;
+			viewplane->top += dy;
+		}
 	}
 }
 
@@ -292,7 +344,7 @@ static void blender_camera_sync(Camera *cam, BlenderCamera *bcam, int width, int
 		&cam->viewplane, &aspectratio, &sensor_size);
 
 	/* panorama sensor */
-	if (bcam->type == CAMERA_PANORAMA && bcam->panorama_type == PANORAMA_FISHEYE_EQUISOLID) {
+	if(bcam->type == CAMERA_PANORAMA && bcam->panorama_type == PANORAMA_FISHEYE_EQUISOLID) {
 		float fit_xratio = (float)bcam->full_width*bcam->pixelaspect.x;
 		float fit_yratio = (float)bcam->full_height*bcam->pixelaspect.y;
 		bool horizontal_fit;
@@ -336,6 +388,11 @@ static void blender_camera_sync(Camera *cam, BlenderCamera *bcam, int width, int
 	cam->panorama_type = bcam->panorama_type;
 	cam->fisheye_fov = bcam->fisheye_fov;
 	cam->fisheye_lens = bcam->fisheye_lens;
+	cam->latitude_min = bcam->latitude_min;
+	cam->latitude_max = bcam->latitude_max;
+
+	cam->longitude_min = bcam->longitude_min;
+	cam->longitude_max = bcam->longitude_max;
 
 	/* anamorphic lens bokeh */
 	cam->aperture_ratio = bcam->aperture_ratio;
@@ -348,14 +405,20 @@ static void blender_camera_sync(Camera *cam, BlenderCamera *bcam, int width, int
 	cam->bladesrotation = bcam->aperturerotation;
 
 	/* transform */
-	cam->matrix = blender_camera_matrix(bcam->matrix, bcam->type);
+	cam->matrix = blender_camera_matrix(bcam->matrix,
+	                                    bcam->type,
+	                                    bcam->panorama_type);
 	cam->motion.pre = cam->matrix;
 	cam->motion.post = cam->matrix;
 	cam->use_motion = false;
+	cam->use_perspective_motion = false;
 	cam->shuttertime = bcam->shuttertime;
+	cam->fov_pre = cam->fov;
+	cam->fov_post = cam->fov;
 
 	/* border */
 	cam->border = bcam->border;
+	cam->viewport_camera_border = bcam->viewport_camera_border;
 
 	/* set update flag */
 	if(cam->modified(prevcam))
@@ -367,7 +430,7 @@ static void blender_camera_sync(Camera *cam, BlenderCamera *bcam, int width, int
 void BlenderSync::sync_camera(BL::RenderSettings b_render, BL::Object b_override, int width, int height)
 {
 	BlenderCamera bcam;
-	blender_camera_init(&bcam, b_render, b_scene);
+	blender_camera_init(&bcam, b_render);
 
 	/* pixel aspect */
 	bcam.pixelaspect.x = b_render.pixel_aspect_x();
@@ -389,25 +452,34 @@ void BlenderSync::sync_camera(BL::RenderSettings b_render, BL::Object b_override
 		b_ob = b_override;
 
 	if(b_ob) {
-		blender_camera_from_object(&bcam, b_ob);
-		bcam.matrix = get_transform(b_ob.matrix_world());
+		BL::Array<float, 16> b_ob_matrix;
+		blender_camera_from_object(&bcam, b_engine, b_ob);
+		b_engine.camera_model_matrix(b_ob, b_ob_matrix);
+		bcam.matrix = get_transform(b_ob_matrix);
 	}
 
 	/* sync */
 	Camera *cam = scene->camera;
 	blender_camera_sync(cam, &bcam, width, height);
 	sync_camera_nodes(b_ob);
-	scene->camera->use_camera_in_volume = experimental;
 }
 
-void BlenderSync::sync_camera_motion(BL::Object b_ob, float motion_time)
+void BlenderSync::sync_camera_motion(BL::RenderSettings b_render,
+                                     BL::Object b_ob,
+                                     int width, int height,
+                                     float motion_time)
 {
-	Camera *cam = scene->camera;
+	if(!b_ob)
+		return;
 
-	Transform tfm = get_transform(b_ob.matrix_world());
-	tfm = blender_camera_matrix(tfm, cam->type);
+	Camera *cam = scene->camera;
+	BL::Array<float, 16> b_ob_matrix;
+	b_engine.camera_model_matrix(b_ob, b_ob_matrix);
+	Transform tfm = get_transform(b_ob_matrix);
+	tfm = blender_camera_matrix(tfm, cam->type, cam->panorama_type);
 
 	if(tfm != cam->matrix) {
+		VLOG(1) << "Camera " << b_ob.name() << " motion detected.";
 		if(motion_time == -1.0f) {
 			cam->motion.pre = tfm;
 			cam->use_motion = true;
@@ -417,14 +489,39 @@ void BlenderSync::sync_camera_motion(BL::Object b_ob, float motion_time)
 			cam->use_motion = true;
 		}
 	}
+
+	if(cam->type == CAMERA_PERSPECTIVE) {
+		BlenderCamera bcam;
+		float aspectratio, sensor_size;
+		blender_camera_init(&bcam, b_render);
+		blender_camera_from_object(&bcam, b_engine, b_ob);
+		blender_camera_viewplane(&bcam,
+		                         width, height,
+		                         NULL,
+		                         &aspectratio,
+		                         &sensor_size);
+		/* TODO(sergey): De-duplicate calculation with camera sync. */
+		float fov = 2.0f * atanf((0.5f * sensor_size) / bcam.lens / aspectratio);
+		if(fov != cam->fov) {
+			VLOG(1) << "Camera " << b_ob.name() << " FOV change detected.";
+			if(motion_time == -1.0f) {
+				cam->fov_pre = fov;
+				cam->use_perspective_motion = true;
+			}
+			else if(motion_time == 1.0f) {
+				cam->fov_post = fov;
+				cam->use_perspective_motion = true;
+			}
+		}
+	}
 }
 
 /* Sync 3D View Camera */
 
-static void blender_camera_view_subset(BL::RenderSettings b_render, BL::Scene b_scene, BL::Object b_ob, BL::SpaceView3D b_v3d,
+static void blender_camera_view_subset(BL::RenderEngine b_engine, BL::RenderSettings b_render, BL::Scene b_scene, BL::Object b_ob, BL::SpaceView3D b_v3d,
 	BL::RegionView3D b_rv3d, int width, int height, BoundBox2D *view_box, BoundBox2D *cam_box);
 
-static void blender_camera_from_view(BlenderCamera *bcam, BL::Scene b_scene, BL::SpaceView3D b_v3d, BL::RegionView3D b_rv3d, int width, int height, bool skip_panorama = false)
+static void blender_camera_from_view(BlenderCamera *bcam, BL::RenderEngine b_engine, BL::Scene b_scene, BL::SpaceView3D b_v3d, BL::RegionView3D b_rv3d, int width, int height, bool skip_panorama = false)
 {
 	/* 3d view parameters */
 	bcam->nearclip = b_v3d.clip_start();
@@ -437,13 +534,13 @@ static void blender_camera_from_view(BlenderCamera *bcam, BL::Scene b_scene, BL:
 		BL::Object b_ob = (b_v3d.lock_camera_and_layers())? b_scene.camera(): b_v3d.camera();
 
 		if(b_ob) {
-			blender_camera_from_object(bcam, b_ob, skip_panorama);
+			blender_camera_from_object(bcam, b_engine, b_ob, skip_panorama);
 
 			if(!skip_panorama && bcam->type == CAMERA_PANORAMA) {
 				/* in panorama camera view, we map viewplane to camera border */
 				BoundBox2D view_box, cam_box;
 
-				blender_camera_view_subset(b_scene.render(), b_scene, b_ob, b_v3d, b_rv3d, width, height,
+				blender_camera_view_subset(b_engine, b_scene.render(), b_scene, b_ob, b_v3d, b_rv3d, width, height,
 					&view_box, &cam_box);
 
 				bcam->pano_viewplane = view_box.make_relative_to(cam_box);
@@ -481,7 +578,7 @@ static void blender_camera_from_view(BlenderCamera *bcam, BL::Scene b_scene, BL:
 	bcam->matrix = transform_inverse(get_transform(b_rv3d.view_matrix()));
 }
 
-static void blender_camera_view_subset(BL::RenderSettings b_render, BL::Scene b_scene, BL::Object b_ob, BL::SpaceView3D b_v3d,
+static void blender_camera_view_subset(BL::RenderEngine b_engine, BL::RenderSettings b_render, BL::Scene b_scene, BL::Object b_ob, BL::SpaceView3D b_v3d,
 	BL::RegionView3D b_rv3d, int width, int height, BoundBox2D *view_box, BoundBox2D *cam_box)
 {
 	BoundBox2D cam, view;
@@ -489,16 +586,16 @@ static void blender_camera_view_subset(BL::RenderSettings b_render, BL::Scene b_
 
 	/* get viewport viewplane */
 	BlenderCamera view_bcam;
-	blender_camera_init(&view_bcam, b_render, b_scene);
-	blender_camera_from_view(&view_bcam, b_scene, b_v3d, b_rv3d, width, height, true);
+	blender_camera_init(&view_bcam, b_render);
+	blender_camera_from_view(&view_bcam, b_engine, b_scene, b_v3d, b_rv3d, width, height, true);
 
 	blender_camera_viewplane(&view_bcam, width, height,
 		&view, &view_aspect, &sensor_size);
 
 	/* get camera viewplane */
 	BlenderCamera cam_bcam;
-	blender_camera_init(&cam_bcam, b_render, b_scene);
-	blender_camera_from_object(&cam_bcam, b_ob, true);
+	blender_camera_init(&cam_bcam, b_render);
+	blender_camera_from_object(&cam_bcam, b_engine, b_ob, true);
 
 	blender_camera_viewplane(&cam_bcam, cam_bcam.full_width, cam_bcam.full_height,
 		&cam, &cam_aspect, &sensor_size);
@@ -508,7 +605,27 @@ static void blender_camera_view_subset(BL::RenderSettings b_render, BL::Scene b_
 	*cam_box = cam * (1.0f/cam_aspect);
 }
 
-static void blender_camera_border(BlenderCamera *bcam, BL::RenderSettings b_render, BL::Scene b_scene, BL::SpaceView3D b_v3d,
+static void blender_camera_border_subset(BL::RenderEngine b_engine,
+                                         BL::RenderSettings b_render,
+                                         BL::Scene b_scene,
+                                         BL::SpaceView3D b_v3d,
+                                         BL::RegionView3D b_rv3d,
+                                         BL::Object b_ob,
+                                         int width, int height,
+                                         const BoundBox2D &border,
+                                         BoundBox2D *result)
+{
+	/* Boxes of the viewport and of the camera, from the view subset. */
+	BoundBox2D viewport_box, camera_box;
+	blender_camera_view_subset(b_engine, b_render, b_scene, b_ob, b_v3d, b_rv3d,
+	                           width, height, &viewport_box, &camera_box);
+
+	/* Make the camera box relative to the viewport one, then take the
+	 * subset of it matching the given border. */
+	*result = camera_box.make_relative_to(viewport_box).subset(border);
+}
+
+static void blender_camera_border(BlenderCamera *bcam, BL::RenderEngine b_engine, BL::RenderSettings b_render, BL::Scene b_scene, BL::SpaceView3D b_v3d,
 	BL::RegionView3D b_rv3d, int width, int height)
 {
 	bool is_camera_view;
@@ -526,47 +643,61 @@ static void blender_camera_border(BlenderCamera *bcam, BL::RenderSettings b_rend
 			bcam->border.right = b_v3d.render_border_max_x();
 			bcam->border.bottom = b_v3d.render_border_min_y();
 			bcam->border.top = b_v3d.render_border_max_y();
-
-			return;
 		}
-	}
-	else if(!b_render.use_border())
 		return;
+	}
 
 	BL::Object b_ob = (b_v3d.lock_camera_and_layers())? b_scene.camera(): b_v3d.camera();
 
 	if(!b_ob)
 		return;
 
+	/* Determine camera border inside the viewport. */
+	BoundBox2D full_border;
+	blender_camera_border_subset(b_engine,
+	                             b_render,
+	                             b_scene,
+	                             b_v3d,
+	                             b_rv3d,
+	                             b_ob,
+	                             width, height,
+	                             full_border,
+	                             &bcam->viewport_camera_border);
+
+	if(!b_render.use_border()) {
+		return;
+	}
+
 	bcam->border.left = b_render.border_min_x();
 	bcam->border.right = b_render.border_max_x();
 	bcam->border.bottom = b_render.border_min_y();
 	bcam->border.top = b_render.border_max_y();
 
-	/* determine camera viewport subset */
-	BoundBox2D view_box, cam_box;
-
-	blender_camera_view_subset(b_render, b_scene, b_ob, b_v3d, b_rv3d, width, height,
-		&view_box, &cam_box);
-
-	/* determine viewport subset matching camera border */
-	cam_box = cam_box.make_relative_to(view_box);
-	bcam->border = cam_box.subset(bcam->border).clamp();
+	/* Determine viewport subset matching camera border. */
+	blender_camera_border_subset(b_engine,
+	                             b_render,
+	                             b_scene,
+	                             b_v3d, b_rv3d,
+	                             b_ob,
+	                             width, height,
+	                             bcam->border,
+	                             &bcam->border);
+	/* clamp() hands back the clamped box, so the result has to be kept. */
+	bcam->border = bcam->border.clamp();
 }
 
 void BlenderSync::sync_view(BL::SpaceView3D b_v3d, BL::RegionView3D b_rv3d, int width, int height)
 {
 	BlenderCamera bcam;
-	blender_camera_init(&bcam, b_scene.render(), b_scene);
-	blender_camera_from_view(&bcam, b_scene, b_v3d, b_rv3d, width, height);
-	blender_camera_border(&bcam, b_scene.render(), b_scene, b_v3d, b_rv3d, width, height);
+	blender_camera_init(&bcam, b_scene.render());
+	blender_camera_from_view(&bcam, b_engine, b_scene, b_v3d, b_rv3d, width, height);
+	blender_camera_border(&bcam, b_engine, b_scene.render(), b_scene, b_v3d, b_rv3d, width, height);
 
 	blender_camera_sync(scene->camera, &bcam, width, height);
 	sync_view_nodes(b_rv3d);
-	scene->camera->use_camera_in_volume = experimental;
 }
 
-BufferParams BlenderSync::get_buffer_params(BL::RenderSettings b_render, BL::Scene b_scene, BL::SpaceView3D b_v3d, BL::RegionView3D b_rv3d, Camera *cam, int width, int height)
+BufferParams BlenderSync::get_buffer_params(BL::RenderSettings b_render, BL::SpaceView3D b_v3d, BL::RegionView3D b_rv3d, Camera *cam, int width, int height)
 {
 	BufferParams params;
 	bool use_border = false;
@@ -581,10 +712,12 @@ BufferParams BlenderSync::get_buffer_params(BL::RenderSettings b_render, BL::Sce
 
 	if(use_border) {
 		/* border render */
-		params.full_x = (int)(cam->border.left * (float)width);
-		params.full_y = (int)(cam->border.bottom * (float)height);
-		params.width = (int)(cam->border.right * (float)width) - params.full_x;
-		params.height = (int)(cam->border.top * (float)height) - params.full_y;
+		/* the viewport may offset the border outside the view */
+		BoundBox2D border = cam->border.clamp();
+		params.full_x = (int)(border.left * (float)width);
+		params.full_y = (int)(border.bottom * (float)height);
+		params.width = (int)(border.right * (float)width) - params.full_x;
+		params.height = (int)(border.top * (float)height) - params.full_y;
 
 		/* survive in case border goes out of view or becomes too small */
 		params.width = max(params.width, 1);
diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp
index 8cfaea59a06..6a119081bfd 100644
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "attribute.h"
@@ -25,6 +25,7 @@
 #include "blender_util.h"
 
 #include "util_foreach.h"
+#include "util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -43,8 +44,8 @@ void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData);
 void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
                                float3 RotCam, bool is_ortho);
 void ExportCurveTriangleGeometry(Mesh *mesh, ParticleCurveData *CData, int resolution);
-void ExportCurveTriangleUV(Mesh *mesh, ParticleCurveData *CData, int vert_offset, int resol, float3 *uvdata);
-void ExportCurveTriangleVcol(Mesh *mesh, ParticleCurveData *CData, int vert_offset, int resol, uchar4 *cdata);
+void ExportCurveTriangleUV(ParticleCurveData *CData, int vert_offset, int resol, float3 *uvdata);
+void ExportCurveTriangleVcol(ParticleCurveData *CData, int vert_offset, int resol, uchar4 *cdata);
 
 ParticleCurveData::ParticleCurveData()
 {
@@ -140,17 +141,20 @@ bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par
 				int mi = clamp(b_part.material()-1, 0, mesh->used_shaders.size()-1);
 				int shader = mesh->used_shaders[mi];
 				int draw_step = background ? b_part.render_step() : b_part.draw_step();
-				int ren_step = (int)powf(2.0f, (float)draw_step);
 				int totparts = b_psys.particles.length();
 				int totchild = background ? b_psys.child_particles.length() : (int)((float)b_psys.child_particles.length() * (float)b_part.draw_percentage() / 100.0f);
 				int totcurves = totchild;
 				
-				if(b_part.child_type() == 0)
+				if(b_part.child_type() == 0 || totchild == 0)
 					totcurves += totparts;
 
 				if(totcurves == 0)
 					continue;
 
+				int ren_step = (1 << draw_step) + 1;
+				if(b_part.kink() == BL::ParticleSettings::kink_SPIRAL)
+					ren_step += b_part.kink_extra_steps();
+
 				PointerRNA cpsys = RNA_pointer_get(&b_part.ptr, "cycles");
 
 				CData->psys_firstcurve.push_back(curvenum);
@@ -165,15 +169,15 @@ bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par
 				CData->psys_closetip.push_back(get_boolean(cpsys, "use_closetip"));
 
 				int pa_no = 0;
-				if(!(b_part.child_type() == 0))
+				if(!(b_part.child_type() == 0) && totchild != 0)
 					pa_no = totparts;
 
 				int num_add = (totparts+totchild - pa_no);
 				CData->curve_firstkey.reserve(CData->curve_firstkey.size() + num_add);
 				CData->curve_keynum.reserve(CData->curve_keynum.size() + num_add);
 				CData->curve_length.reserve(CData->curve_length.size() + num_add);
-				CData->curvekey_co.reserve(CData->curvekey_co.size() + num_add*(ren_step+1));
-				CData->curvekey_time.reserve(CData->curvekey_time.size() + num_add*(ren_step+1));
+				CData->curvekey_co.reserve(CData->curvekey_co.size() + num_add*ren_step);
+				CData->curvekey_time.reserve(CData->curvekey_time.size() + num_add*ren_step);
 
 				for(; pa_no < totparts+totchild; pa_no++) {
 					int keynum = 0;
@@ -181,7 +185,7 @@ bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par
 					
 					float curve_length = 0.0f;
 					float3 pcKey;
-					for(int step_no = 0; step_no <= ren_step; step_no++) {
+					for(int step_no = 0; step_no < ren_step; step_no++) {
 						float nco[3];
 						b_psys.co_hair(*b_ob, pa_no, step_no, nco);
 						float3 cKey = make_float3(nco[0], nco[1], nco[2]);
@@ -195,9 +199,9 @@ bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par
 						CData->curvekey_co.push_back(cKey);
 						CData->curvekey_time.push_back(curve_length);
 						pcKey = cKey;
-						keyno++;
 						keynum++;
 					}
+					keyno += keynum;
 
 					CData->curve_keynum.push_back(keynum);
 					CData->curve_length.push_back(curve_length);
@@ -229,14 +233,14 @@ bool ObtainCacheParticleUV(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Parti
 				int totchild = background ? b_psys.child_particles.length() : (int)((float)b_psys.child_particles.length() * (float)b_part.draw_percentage() / 100.0f);
 				int totcurves = totchild;
 				
-				if (b_part.child_type() == 0)
+				if(b_part.child_type() == 0 || totchild == 0)
 					totcurves += totparts;
 
-				if (totcurves == 0)
+				if(totcurves == 0)
 					continue;
 
 				int pa_no = 0;
-				if(!(b_part.child_type() == 0))
+				if(!(b_part.child_type() == 0) && totchild != 0)
 					pa_no = totparts;
 
 				int num_add = (totparts+totchild - pa_no);
@@ -283,14 +287,14 @@ bool ObtainCacheParticleVcol(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par
 				int totchild = background ? b_psys.child_particles.length() : (int)((float)b_psys.child_particles.length() * (float)b_part.draw_percentage() / 100.0f);
 				int totcurves = totchild;
 				
-				if (b_part.child_type() == 0)
+				if(b_part.child_type() == 0 || totchild == 0)
 					totcurves += totparts;
 
-				if (totcurves == 0)
+				if(totcurves == 0)
 					continue;
 
 				int pa_no = 0;
-				if(!(b_part.child_type() == 0))
+				if(!(b_part.child_type() == 0) && totchild != 0)
 					pa_no = totparts;
 
 				int num_add = (totparts+totchild - pa_no);
@@ -318,11 +322,11 @@ bool ObtainCacheParticleVcol(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par
 	return true;
 }
 
-static void set_resolution(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, BL::Scene *scene, bool render)
+static void set_resolution(BL::Object *b_ob, BL::Scene *scene, bool render)
 {
 	BL::Object::modifiers_iterator b_mod;
 	for(b_ob->modifiers.begin(b_mod); b_mod != b_ob->modifiers.end(); ++b_mod) {
-		if ((b_mod->type() == b_mod->type_PARTICLE_SYSTEM) && ((b_mod->show_viewport()) || (b_mod->show_render()))) {
+		if((b_mod->type() == b_mod->type_PARTICLE_SYSTEM) && ((b_mod->show_viewport()) || (b_mod->show_render()))) {
 			BL::ParticleSystemModifier psmd((const PointerRNA)b_mod->ptr);
 			BL::ParticleSystem b_psys((const PointerRNA)psmd.particle_system().ptr);
 			b_psys.set_resolution(*scene, *b_ob, (render)? 2: 1);
@@ -509,7 +513,7 @@ void ExportCurveTriangleGeometry(Mesh *mesh, ParticleCurveData *CData, int resol
 
 				ybasis = normalize(cross(xbasis, v2));
 
-				for (; subv <= 1; subv++) {
+				for(; subv <= 1; subv++) {
 					float3 ickey_loc = make_float3(0.0f,0.0f,0.0f);
 					float time = 0.0f;
 
@@ -577,6 +581,10 @@ void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
 		}
 	}
 
+	if(num_curves > 0) {
+		VLOG(1) << "Exporting curve segments for mesh " << mesh->name;
+	}
+
 	mesh->curve_keys.reserve(mesh->curve_keys.size() + num_keys);
 	mesh->curves.reserve(mesh->curves.size() + num_curves);
 
@@ -612,23 +620,27 @@ void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
 		}
 	}
 
-	/* check allocation*/
-	if((mesh->curve_keys.size() !=  num_keys) || (mesh->curves.size() !=  num_curves)) {
-		/* allocation failed -> clear data */
+	/* check allocation */
+	if((mesh->curve_keys.size() != num_keys) || (mesh->curves.size() != num_curves)) {
+		VLOG(1) << "Allocation failed, clearing data";
 		mesh->curve_keys.clear();
 		mesh->curves.clear();
 		mesh->curve_attributes.clear();
 	}
 }
 
-static void ExportCurveSegmentsMotion(Scene *scene, Mesh *mesh, ParticleCurveData *CData, int time_index)
+static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int time_index)
 {
+	VLOG(1) << "Exporting curve motion segments for mesh " << mesh->name
+	        << ", time index " << time_index;
+
 	/* find attribute */
 	Attribute *attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
 	bool new_attribute = false;
 
 	/* add new attribute if it doesn't exist already */
 	if(!attr_mP) {
+		VLOG(1) << "Creating new motion vertex position attribute";
 		attr_mP = mesh->curve_attributes.add(ATTR_STD_MOTION_VERTEX_POSITION);
 		new_attribute = true;
 	}
@@ -675,9 +687,12 @@ static void ExportCurveSegmentsMotion(Scene *scene, Mesh *mesh, ParticleCurveDat
 	if(new_attribute) {
 		if(i != numkeys || !have_motion) {
 			/* no motion, remove attributes again */
+			VLOG(1) << "No motion, removing attribute";
 			mesh->curve_attributes.remove(ATTR_STD_MOTION_VERTEX_POSITION);
 		}
 		else if(time_index > 0) {
+			VLOG(1) << "Filling in new motion vertex position for time_index "
+			        << time_index;
 			/* motion, fill up previous steps that we might have skipped because
 			 * they had no motion, but we need them anyway now */
 			for(int step = 0; step < time_index; step++) {
@@ -690,7 +705,7 @@ static void ExportCurveSegmentsMotion(Scene *scene, Mesh *mesh, ParticleCurveDat
 	}
 }
 
-void ExportCurveTriangleUV(Mesh *mesh, ParticleCurveData *CData, int vert_offset, int resol, float3 *uvdata)
+void ExportCurveTriangleUV(ParticleCurveData *CData, int vert_offset, int resol, float3 *uvdata)
 {
 	if(uvdata == NULL)
 		return;
@@ -735,7 +750,7 @@ void ExportCurveTriangleUV(Mesh *mesh, ParticleCurveData *CData, int vert_offset
 	}
 }
 
-void ExportCurveTriangleVcol(Mesh *mesh, ParticleCurveData *CData, int vert_offset, int resol, uchar4 *cdata)
+void ExportCurveTriangleVcol(ParticleCurveData *CData, int vert_offset, int resol, uchar4 *cdata)
 {
 	if(cdata == NULL)
 		return;
@@ -786,30 +801,38 @@ void BlenderSync::sync_curve_settings()
 	curve_system_manager->subdivisions = get_int(csscene, "subdivisions");
 	curve_system_manager->use_backfacing = !get_boolean(csscene, "cull_backfacing");
 
-	if(curve_system_manager->primitive == CURVE_TRIANGLES && curve_system_manager->curve_shape == CURVE_RIBBON) {
-		/* camera facing planes */
-		curve_system_manager->triangle_method = CURVE_CAMERA_TRIANGLES;
-		curve_system_manager->resolution = 1;
-	}
-	else if(curve_system_manager->primitive == CURVE_TRIANGLES && curve_system_manager->curve_shape == CURVE_THICK) {
+	/* Triangles */
+	if(curve_system_manager->primitive == CURVE_TRIANGLES) {
 		/* camera facing planes */
-		curve_system_manager->triangle_method = CURVE_TESSELATED_TRIANGLES;
-	}
-	else if(curve_system_manager->primitive == CURVE_LINE_SEGMENTS && curve_system_manager->curve_shape == CURVE_RIBBON) {
-		/* tangent shading */
-		curve_system_manager->line_method = CURVE_UNCORRECTED;
-		curve_system_manager->use_encasing = true;
-		curve_system_manager->use_backfacing = false;
-		curve_system_manager->use_tangent_normal_geometry = true;
+		if(curve_system_manager->curve_shape == CURVE_RIBBON) {
+			curve_system_manager->triangle_method = CURVE_CAMERA_TRIANGLES;
+			curve_system_manager->resolution = 1;
+		}
+		else if(curve_system_manager->curve_shape == CURVE_THICK) {
+			curve_system_manager->triangle_method = CURVE_TESSELATED_TRIANGLES;
+		}
 	}
-	else if(curve_system_manager->primitive == CURVE_LINE_SEGMENTS && curve_system_manager->curve_shape == CURVE_THICK) {
-		curve_system_manager->line_method = CURVE_ACCURATE;
-		curve_system_manager->use_encasing = false;
-		curve_system_manager->use_tangent_normal_geometry = false;
+	/* Line Segments */
+	else if(curve_system_manager->primitive == CURVE_LINE_SEGMENTS) {
+		if(curve_system_manager->curve_shape == CURVE_RIBBON) {
+			/* tangent shading */
+			curve_system_manager->line_method = CURVE_UNCORRECTED;
+			curve_system_manager->use_encasing = true;
+			curve_system_manager->use_backfacing = false;
+			curve_system_manager->use_tangent_normal_geometry = true;
+		}
+		else if(curve_system_manager->curve_shape == CURVE_THICK) {
+			curve_system_manager->line_method = CURVE_ACCURATE;
+			curve_system_manager->use_encasing = false;
+			curve_system_manager->use_tangent_normal_geometry = false;
+		}
 	}
-	else if(curve_system_manager->primitive == CURVE_SEGMENTS && curve_system_manager->curve_shape == CURVE_RIBBON) {
-		curve_system_manager->primitive = CURVE_RIBBONS;
-		curve_system_manager->use_backfacing = false;
+	/* Curve Segments */
+	else if(curve_system_manager->primitive == CURVE_SEGMENTS) {
+		if(curve_system_manager->curve_shape == CURVE_RIBBON) {
+			curve_system_manager->primitive = CURVE_RIBBONS;
+			curve_system_manager->use_backfacing = false;
+		}
 	}
 
 	if(curve_system_manager->modified_mesh(prev_curve_system_manager)) {
@@ -863,7 +886,7 @@ void BlenderSync::sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, bool
 	ParticleCurveData CData;
 
 	if(!preview)
-		set_resolution(mesh, &b_mesh, &b_ob, &b_scene, true);
+		set_resolution(&b_ob, &b_scene, true);
 
 	ObtainCacheParticleData(mesh, &b_mesh, &b_ob, &CData, !preview);
 
@@ -894,7 +917,7 @@ void BlenderSync::sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, bool
 	}
 	else {
 		if(motion)
-			ExportCurveSegmentsMotion(scene, mesh, &CData, time_index);
+			ExportCurveSegmentsMotion(mesh, &CData, time_index);
 		else
 			ExportCurveSegments(scene, mesh, &CData);
 	}
@@ -943,7 +966,7 @@ void BlenderSync::sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, bool
 
 				uchar4 *cdata = attr_vcol->data_uchar4();
 
-				ExportCurveTriangleVcol(mesh, &CData, tri_num * 3, used_res, cdata);
+				ExportCurveTriangleVcol(&CData, tri_num * 3, used_res, cdata);
 			}
 			else {
 				Attribute *attr_vcol = mesh->curve_attributes.add(
@@ -986,7 +1009,7 @@ void BlenderSync::sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, bool
 
 					float3 *uv = attr_uv->data_float3();
 
-					ExportCurveTriangleUV(mesh, &CData, tri_num * 3, used_res, uv);
+					ExportCurveTriangleUV(&CData, tri_num * 3, used_res, uv);
 				}
 				else {
 					if(active_render)
@@ -1009,7 +1032,7 @@ void BlenderSync::sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, bool
 	}
 
 	if(!preview)
-		set_resolution(mesh, &b_mesh, &b_ob, &b_scene, false);
+		set_resolution(&b_ob, &b_scene, false);
 
 	mesh->compute_bounds();
 }
diff --git a/intern/cycles/blender/blender_logging.cpp b/intern/cycles/blender/blender_logging.cpp
index d3f1accf099..f4f86929168 100644
--- a/intern/cycles/blender/blender_logging.cpp
+++ b/intern/cycles/blender/blender_logging.cpp
@@ -11,55 +11,23 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "CCL_api.h"
-
-#include <stdio.h>
-
 #include "util_logging.h"
 
-#ifdef _MSC_VER
-#  define snprintf _snprintf
-#endif
-
 void CCL_init_logging(const char *argv0)
 {
-#ifdef WITH_CYCLES_LOGGING
-	/* Make it so FATAL messages are always print into console. */
-	char severity_fatal[32];
-	snprintf(severity_fatal, sizeof(severity_fatal), "%d",
-	         google::GLOG_FATAL);
-
-	google::InitGoogleLogging(argv0);
-	google::SetCommandLineOption("logtostderr", "1");
-	google::SetCommandLineOption("v", "0");
-	google::SetCommandLineOption("stderrthreshold", severity_fatal);
-	google::SetCommandLineOption("minloglevel", severity_fatal);
-#else
-	(void) argv0;
-#endif
+	ccl::util_logging_init(argv0);
 }
 
 void CCL_start_debug_logging(void)
 {
-#ifdef WITH_CYCLES_LOGGING
-	google::SetCommandLineOption("logtostderr", "1");
-	google::SetCommandLineOption("v", "2");
-	google::SetCommandLineOption("stderrthreshold", "1");
-	google::SetCommandLineOption("minloglevel", "0");
-#endif
+	ccl::util_logging_start();
 }
 
 void CCL_logging_verbosity_set(int verbosity)
 {
-#ifdef WITH_CYCLES_LOGGING
-	char val[10];
-	snprintf(val, sizeof(val), "%d", verbosity);
-
-	google::SetCommandLineOption("v", val);
-#else
-	(void) verbosity;
-#endif
+	ccl::util_logging_verbosity_set(verbosity);
 }
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index a5e4b7bd2ae..7135e938afb 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
  
@@ -20,6 +20,7 @@
 #include "scene.h"
 
 #include "blender_sync.h"
+#include "blender_session.h"
 #include "blender_util.h"
 
 #include "subd_mesh.h"
@@ -27,6 +28,8 @@
 #include "subd_split.h"
 
 #include "util_foreach.h"
+#include "util_logging.h"
+#include "util_math.h"
 
 #include "mikktspace.h"
 
@@ -104,7 +107,9 @@ static void mikk_get_texture_coordinate(const SMikkTSpaceContext *context, float
 		int vert_idx = userdata->mesh.tessfaces[face_num].vertices()[vert_num];
 		float3 orco =
 			get_float3(userdata->mesh.vertices[vert_idx].undeformed_co());
-		map_to_sphere(&uv[0], &uv[1], orco[0], orco[1], orco[2]);
+		float2 tmp = map_to_sphere(make_float3(orco[0], orco[1], orco[2]));
+		uv[0] = tmp.x;
+		uv[1] = tmp.y;
 	}
 }
 
@@ -135,7 +140,7 @@ static void mikk_set_tangent_space(const SMikkTSpaceContext *context, const floa
 	userdata->tangent[face*4 + vert] = make_float4(T[0], T[1], T[2], sign);
 }
 
-static void mikk_compute_tangents(BL::Mesh b_mesh, BL::MeshTextureFaceLayer *b_layer, Mesh *mesh, vector<int>& nverts, bool need_sign, bool active_render)
+static void mikk_compute_tangents(BL::Mesh b_mesh, BL::MeshTextureFaceLayer *b_layer, Mesh *mesh, const vector<int>& nverts, bool need_sign, bool active_render)
 {
 	/* setup userdata */
 	MikkUserData userdata(b_mesh, b_layer, nverts.size());
@@ -237,8 +242,16 @@ static void create_mesh_volume_attribute(BL::Object b_ob, Mesh *mesh, ImageManag
 	bool animated = false;
 
 	volume_data->manager = image_manager;
-	volume_data->slot = image_manager->add_image(Attribute::standard_name(std),
-		b_ob.ptr.data, animated, frame, is_float, is_linear, INTERPOLATION_LINEAR, true);
+	volume_data->slot = image_manager->add_image(
+	        Attribute::standard_name(std),
+	        b_ob.ptr.data,
+	        animated,
+	        frame,
+	        is_float,
+	        is_linear,
+	        INTERPOLATION_LINEAR,
+	        EXTENSION_REPEAT,
+	        true);
 }
 
 static void create_mesh_volume_attributes(Scene *scene, BL::Object b_ob, Mesh *mesh, float frame)
@@ -256,6 +269,167 @@ static void create_mesh_volume_attributes(Scene *scene, BL::Object b_ob, Mesh *m
 		create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_VELOCITY, frame);
 }
 
+/* Create vertex color attributes. */
+static void attr_create_vertex_color(Scene *scene,
+                                     Mesh *mesh,
+                                     BL::Mesh b_mesh,
+                                     const vector<int>& nverts)
+{
+	BL::Mesh::tessface_vertex_colors_iterator l;
+	for(b_mesh.tessface_vertex_colors.begin(l); l != b_mesh.tessface_vertex_colors.end(); ++l) {
+		if(!mesh->need_attribute(scene, ustring(l->name().c_str())))
+			continue;
+
+		Attribute *attr = mesh->attributes.add(
+			ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CORNER_BYTE);
+
+		BL::MeshColorLayer::data_iterator c;
+		uchar4 *cdata = attr->data_uchar4();
+		size_t i = 0;
+
+		for(l->data.begin(c); c != l->data.end(); ++c, ++i) {
+			cdata[0] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color1())));
+			cdata[1] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color2())));
+			cdata[2] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color3())));
+
+			if(nverts[i] == 4) {
+				cdata[3] = cdata[0];
+				cdata[4] = cdata[2];
+				cdata[5] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color4())));
+				cdata += 6;
+			}
+			else
+				cdata += 3;
+		}
+	}
+}
+
+/* Create uv map attributes. */
+static void attr_create_uv_map(Scene *scene,
+                               Mesh *mesh,
+                               BL::Mesh b_mesh,
+                               const vector<int>& nverts)
+{
+	if(b_mesh.tessface_uv_textures.length() != 0) {
+		BL::Mesh::tessface_uv_textures_iterator l;
+
+		for(b_mesh.tessface_uv_textures.begin(l); l != b_mesh.tessface_uv_textures.end(); ++l) {
+			bool active_render = l->active_render();
+			AttributeStandard std = (active_render)? ATTR_STD_UV: ATTR_STD_NONE;
+			ustring name = ustring(l->name().c_str());
+
+			/* UV map */
+			if(mesh->need_attribute(scene, name) || mesh->need_attribute(scene, std)) {
+				Attribute *attr;
+
+				if(active_render)
+					attr = mesh->attributes.add(std, name);
+				else
+					attr = mesh->attributes.add(name, TypeDesc::TypePoint, ATTR_ELEMENT_CORNER);
+
+				BL::MeshTextureFaceLayer::data_iterator t;
+				float3 *fdata = attr->data_float3();
+				size_t i = 0;
+
+				for(l->data.begin(t); t != l->data.end(); ++t, ++i) {
+					fdata[0] = get_float3(t->uv1());
+					fdata[1] = get_float3(t->uv2());
+					fdata[2] = get_float3(t->uv3());
+					fdata += 3;
+
+					if(nverts[i] == 4) {
+						fdata[0] = get_float3(t->uv1());
+						fdata[1] = get_float3(t->uv3());
+						fdata[2] = get_float3(t->uv4());
+						fdata += 3;
+					}
+				}
+			}
+
+			/* UV tangent */
+			std = (active_render)? ATTR_STD_UV_TANGENT: ATTR_STD_NONE;
+			name = ustring((string(l->name().c_str()) + ".tangent").c_str());
+
+			if(mesh->need_attribute(scene, name) || (active_render && mesh->need_attribute(scene, std))) {
+				std = (active_render)? ATTR_STD_UV_TANGENT_SIGN: ATTR_STD_NONE;
+				name = ustring((string(l->name().c_str()) + ".tangent_sign").c_str());
+				bool need_sign = (mesh->need_attribute(scene, name) || mesh->need_attribute(scene, std));
+
+				mikk_compute_tangents(b_mesh, &(*l), mesh, nverts, need_sign, active_render);
+			}
+		}
+	}
+	else if(mesh->need_attribute(scene, ATTR_STD_UV_TANGENT)) {
+		bool need_sign = mesh->need_attribute(scene, ATTR_STD_UV_TANGENT_SIGN);
+		mikk_compute_tangents(b_mesh, NULL, mesh, nverts, need_sign, true);
+	}
+}
+
+/* Create vertex pointiness attributes. */
+static void attr_create_pointiness(Scene *scene,
+                                   Mesh *mesh,
+                                   BL::Mesh b_mesh)
+{
+	if(mesh->need_attribute(scene, ATTR_STD_POINTINESS)) {
+		const int numverts = b_mesh.vertices.length();
+		Attribute *attr = mesh->attributes.add(ATTR_STD_POINTINESS);
+		float *data = attr->data_float();
+		int *counter = new int[numverts];
+		float *raw_data = new float[numverts];
+		float3 *edge_accum = new float3[numverts];
+
+		/* Calculate pointiness using single ring neighborhood. */
+		memset(counter, 0, sizeof(int) * numverts);
+		memset(raw_data, 0, sizeof(float) * numverts);
+		memset(edge_accum, 0, sizeof(float3) * numverts);
+		BL::Mesh::edges_iterator e;
+		int i = 0;
+		for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++i) {
+			int v0 = b_mesh.edges[i].vertices()[0],
+			    v1 = b_mesh.edges[i].vertices()[1];
+			float3 co0 = get_float3(b_mesh.vertices[v0].co()),
+			       co1 = get_float3(b_mesh.vertices[v1].co());
+			float3 edge = normalize(co1 - co0);
+			edge_accum[v0] += edge;
+			edge_accum[v1] += -edge;
+			++counter[v0];
+			++counter[v1];
+		}
+		i = 0;
+		BL::Mesh::vertices_iterator v;
+		for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end(); ++v, ++i) {
+			if(counter[i] > 0) {
+				float3 normal = get_float3(b_mesh.vertices[i].normal());
+				float angle = safe_acosf(dot(normal, edge_accum[i] / counter[i]));
+				raw_data[i] = angle * M_1_PI_F;
+			}
+			else {
+				raw_data[i] = 0.0f;
+			}
+		}
+
+		/* Blur vertices to approximate 2 ring neighborhood. */
+		memset(counter, 0, sizeof(int) * numverts);
+		memcpy(data, raw_data, sizeof(float) * numverts);
+		i = 0;
+		for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++i) {
+			int v0 = b_mesh.edges[i].vertices()[0],
+			    v1 = b_mesh.edges[i].vertices()[1];
+			data[v0] += raw_data[v1];
+			data[v1] += raw_data[v0];
+			++counter[v0];
+			++counter[v1];
+		}
+		for(i = 0; i < numverts; ++i) {
+			data[i] /= counter[i] + 1;
+		}
+
+		delete [] counter;
+		delete [] raw_data;
+		delete [] edge_accum;
+	}
+}
+
 /* Create Mesh */
 
 static void create_mesh(Scene *scene, Mesh *mesh, BL::Mesh b_mesh, const vector<uint>& used_shaders)
@@ -303,6 +477,9 @@ static void create_mesh(Scene *scene, Mesh *mesh, BL::Mesh b_mesh, const vector<
 			generated[i++] = get_float3(v->undeformed_co())*size - loc;
 	}
 
+	/* Create needed vertex attributes. */
+	attr_create_pointiness(scene, mesh, b_mesh);
+
 	/* create faces */
 	vector<int> nverts(numfaces);
 	int fi = 0, ti = 0;
@@ -312,7 +489,7 @@ static void create_mesh(Scene *scene, Mesh *mesh, BL::Mesh b_mesh, const vector<
 		int n = (vi[3] == 0)? 3: 4;
 		int mi = clamp(f->material_index(), 0, used_shaders.size()-1);
 		int shader = used_shaders[mi];
-		bool smooth = f->use_smooth();
+		bool smooth = f->use_smooth() || use_loop_normals;
 
 		/* split vertices if normal is different
 		 *
@@ -354,92 +531,11 @@ static void create_mesh(Scene *scene, Mesh *mesh, BL::Mesh b_mesh, const vector<
 		nverts[fi] = n;
 	}
 
-	/* create vertex color attributes */
-	{
-		BL::Mesh::tessface_vertex_colors_iterator l;
-
-		for(b_mesh.tessface_vertex_colors.begin(l); l != b_mesh.tessface_vertex_colors.end(); ++l) {
-			if(!mesh->need_attribute(scene, ustring(l->name().c_str())))
-				continue;
-
-			Attribute *attr = mesh->attributes.add(
-				ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CORNER_BYTE);
-
-			BL::MeshColorLayer::data_iterator c;
-			uchar4 *cdata = attr->data_uchar4();
-			size_t i = 0;
-
-			for(l->data.begin(c); c != l->data.end(); ++c, ++i) {
-				cdata[0] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color1())));
-				cdata[1] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color2())));
-				cdata[2] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color3())));
-
-				if(nverts[i] == 4) {
-					cdata[3] = cdata[0];
-					cdata[4] = cdata[2];
-					cdata[5] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color4())));
-					cdata += 6;
-				}
-				else
-					cdata += 3;
-			}
-		}
-	}
-
-	/* create uv map attributes */
-	if (b_mesh.tessface_uv_textures.length() != 0) {
-		BL::Mesh::tessface_uv_textures_iterator l;
-
-		for(b_mesh.tessface_uv_textures.begin(l); l != b_mesh.tessface_uv_textures.end(); ++l) {
-			bool active_render = l->active_render();
-			AttributeStandard std = (active_render)? ATTR_STD_UV: ATTR_STD_NONE;
-			ustring name = ustring(l->name().c_str());
-
-			/* UV map */
-			if(mesh->need_attribute(scene, name) || mesh->need_attribute(scene, std)) {
-				Attribute *attr;
-
-				if(active_render)
-					attr = mesh->attributes.add(std, name);
-				else
-					attr = mesh->attributes.add(name, TypeDesc::TypePoint, ATTR_ELEMENT_CORNER);
-
-				BL::MeshTextureFaceLayer::data_iterator t;
-				float3 *fdata = attr->data_float3();
-				size_t i = 0;
-
-				for(l->data.begin(t); t != l->data.end(); ++t, ++i) {
-					fdata[0] =  get_float3(t->uv1());
-					fdata[1] =  get_float3(t->uv2());
-					fdata[2] =  get_float3(t->uv3());
-					fdata += 3;
-
-					if(nverts[i] == 4) {
-						fdata[0] =  get_float3(t->uv1());
-						fdata[1] =  get_float3(t->uv3());
-						fdata[2] =  get_float3(t->uv4());
-						fdata += 3;
-					}
-				}
-			}
-
-			/* UV tangent */
-			std = (active_render)? ATTR_STD_UV_TANGENT: ATTR_STD_NONE;
-			name = ustring((string(l->name().c_str()) + ".tangent").c_str());
-
-			if(mesh->need_attribute(scene, name) || (active_render && mesh->need_attribute(scene, std))) {
-				std = (active_render)? ATTR_STD_UV_TANGENT_SIGN: ATTR_STD_NONE;
-				name = ustring((string(l->name().c_str()) + ".tangent_sign").c_str());
-				bool need_sign = (mesh->need_attribute(scene, name) || mesh->need_attribute(scene, std));
-
-				mikk_compute_tangents(b_mesh, &(*l), mesh, nverts, need_sign, active_render);
-			}
-		}
-	}
-	else if(mesh->need_attribute(scene, ATTR_STD_UV_TANGENT)) {
-		bool need_sign = mesh->need_attribute(scene, ATTR_STD_UV_TANGENT_SIGN);
-		mikk_compute_tangents(b_mesh, NULL, mesh, nverts, need_sign, true);
-	}
+	/* Create all needed attributes.
+	 * The attr_create_*() functions check for themselves whether the attribute is needed.
+	 */
+	attr_create_vertex_color(scene, mesh, b_mesh, nverts);
+	attr_create_uv_map(scene, mesh, b_mesh, nverts);
 
 	/* for volume objects, create a matrix to transform from object space to
 	 * mesh texture space. this does not work with deformations but that can
@@ -501,6 +597,14 @@ static void create_subd_mesh(Scene *scene, Mesh *mesh, BL::Mesh b_mesh, PointerR
 
 Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tris)
 {
+	/* When viewport display is not needed during render we can force some
+	 * caches to be released from blender side in order to reduce peak memory
+	 * footprint during synchronization process.
+	 */
+	const bool is_interface_locked = b_engine.render() &&
+	                                 b_engine.render().use_lock_interface();
+	const bool can_free_caches = BlenderSession::headless || is_interface_locked;
+
 	/* test if we can instance or if the object is modified */
 	BL::ID b_ob_data = b_ob.data();
 	BL::ID key = (BKE_object_is_modified(b_ob))? b_ob: b_ob_data;
@@ -525,7 +629,13 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri
 	}
 	
 	/* test if we need to sync */
-	bool use_mesh_geometry = render_layer.use_surfaces || render_layer.use_hair;
+	int requested_geometry_flags = Mesh::GEOMETRY_NONE;
+	if(render_layer.use_surfaces) {
+		requested_geometry_flags |= Mesh::GEOMETRY_TRIANGLES;
+	}
+	if(render_layer.use_hair) {
+		requested_geometry_flags |= Mesh::GEOMETRY_CURVES;
+	}
 	Mesh *mesh;
 
 	if(!mesh_map.sync(&mesh, key)) {
@@ -534,7 +644,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri
 		/* test if shaders changed, these can be object level so mesh
 		 * does not get tagged for recalc */
 		else if(mesh->used_shaders != used_shaders);
-		else if(use_mesh_geometry != mesh->geometry_synced);
+		else if(requested_geometry_flags != mesh->geometry_flags);
 		else {
 			/* even if not tagged for recalc, we may need to sync anyway
 			 * because the shader needs different mesh attributes */
@@ -568,7 +678,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri
 	mesh->used_shaders = used_shaders;
 	mesh->name = ustring(b_ob_data.name().c_str());
 
-	if(use_mesh_geometry) {
+	if(requested_geometry_flags != Mesh::GEOMETRY_NONE) {
 		/* mesh objects does have special handle in the dependency graph,
 		 * they're ensured to have properly updated.
 		 *
@@ -594,11 +704,15 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri
 			if(render_layer.use_hair)
 				sync_curves(mesh, b_mesh, b_ob, false);
 
+			if(can_free_caches) {
+				b_ob.cache_release();
+			}
+
 			/* free derived mesh */
 			b_data.meshes.remove(b_mesh);
 		}
-		mesh->geometry_synced = true;
 	}
+	mesh->geometry_flags = requested_geometry_flags;
 
 	/* displacement method */
 	if(cmesh.data) {
@@ -728,6 +842,7 @@ void BlenderSync::sync_mesh_motion(BL::Object b_ob, Object *object, float motion
 		return;
 	}
 
+	/* TODO(sergey): Perform preliminary check for number of vertices. */
 	if(numverts) {
 		/* find attributes */
 		Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
@@ -759,13 +874,17 @@ void BlenderSync::sync_mesh_motion(BL::Object b_ob, Object *object, float motion
 
 		/* in case of new attribute, we verify if there really was any motion */
 		if(new_attribute) {
-			if(i != numverts || memcmp(mP, &mesh->verts[0], sizeof(float3)*numverts) == 0) {
+			if(b_mesh.vertices.length() != numverts ||
+			   memcmp(mP, &mesh->verts[0], sizeof(float3)*numverts) == 0)
+			{
 				/* no motion, remove attributes again */
+				VLOG(1) << "No actual deformation motion for object " << b_ob.name();
 				mesh->attributes.remove(ATTR_STD_MOTION_VERTEX_POSITION);
 				if(attr_mN)
 					mesh->attributes.remove(ATTR_STD_MOTION_VERTEX_NORMAL);
 			}
 			else if(time_index > 0) {
+				VLOG(1) << "Filling deformation motion for object " << b_ob.name();
 				/* motion, fill up previous steps that we might have skipped because
 				 * they had no motion, but we need them anyway now */
 				float3 *P = &mesh->verts[0];
diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp
index 1e07c5f9c96..432c4aaa078 100644
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "camera.h"
@@ -30,6 +30,7 @@
 
 #include "util_foreach.h"
 #include "util_hash.h"
+#include "util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -89,14 +90,17 @@ static uint object_ray_visibility(BL::Object b_ob)
 
 /* Light */
 
-void BlenderSync::sync_light(BL::Object b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::Object b_ob, Transform& tfm)
+void BlenderSync::sync_light(BL::Object b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::Object b_ob, Transform& tfm, bool *use_portal)
 {
 	/* test if we need to sync */
 	Light *light;
 	ObjectKey key(b_parent, persistent_id, b_ob);
 
-	if(!light_map.sync(&light, b_ob, b_parent, key))
+	if(!light_map.sync(&light, b_ob, b_parent, key)) {
+		if(light->is_portal)
+			*use_portal = true;
 		return;
+	}
 	
 	BL::Lamp b_lamp(b_ob.data());
 
@@ -168,6 +172,16 @@ void BlenderSync::sync_light(BL::Object b_parent, int persistent_id[OBJECT_PERSI
 	else
 		light->samples = samples;
 
+	light->max_bounces = get_int(clamp, "max_bounces");
+
+	if(light->type == LIGHT_AREA)
+		light->is_portal = get_boolean(clamp, "is_portal");
+	else
+		light->is_portal = false;
+
+	if(light->is_portal)
+		*use_portal = true;
+
 	/* visibility */
 	uint visibility = object_ray_visibility(b_ob);
 	light->use_diffuse = (visibility & PATH_RAY_DIFFUSE) != 0;
@@ -179,7 +193,7 @@ void BlenderSync::sync_light(BL::Object b_parent, int persistent_id[OBJECT_PERSI
 	light->tag_update(scene);
 }
 
-void BlenderSync::sync_background_light()
+void BlenderSync::sync_background_light(bool use_portal)
 {
 	BL::World b_world = b_scene.world();
 
@@ -188,19 +202,21 @@ void BlenderSync::sync_background_light()
 		PointerRNA cworld = RNA_pointer_get(&b_world.ptr, "cycles");
 		bool sample_as_light = get_boolean(cworld, "sample_as_light");
 
-		if(sample_as_light) {
+		if(sample_as_light || use_portal) {
 			/* test if we need to sync */
 			Light *light;
 			ObjectKey key(b_world, 0, b_world);
 
 			if(light_map.sync(&light, b_world, b_world, key) ||
-			   world_recalc ||
-			   b_world.ptr.data != world_map)
+				world_recalc ||
+				b_world.ptr.data != world_map)
 			{
 				light->type = LIGHT_BACKGROUND;
 				light->map_resolution  = get_int(cworld, "sample_map_resolution");
 				light->shader = scene->default_background;
-				
+				light->use_mis = sample_as_light;
+				light->max_bounces = get_int(cworld, "max_bounces");
+
 				int samples = get_int(cworld, "samples");
 				if(get_boolean(cscene, "use_square_samples"))
 					light->samples = samples * samples;
@@ -219,8 +235,56 @@ void BlenderSync::sync_background_light()
 
 /* Object */
 
-Object *BlenderSync::sync_object(BL::Object b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::DupliObject b_dupli_ob,
-                                 Transform& tfm, uint layer_flag, float motion_time, bool hide_tris)
+/* TODO(sergey): Not really optimal, consider approaches based on k-DOP in order
+ * to reduce number of objects which are wrongly considered visible.
+ */
+static bool object_boundbox_clip(Scene *scene,
+                                 BL::Object b_ob,
+                                 Transform& tfm,
+                                 float margin)
+{
+	Camera *cam = scene->camera;
+	Transform& worldtondc = cam->worldtondc;
+	BL::Array<float, 24> boundbox = b_ob.bound_box();
+	float3 bb_min = make_float3(FLT_MAX, FLT_MAX, FLT_MAX),
+	       bb_max = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
+	bool all_behind = true;
+	for(int i = 0; i < 8; ++i) {
+		float3 p = make_float3(boundbox[3 * i + 0],
+		                       boundbox[3 * i + 1],
+		                       boundbox[3 * i + 2]);
+		p = transform_point(&tfm, p);
+		p = transform_point(&worldtondc, p);
+		if(p.z >= -margin) {
+			all_behind = false;
+		}
+		p /= p.z;
+		bb_min = min(bb_min, p);
+		bb_max = max(bb_max, p);
+	}
+	if(!all_behind) {
+		if(bb_min.x >= 1.0f + margin ||
+		   bb_min.y >= 1.0f + margin ||
+		   bb_max.x <= -margin ||
+		   bb_max.y <= -margin)
+		{
+			return true;
+		}
+		return false;
+	}
+	return true;
+}
+
+Object *BlenderSync::sync_object(BL::Object b_parent,
+                                 int persistent_id[OBJECT_PERSISTENT_ID_SIZE],
+                                 BL::DupliObject b_dupli_ob,
+                                 Transform& tfm,
+                                 uint layer_flag,
+                                 float motion_time,
+                                 bool hide_tris,
+                                 bool use_camera_cull,
+                                 float camera_cull_margin,
+                                 bool *use_portal)
 {
 	BL::Object b_ob = (b_dupli_ob ? b_dupli_ob.object() : b_parent);
 	bool motion = motion_time != 0.0f;
@@ -229,7 +293,7 @@ Object *BlenderSync::sync_object(BL::Object b_parent, int persistent_id[OBJECT_P
 	if(object_is_light(b_ob)) {
 		/* don't use lamps for excluded layers used as mask layer */
 		if(!motion && !((layer_flag & render_layer.holdout_layer) && (layer_flag & render_layer.exclude_layer)))
-			sync_light(b_parent, persistent_id, b_ob, tfm);
+			sync_light(b_parent, persistent_id, b_ob, tfm, use_portal);
 
 		return NULL;
 	}
@@ -238,6 +302,11 @@ Object *BlenderSync::sync_object(BL::Object b_parent, int persistent_id[OBJECT_P
 	if(!object_is_mesh(b_ob))
 		return NULL;
 
+	/* Perform camera space culling. */
+	if(use_camera_cull && object_boundbox_clip(scene, b_ob, tfm, camera_cull_margin)) {
+		return NULL;
+	}
+
 	/* key to lookup object */
 	ObjectKey key(b_parent, persistent_id, b_ob);
 	Object *object;
@@ -246,9 +315,12 @@ Object *BlenderSync::sync_object(BL::Object b_parent, int persistent_id[OBJECT_P
 	if(motion) {
 		object = object_map.find(key);
 
-		if(object && (scene->need_motion() == Scene::MOTION_PASS || object_use_motion(b_ob))) {
+		if(object && (scene->need_motion() == Scene::MOTION_PASS ||
+		              object_use_motion(b_parent, b_ob)))
+		{
 			/* object transformation */
 			if(tfm != object->tfm) {
+				VLOG(1) << "Object " << b_ob.name() << " motion detected.";
 				if(motion_time == -1.0f) {
 					object->motion.pre = tfm;
 					object->use_motion = true;
@@ -326,8 +398,8 @@ Object *BlenderSync::sync_object(BL::Object b_parent, int persistent_id[OBJECT_P
 
 			mesh->use_motion_blur = false;
 
-			if(object_use_motion(b_ob)) {
-				if(object_use_deform_motion(b_ob)) {
+			if(object_use_motion(b_parent, b_ob)) {
+				if(object_use_deform_motion(b_parent, b_ob)) {
 					mesh->motion_steps = object_motion_steps(b_ob);
 					mesh->use_motion_blur = true;
 				}
@@ -352,7 +424,7 @@ Object *BlenderSync::sync_object(BL::Object b_parent, int persistent_id[OBJECT_P
 			object->random_id ^= hash_int(hash_string(b_parent.name().c_str()));
 
 		/* dupli texture coordinates */
-		if (b_dupli_ob) {
+		if(b_dupli_ob) {
 			object->dupli_generated = 0.5f*get_float3(b_dupli_ob.orco()) - make_float3(0.5f, 0.5f, 0.5f);
 			object->dupli_uv = get_float2(b_dupli_ob.uv());
 		}
@@ -410,9 +482,17 @@ static bool object_render_hide(BL::Object b_ob, bool top_level, bool parent_hide
 
 	/* hide original object for duplis */
 	BL::Object parent = b_ob.parent();
-	if(parent && object_render_hide_original(b_ob.type(), parent.dupli_type()))
-		if(parent_hide)
-			hide_as_dupli_child_original = true;
+	while(parent) {
+		if(object_render_hide_original(b_ob.type(),
+		                               parent.dupli_type()))
+		{
+			if(parent_hide) {
+				hide_as_dupli_child_original = true;
+				break;
+			}
+		}
+		parent = parent.parent();
+	}
 	
 	hide_triangles = hide_emitter;
 
@@ -454,16 +534,29 @@ void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, float motion_time)
 		mesh_motion_synced.clear();
 	}
 
+	bool allow_camera_cull = false;
+	float camera_cull_margin = 0.0f;
+	if(b_scene.render().use_simplify()) {
+		PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+		allow_camera_cull = scene->camera->type != CAMERA_PANORAMA &&
+		                    !b_scene.render().use_multiview() &&
+		                    get_boolean(cscene, "use_camera_cull");
+		if(allow_camera_cull) {
+			camera_cull_margin = get_float(cscene, "camera_cull_margin");
+		}
+	}
+
 	/* object loop */
 	BL::Scene::object_bases_iterator b_base;
 	BL::Scene b_sce = b_scene;
 	/* modifier result type (not exposed as enum in C++ API)
-     * 1 : DAG_EVAL_PREVIEW
-     * 2 : DAG_EVAL_RENDER
-     */
-    int dupli_settings = preview ? 1 : 2;
+	 * 1 : DAG_EVAL_PREVIEW
+	 * 2 : DAG_EVAL_RENDER
+	 */
+	int dupli_settings = preview ? 1 : 2;
 
 	bool cancel = false;
+	bool use_portal = false;
 
 	for(; b_sce && !cancel; b_sce = b_sce.background_set()) {
 		for(b_sce.object_bases.begin(b_base); b_base != b_sce.object_bases.end() && !cancel; ++b_base) {
@@ -475,6 +568,12 @@ void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, float motion_time)
 			if(!hide) {
 				progress.set_sync_status("Synchronizing object", b_ob.name());
 
+				PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles");
+				bool use_camera_cull = allow_camera_cull && get_boolean(cobject, "use_camera_cull");
+				if(use_camera_cull) {
+					/* Need to have proper projection matrix. */
+					scene->camera->update();
+				}
 				if(b_ob.is_duplicator() && !object_render_hide_duplis(b_ob)) {
 					/* dupli objects */
 					b_ob.dupli_list_create(b_scene, dupli_settings);
@@ -494,7 +593,16 @@ void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, float motion_time)
 							BL::Array<int, OBJECT_PERSISTENT_ID_SIZE> persistent_id = b_dup->persistent_id();
 
 							/* sync object and mesh or light data */
-							Object *object = sync_object(b_ob, persistent_id.data, *b_dup, tfm, ob_layer, motion_time, hide_tris);
+							Object *object = sync_object(b_ob,
+							                             persistent_id.data,
+							                             *b_dup,
+							                             tfm,
+							                             ob_layer,
+							                             motion_time,
+							                             hide_tris,
+							                             use_camera_cull,
+							                             camera_cull_margin,
+							                             &use_portal);
 
 							/* sync possible particle data, note particle_id
 							 * starts counting at 1, first is dummy particle */
@@ -514,7 +622,16 @@ void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, float motion_time)
 				if(!object_render_hide(b_ob, true, true, hide_tris)) {
 					/* object itself */
 					Transform tfm = get_transform(b_ob.matrix_world());
-					sync_object(b_ob, NULL, PointerRNA_NULL, tfm, ob_layer, motion_time, hide_tris);
+					sync_object(b_ob,
+					            NULL,
+					            PointerRNA_NULL,
+					            tfm,
+					            ob_layer,
+					            motion_time,
+					            hide_tris,
+					            use_camera_cull,
+					            camera_cull_margin,
+					            &use_portal);
 				}
 			}
 
@@ -525,7 +642,7 @@ void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, float motion_time)
 	progress.set_sync_status("");
 
 	if(!cancel && !motion) {
-		sync_background_light();
+		sync_background_light(use_portal);
 
 		/* handle removed data and modified pointers */
 		if(light_map.post_sync())
@@ -542,7 +659,11 @@ void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, float motion_time)
 		mesh_motion_synced.clear();
 }
 
-void BlenderSync::sync_motion(BL::SpaceView3D b_v3d, BL::Object b_override, void **python_thread_state)
+void BlenderSync::sync_motion(BL::RenderSettings b_render,
+                              BL::SpaceView3D b_v3d,
+                              BL::Object b_override,
+                              int width, int height,
+                              void **python_thread_state)
 {
 	if(scene->need_motion() == Scene::MOTION_NONE)
 		return;
@@ -562,6 +683,9 @@ void BlenderSync::sync_motion(BL::SpaceView3D b_v3d, BL::Object b_override, void
 
 	/* note iteration over motion_times set happens in sorted order */
 	foreach(float relative_time, motion_times) {
+		VLOG(1) << "Synchronizing motion for the relative time "
+		        << relative_time << ".";
+
 		/* fixed shutter time to get previous and next frame for motion pass */
 		float shuttertime;
 
@@ -581,8 +705,12 @@ void BlenderSync::sync_motion(BL::SpaceView3D b_v3d, BL::Object b_override, void
 		python_thread_state_save(python_thread_state);
 
 		/* sync camera, only supports two times at the moment */
-		if(relative_time == -1.0f || relative_time == 1.0f)
-			sync_camera_motion(b_cam, relative_time);
+		if(relative_time == -1.0f || relative_time == 1.0f) {
+			sync_camera_motion(b_render,
+			                   b_cam,
+			                   width, height,
+			                   relative_time);
+		}
 
 		/* sync object */
 		sync_objects(b_v3d, relative_time);
diff --git a/intern/cycles/blender/blender_particles.cpp b/intern/cycles/blender/blender_particles.cpp
index 5b2782ec2ac..6d799e6e10e 100644
--- a/intern/cycles/blender/blender_particles.cpp
+++ b/intern/cycles/blender/blender_particles.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "mesh.h"
@@ -76,7 +76,7 @@ bool BlenderSync::sync_dupli_particle(BL::Object b_ob, BL::DupliObject b_dup, Ob
 
 	psys->particles.push_back(pa);
 
-	if (object->particle_index != psys->particles.size() - 1)
+	if(object->particle_index != psys->particles.size() - 1)
 		scene->object_manager->tag_update(scene);
 	object->particle_system = psys;
 	object->particle_index = psys->particles.size() - 1;
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index b756d6acdb2..6581d0b997e 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include <Python.h>
@@ -22,9 +22,11 @@
 #include "blender_session.h"
 
 #include "util_foreach.h"
+#include "util_logging.h"
 #include "util_md5.h"
 #include "util_opengl.h"
 #include "util_path.h"
+#include "util_types.h"
 
 #ifdef WITH_OSL
 #include "osl.h"
@@ -53,25 +55,53 @@ void python_thread_state_restore(void **python_thread_state)
 	*python_thread_state = NULL;
 }
 
-static PyObject *init_func(PyObject *self, PyObject *args)
+static const char *PyC_UnicodeAsByte(PyObject *py_str, PyObject **coerce)
 {
-	const char *path, *user_path;
+#ifdef WIN32
+	/* bug [#31856] oddly enough, Python3.2 --> 3.3 on Windows will throw an
+	 * exception here; this needs to be fixed in Python:
+	 * see: bugs.python.org/issue15859 */
+	if(!PyUnicode_Check(py_str)) {
+		PyErr_BadArgument();
+		return "";
+	}
+#endif
+	if((*coerce = PyUnicode_EncodeFSDefault(py_str))) {
+		return PyBytes_AS_STRING(*coerce);
+	}
+	return "";
+}
 
-	if(!PyArg_ParseTuple(args, "ss", &path, &user_path))
+static PyObject *init_func(PyObject * /*self*/, PyObject *args)
+{
+	PyObject *path, *user_path;
+	int headless;
+
+	if(!PyArg_ParseTuple(args, "OOi", &path, &user_path, &headless)) {
 		return NULL;
-	
-	path_init(path, user_path);
+	}
+
+	PyObject *path_coerce = NULL, *user_path_coerce = NULL;
+	path_init(PyC_UnicodeAsByte(path, &path_coerce),
+	          PyC_UnicodeAsByte(user_path, &user_path_coerce));
+	Py_XDECREF(path_coerce);
+	Py_XDECREF(user_path_coerce);
+
+	BlenderSession::headless = headless;
 
 	Py_RETURN_NONE;
 }
 
-static PyObject *create_func(PyObject *self, PyObject *args)
+static PyObject *create_func(PyObject * /*self*/, PyObject *args)
 {
 	PyObject *pyengine, *pyuserpref, *pydata, *pyscene, *pyregion, *pyv3d, *pyrv3d;
 	int preview_osl;
 
-	if(!PyArg_ParseTuple(args, "OOOOOOOi", &pyengine, &pyuserpref, &pydata, &pyscene, &pyregion, &pyv3d, &pyrv3d, &preview_osl))
+	if(!PyArg_ParseTuple(args, "OOOOOOOi", &pyengine, &pyuserpref, &pydata, &pyscene,
+	                     &pyregion, &pyv3d, &pyrv3d, &preview_osl))
+	{
 		return NULL;
+	}
 
 	/* RNA */
 	PointerRNA engineptr;
@@ -83,7 +113,7 @@ static PyObject *create_func(PyObject *self, PyObject *args)
 	BL::UserPreferences userpref(userprefptr);
 
 	PointerRNA dataptr;
-	RNA_id_pointer_create((ID*)PyLong_AsVoidPtr(pydata), &dataptr);
+	RNA_main_pointer_create((Main*)PyLong_AsVoidPtr(pydata), &dataptr);
 	BL::BlendData data(dataptr);
 
 	PointerRNA sceneptr;
@@ -91,15 +121,15 @@ static PyObject *create_func(PyObject *self, PyObject *args)
 	BL::Scene scene(sceneptr);
 
 	PointerRNA regionptr;
-	RNA_id_pointer_create((ID*)pylong_as_voidptr_typesafe(pyregion), &regionptr);
+	RNA_pointer_create(NULL, &RNA_Region, pylong_as_voidptr_typesafe(pyregion), &regionptr);
 	BL::Region region(regionptr);
 
 	PointerRNA v3dptr;
-	RNA_id_pointer_create((ID*)pylong_as_voidptr_typesafe(pyv3d), &v3dptr);
+	RNA_pointer_create(NULL, &RNA_SpaceView3D, pylong_as_voidptr_typesafe(pyv3d), &v3dptr);
 	BL::SpaceView3D v3d(v3dptr);
 
 	PointerRNA rv3dptr;
-	RNA_id_pointer_create((ID*)pylong_as_voidptr_typesafe(pyrv3d), &rv3dptr);
+	RNA_pointer_create(NULL, &RNA_RegionView3D, pylong_as_voidptr_typesafe(pyrv3d), &rv3dptr);
 	BL::RegionView3D rv3d(rv3dptr);
 
 	/* create session */
@@ -134,14 +164,14 @@ static PyObject *create_func(PyObject *self, PyObject *args)
 	return PyLong_FromVoidPtr(session);
 }
 
-static PyObject *free_func(PyObject *self, PyObject *value)
+static PyObject *free_func(PyObject * /*self*/, PyObject *value)
 {
 	delete (BlenderSession*)PyLong_AsVoidPtr(value);
 
 	Py_RETURN_NONE;
 }
 
-static PyObject *render_func(PyObject *self, PyObject *value)
+static PyObject *render_func(PyObject * /*self*/, PyObject *value)
 {
 	BlenderSession *session = (BlenderSession*)PyLong_AsVoidPtr(value);
 
@@ -155,14 +185,14 @@ static PyObject *render_func(PyObject *self, PyObject *value)
 }
 
 /* pixel_array and result passed as pointers */
-static PyObject *bake_func(PyObject *self, PyObject *args)
+static PyObject *bake_func(PyObject * /*self*/, PyObject *args)
 {
 	PyObject *pysession, *pyobject;
 	PyObject *pypixel_array, *pyresult;
 	const char *pass_type;
-	int num_pixels, depth;
+	int num_pixels, depth, object_id;
 
-	if(!PyArg_ParseTuple(args, "OOsOiiO", &pysession, &pyobject, &pass_type, &pypixel_array,  &num_pixels, &depth, &pyresult))
+	if(!PyArg_ParseTuple(args, "OOsiOiiO", &pysession, &pyobject, &pass_type, &object_id, &pypixel_array, &num_pixels, &depth, &pyresult))
 		return NULL;
 
 	BlenderSession *session = (BlenderSession*)PyLong_AsVoidPtr(pysession);
@@ -174,19 +204,19 @@ static PyObject *bake_func(PyObject *self, PyObject *args)
 	void *b_result = PyLong_AsVoidPtr(pyresult);
 
 	PointerRNA bakepixelptr;
-	RNA_id_pointer_create((ID*)PyLong_AsVoidPtr(pypixel_array), &bakepixelptr);
+	RNA_pointer_create(NULL, &RNA_BakePixel, PyLong_AsVoidPtr(pypixel_array), &bakepixelptr);
 	BL::BakePixel b_bake_pixel(bakepixelptr);
 
 	python_thread_state_save(&session->python_thread_state);
 
-	session->bake(b_object, pass_type, b_bake_pixel, (size_t)num_pixels, depth, (float *)b_result);
+	session->bake(b_object, pass_type, object_id, b_bake_pixel, (size_t)num_pixels, depth, (float *)b_result);
 
 	python_thread_state_restore(&session->python_thread_state);
 
 	Py_RETURN_NONE;
 }
 
-static PyObject *draw_func(PyObject *self, PyObject *args)
+static PyObject *draw_func(PyObject * /*self*/, PyObject *args)
 {
 	PyObject *pysession, *pyv3d, *pyrv3d;
 
@@ -206,7 +236,7 @@ static PyObject *draw_func(PyObject *self, PyObject *args)
 	Py_RETURN_NONE;
 }
 
-static PyObject *reset_func(PyObject *self, PyObject *args)
+static PyObject *reset_func(PyObject * /*self*/, PyObject *args)
 {
 	PyObject *pysession, *pydata, *pyscene;
 
@@ -216,7 +246,7 @@ static PyObject *reset_func(PyObject *self, PyObject *args)
 	BlenderSession *session = (BlenderSession*)PyLong_AsVoidPtr(pysession);
 
 	PointerRNA dataptr;
-	RNA_id_pointer_create((ID*)PyLong_AsVoidPtr(pydata), &dataptr);
+	RNA_main_pointer_create((Main*)PyLong_AsVoidPtr(pydata), &dataptr);
 	BL::BlendData b_data(dataptr);
 
 	PointerRNA sceneptr;
@@ -232,7 +262,7 @@ static PyObject *reset_func(PyObject *self, PyObject *args)
 	Py_RETURN_NONE;
 }
 
-static PyObject *sync_func(PyObject *self, PyObject *value)
+static PyObject *sync_func(PyObject * /*self*/, PyObject *value)
 {
 	BlenderSession *session = (BlenderSession*)PyLong_AsVoidPtr(value);
 
@@ -245,7 +275,7 @@ static PyObject *sync_func(PyObject *self, PyObject *value)
 	Py_RETURN_NONE;
 }
 
-static PyObject *available_devices_func(PyObject *self, PyObject *args)
+static PyObject *available_devices_func(PyObject * /*self*/, PyObject * /*args*/)
 {
 	vector<DeviceInfo>& devices = Device::available_devices();
 	PyObject *ret = PyTuple_New(devices.size());
@@ -260,7 +290,7 @@ static PyObject *available_devices_func(PyObject *self, PyObject *args)
 
 #ifdef WITH_OSL
 
-static PyObject *osl_update_node_func(PyObject *self, PyObject *args)
+static PyObject *osl_update_node_func(PyObject * /*self*/, PyObject *args)
 {
 	PyObject *pynodegroup, *pynode;
 	const char *filepath = NULL;
@@ -362,14 +392,8 @@ static PyObject *osl_update_node_func(PyObject *self, PyObject *args)
 
 		/* find socket socket */
 		BL::NodeSocket b_sock(PointerRNA_NULL);
-		if (param->isoutput) {
-#if OSL_LIBRARY_VERSION_CODE < 10500
-			b_sock = b_node.outputs[param->name];
-#else
+		if(param->isoutput) {
 			b_sock = b_node.outputs[param->name.string()];
-#endif
-
-			
 			/* remove if type no longer matches */
 			if(b_sock && b_sock.bl_idname() != socket_type) {
 				b_node.outputs.remove(b_sock);
@@ -377,12 +401,7 @@ static PyObject *osl_update_node_func(PyObject *self, PyObject *args)
 			}
 		}
 		else {
-#if OSL_LIBRARY_VERSION_CODE < 10500
-			b_sock = b_node.inputs[param->name];
-#else
 			b_sock = b_node.inputs[param->name.string()];
-#endif
-			
 			/* remove if type no longer matches */
 			if(b_sock && b_sock.bl_idname() != socket_type) {
 				b_node.inputs.remove(b_sock);
@@ -427,7 +446,7 @@ static PyObject *osl_update_node_func(PyObject *self, PyObject *args)
 
 		removed = false;
 
-		for (b_node.inputs.begin(b_input); b_input != b_node.inputs.end(); ++b_input) {
+		for(b_node.inputs.begin(b_input); b_input != b_node.inputs.end(); ++b_input) {
 			if(used_sockets.find(b_input->ptr.data) == used_sockets.end()) {
 				b_node.inputs.remove(*b_input);
 				removed = true;
@@ -435,7 +454,7 @@ static PyObject *osl_update_node_func(PyObject *self, PyObject *args)
 			}
 		}
 
-		for (b_node.outputs.begin(b_output); b_output != b_node.outputs.end(); ++b_output) {
+		for(b_node.outputs.begin(b_output); b_output != b_node.outputs.end(); ++b_output) {
 			if(used_sockets.find(b_output->ptr.data) == used_sockets.end()) {
 				b_node.outputs.remove(*b_output);
 				removed = true;
@@ -447,7 +466,7 @@ static PyObject *osl_update_node_func(PyObject *self, PyObject *args)
 	Py_RETURN_TRUE;
 }
 
-static PyObject *osl_compile_func(PyObject *self, PyObject *args)
+static PyObject *osl_compile_func(PyObject * /*self*/, PyObject *args)
 {
 	const char *inputfile = NULL, *outputfile = NULL;
 
@@ -462,6 +481,25 @@ static PyObject *osl_compile_func(PyObject *self, PyObject *args)
 }
 #endif
 
+static PyObject *system_info_func(PyObject * /*self*/, PyObject * /*value*/)
+{
+	string system_info = Device::device_capabilities();
+	return PyUnicode_FromString(system_info.c_str());
+}
+
+#ifdef WITH_OPENCL
+static PyObject *opencl_disable_func(PyObject * /*self*/, PyObject * /*value*/)
+{
+	VLOG(2) << "Disabling OpenCL platform.";
+#ifdef WIN32
+	putenv("CYCLES_OPENCL_TEST=NONE");
+#else
+	setenv("CYCLES_OPENCL_TEST", "NONE", 1);
+#endif
+	Py_RETURN_NONE;
+}
+#endif
+
 static PyMethodDef methods[] = {
 	{"init", init_func, METH_VARARGS, ""},
 	{"create", create_func, METH_VARARGS, ""},
@@ -476,6 +514,10 @@ static PyMethodDef methods[] = {
 	{"osl_compile", osl_compile_func, METH_VARARGS, ""},
 #endif
 	{"available_devices", available_devices_func, METH_NOARGS, ""},
+	{"system_info", system_info_func, METH_NOARGS, ""},
+#ifdef WITH_OPENCL
+	{"opencl_disable", opencl_disable_func, METH_NOARGS, ""},
+#endif
 	{NULL, NULL, 0, NULL},
 };
 
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 4ff3d89f9f1..04d05ee7b3c 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include <stdlib.h>
@@ -32,6 +32,7 @@
 #include "util_color.h"
 #include "util_foreach.h"
 #include "util_function.h"
+#include "util_logging.h"
 #include "util_progress.h"
 #include "util_time.h"
 
@@ -41,6 +42,8 @@
 
 CCL_NAMESPACE_BEGIN
 
+bool BlenderSession::headless = false;
+
 BlenderSession::BlenderSession(BL::RenderEngine b_engine_, BL::UserPreferences b_userpref_,
 	BL::BlendData b_data_, BL::Scene b_scene_)
 : b_engine(b_engine_), b_userpref(b_userpref_), b_data(b_data_), b_render(b_engine_.render()), b_scene(b_scene_),
@@ -86,12 +89,14 @@ void BlenderSession::create()
 
 void BlenderSession::create_session()
 {
-	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
 	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
+	bool is_cpu = session_params.device.type == DEVICE_CPU;
+	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background, is_cpu);
 	bool session_pause = BlenderSync::get_session_pause(b_scene, background);
 
 	/* reset status/progress */
 	last_status = "";
+	last_error = "";
 	last_progress = -1.0f;
 	start_resize_time = 0.0;
 
@@ -111,13 +116,18 @@ void BlenderSession::create_session()
 	session->set_pause(session_pause);
 
 	/* create sync */
-	sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress, session_params.device.type == DEVICE_CPU);
+	sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress, is_cpu);
 
 	if(b_v3d) {
 		if(session_pause == false) {
 			/* full data sync */
 			sync->sync_view(b_v3d, b_rv3d, width, height);
-			sync->sync_data(b_v3d, b_engine.camera_override(), &python_thread_state);
+			sync->sync_data(b_render,
+			                b_v3d,
+			                b_engine.camera_override(),
+			                width, height,
+			                &python_thread_state,
+			                b_rlay_name.c_str());
 		}
 	}
 	else {
@@ -129,7 +139,7 @@ void BlenderSession::create_session()
 	}
 
 	/* set buffer parameters */
-	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height);
+	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height);
 	session->reset(buffer_params, session_params.samples);
 
 	b_engine.use_highlight_tiles(session_params.progressive_refine == false);
@@ -141,8 +151,9 @@ void BlenderSession::reset_session(BL::BlendData b_data_, BL::Scene b_scene_)
 	b_render = b_engine.render();
 	b_scene = b_scene_;
 
-	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
 	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
+	const bool is_cpu = session_params.device.type == DEVICE_CPU;
+	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background, is_cpu);
 
 	width = render_resolution_x(b_render);
 	height = render_resolution_y(b_render);
@@ -173,7 +184,7 @@ void BlenderSession::reset_session(BL::BlendData b_data_, BL::Scene b_scene_)
 	session->stats.mem_peak = session->stats.mem_used;
 
 	/* sync object should be re-created */
-	sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress, session_params.device.type == DEVICE_CPU);
+	sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress, is_cpu);
 
 	/* for final render we will do full data sync per render layer, only
 	 * do some basic syncing here, no objects or materials for speed */
@@ -181,7 +192,7 @@ void BlenderSession::reset_session(BL::BlendData b_data_, BL::Scene b_scene_)
 	sync->sync_integrator();
 	sync->sync_camera(b_render, b_engine.camera_override(), width, height);
 
-	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, PointerRNA_NULL, PointerRNA_NULL, scene->camera, width, height);
+	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, PointerRNA_NULL, PointerRNA_NULL, scene->camera, width, height);
 	session->reset(buffer_params, session_params.samples);
 
 	b_engine.use_highlight_tiles(session_params.progressive_refine == false);
@@ -261,6 +272,18 @@ static PassType get_pass_type(BL::RenderPass b_pass)
 		case BL::RenderPass::type_SPECULAR:
 		case BL::RenderPass::type_REFLECTION:
 			return PASS_NONE;
+#ifdef WITH_CYCLES_DEBUG
+		case BL::RenderPass::type_DEBUG:
+		{
+			if(b_pass.debug_type() == BL::RenderPass::debug_type_BVH_TRAVERSAL_STEPS)
+				return PASS_BVH_TRAVERSAL_STEPS;
+			if(b_pass.debug_type() == BL::RenderPass::debug_type_BVH_TRAVERSED_INSTANCES)
+				return PASS_BVH_TRAVERSED_INSTANCES;
+			if(b_pass.debug_type() == BL::RenderPass::debug_type_RAY_BOUNCES)
+				return PASS_RAY_BOUNCES;
+			break;
+		}
+#endif
 	}
 	
 	return PASS_NONE;
@@ -318,9 +341,9 @@ static ShaderEvalType get_shader_type(const string& pass_type)
 		return SHADER_EVAL_BAKE;
 }
 
-static BL::RenderResult begin_render_result(BL::RenderEngine b_engine, int x, int y, int w, int h, const char *layername)
+static BL::RenderResult begin_render_result(BL::RenderEngine b_engine, int x, int y, int w, int h, const char *layername, const char *viewname)
 {
-	return b_engine.begin_result(x, y, w, h, layername);
+	return b_engine.begin_result(x, y, w, h, layername, viewname);  /* thin forwarder; render() passes a NULL viewname for its temporary result */
 }
 
 static void end_render_result(BL::RenderEngine b_engine, BL::RenderResult b_rr, bool cancel, bool do_merge_results)
@@ -337,10 +360,10 @@ void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_upda
 	int h = params.height;
 
 	/* get render result */
-	BL::RenderResult b_rr = begin_render_result(b_engine, x, y, w, h, b_rlay_name.c_str());
+	BL::RenderResult b_rr = begin_render_result(b_engine, x, y, w, h, b_rlay_name.c_str(), b_rview_name.c_str());
 
 	/* can happen if the intersected rectangle gives 0 width or height */
-	if (b_rr.ptr.data == NULL) {
+	if(b_rr.ptr.data == NULL) {
 		return;
 	}
 
@@ -353,10 +376,10 @@ void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_upda
 
 	BL::RenderLayer b_rlay = *b_single_rlay;
 
-	if (do_update_only) {
+	if(do_update_only) {
 		/* update only needed */
 
-		if (rtile.sample != 0) {
+		if(rtile.sample != 0) {
 			/* sample would be zero at initial tile update, which is only needed
 			 * to tag tile form blender side as IN PROGRESS for proper highlight
 			 * no buffers should be sent to blender yet
@@ -384,7 +407,7 @@ void BlenderSession::update_render_tile(RenderTile& rtile)
 	 * be updated in blender side
 	 * would need to be investigated a bit further, but for now shall be fine
 	 */
-	if (!b_engine.is_preview())
+	if(!b_engine.is_preview())
 		do_write_update_render_tile(rtile, true);
 	else
 		do_write_update_render_tile(rtile, false);
@@ -398,17 +421,18 @@ void BlenderSession::render()
 
 	/* get buffer parameters */
 	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
-	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height);
+	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height);
 
 	/* render each layer */
 	BL::RenderSettings r = b_scene.render();
-	BL::RenderSettings::layers_iterator b_iter;
+	BL::RenderSettings::layers_iterator b_layer_iter;
+	BL::RenderResult::views_iterator b_view_iter;
 	
-	for(r.layers.begin(b_iter); b_iter != r.layers.end(); ++b_iter) {
-		b_rlay_name = b_iter->name();
+	for(r.layers.begin(b_layer_iter); b_layer_iter != r.layers.end(); ++b_layer_iter) {
+		b_rlay_name = b_layer_iter->name();
 
-		/* temporary render result to find needed passes */
-		BL::RenderResult b_rr = begin_render_result(b_engine, 0, 0, 1, 1, b_rlay_name.c_str());
+		/* temporary render result to find needed passes and views */
+		BL::RenderResult b_rr = begin_render_result(b_engine, 0, 0, 1, 1, b_rlay_name.c_str(), NULL);
 		BL::RenderResult::layers_iterator b_single_rlay;
 		b_rr.layers.begin(b_single_rlay);
 
@@ -440,39 +464,59 @@ void BlenderSession::render()
 			}
 		}
 
-		/* free result without merging */
-		end_render_result(b_engine, b_rr, true, false);
-
 		buffer_params.passes = passes;
-		scene->film->pass_alpha_threshold = b_iter->pass_alpha_threshold();
+		scene->film->pass_alpha_threshold = b_layer_iter->pass_alpha_threshold();
 		scene->film->tag_passes_update(scene, passes);
 		scene->film->tag_update(scene);
 		scene->integrator->tag_update(scene);
 
-		/* update scene */
-		sync->sync_camera(b_render, b_engine.camera_override(), width, height);
-		sync->sync_data(b_v3d, b_engine.camera_override(), &python_thread_state, b_rlay_name.c_str());
+		for(b_rr.views.begin(b_view_iter); b_view_iter != b_rr.views.end(); ++b_view_iter) {
+			b_rview_name = b_view_iter->name();
 
-		/* update number of samples per layer */
-		int samples = sync->get_layer_samples();
-		bool bound_samples = sync->get_layer_bound_samples();
+			/* set the current view */
+			b_engine.active_view_set(b_rview_name.c_str());
 
-		if(samples != 0 && (!bound_samples || (samples < session_params.samples)))
-			session->reset(buffer_params, samples);
-		else
-			session->reset(buffer_params, session_params.samples);
+			/* update scene */
+			sync->sync_camera(b_render, b_engine.camera_override(), width, height);
+			sync->sync_data(b_render,
+			                b_v3d,
+			                b_engine.camera_override(),
+			                width, height,
+			                &python_thread_state,
+			                b_rlay_name.c_str());
 
-		/* render */
-		session->start();
-		session->wait();
+			/* update number of samples per layer */
+			int samples = sync->get_layer_samples();
+			bool bound_samples = sync->get_layer_bound_samples();
+
+			if(samples != 0 && (!bound_samples || (samples < session_params.samples)))
+				session->reset(buffer_params, samples);
+			else
+				session->reset(buffer_params, session_params.samples);
+
+			/* render */
+			session->start();
+			session->wait();
+
+			if(session->progress.get_cancel())
+				break;
+		}
+
+		/* free result without merging */
+		end_render_result(b_engine, b_rr, true, false);
 
 		if(session->progress.get_cancel())
 			break;
 	}
 
+	double total_time, render_time;
+	session->progress.get_time(total_time, render_time);
+	VLOG(1) << "Total render time: " << total_time;
+	VLOG(1) << "Render time (without synchronization): " << render_time;
+
 	/* clear callback */
-	session->write_render_tile_cb = NULL;
-	session->update_render_tile_cb = NULL;
+	session->write_render_tile_cb = function_null;
+	session->update_render_tile_cb = function_null;
 
 	/* free all memory used (host and device), so we wouldn't leave render
 	 * engine with extra memory allocated
@@ -484,23 +528,32 @@ void BlenderSession::render()
 	sync = NULL;
 }
 
-static void populate_bake_data(BakeData *data, BL::BakePixel pixel_array, const int num_pixels)
+static void populate_bake_data(BakeData *data, const int object_id, BL::BakePixel pixel_array, const int num_pixels)
 {
 	BL::BakePixel bp = pixel_array;
 
 	int i;
 	for(i=0; i < num_pixels; i++) {
-		data->set(i, bp.primitive_id(), bp.uv(), bp.du_dx(), bp.du_dy(), bp.dv_dx(), bp.dv_dy());
+		if(bp.object_id() == object_id) {  /* pixel's object id matches the requested one: store its bake data */
+			data->set(i, bp.primitive_id(), bp.uv(), bp.du_dx(), bp.du_dy(), bp.dv_dx(), bp.dv_dy());
+		} else {
+			data->set_null(i);  /* pixel of a different object: mark entry i as null */
+		}
 		bp = bp.next();
 	}
 }
 
-void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::BakePixel pixel_array, const size_t num_pixels, const int depth, float result[])
+void BlenderSession::bake(BL::Object b_object, const string& pass_type, const int object_id, BL::BakePixel pixel_array, const size_t num_pixels, const int /*depth*/, float result[])
 {
 	ShaderEvalType shader_type = get_shader_type(pass_type);
 	size_t object_index = OBJECT_NONE;
 	int tri_offset = 0;
 
+	/* Set baking flag in advance, so kernel loading can check if we need
+	 * any baking capabilities.
+	 */
+	scene->bake_manager->set_baking(true);
+
 	/* ensure kernels are loaded before we do any scene updates */
 	session->load_kernels();
 
@@ -523,14 +576,18 @@ void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::Bake
 
 	/* update scene */
 	sync->sync_camera(b_render, b_engine.camera_override(), width, height);
-	sync->sync_data(b_v3d, b_engine.camera_override(), &python_thread_state);
+	sync->sync_data(b_render,
+	                b_v3d,
+	                b_engine.camera_override(),
+	                width, height,
+	                &python_thread_state,
+	                b_rlay_name.c_str());
 
 	/* get buffer parameters */
 	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
-	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height);
+	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height);
 
 	scene->bake_manager->set_shader_limit((size_t)b_engine.tile_x(), (size_t)b_engine.tile_y());
-	scene->bake_manager->set_baking(true);
 
 	/* set number of samples */
 	session->tile_manager.set_samples(session_params.samples);
@@ -551,7 +608,7 @@ void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::Bake
 
 	BakeData *bake_data = scene->bake_manager->init(object, tri_offset, num_pixels);
 
-	populate_bake_data(bake_data, pixel_array, num_pixels);
+	populate_bake_data(bake_data, object_id, pixel_array, num_pixels);
 
 	/* set number of samples */
 	session->tile_manager.set_samples(session_params.samples);
@@ -585,7 +642,7 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult b_rr, BL::Re
 
 	vector<float> pixels(params.width*params.height*4);
 
-	if (!do_update_only) {
+	if(!do_update_only) {
 		/* copy each pass */
 		BL::RenderLayer::passes_iterator b_iter;
 
@@ -603,10 +660,12 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult b_rr, BL::Re
 			b_pass.rect(&pixels[0]);
 		}
 	}
-
-	/* copy combined pass */
-	if(buffers->get_pass_rect(PASS_COMBINED, exposure, rtile.sample, 4, &pixels[0]))
-		b_rlay.rect(&pixels[0]);
+	else {
+		/* copy combined pass */
+		BL::RenderPass b_combined_pass(b_rlay.passes.find_by_type(BL::RenderPass::type_COMBINED, b_rview_name.c_str()));
+		if(buffers->get_pass_rect(PASS_COMBINED, exposure, rtile.sample, 4, &pixels[0]))
+			b_combined_pass.rect(&pixels[0]);
+	}
 
 	/* tag result as updated */
 	b_engine.update_result(b_rr);
@@ -629,8 +688,9 @@ void BlenderSession::synchronize()
 		return;
 
 	/* on session/scene parameter changes, we recreate session entirely */
-	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
 	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
+	const bool is_cpu = session_params.device.type == DEVICE_CPU;
+	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background, is_cpu);
 	bool session_pause = BlenderSync::get_session_pause(b_scene, background);
 
 	if(session->params.modified(session_params) ||
@@ -663,7 +723,12 @@ void BlenderSession::synchronize()
 	}
 
 	/* data and camera synchronize */
-	sync->sync_data(b_v3d, b_engine.camera_override(), &python_thread_state);
+	sync->sync_data(b_render,
+	                b_v3d,
+	                b_engine.camera_override(),
+	                width, height,
+	                &python_thread_state,
+	                b_rlay_name.c_str());
 
 	if(b_rv3d)
 		sync->sync_view(b_v3d, b_rv3d, width, height);
@@ -675,7 +740,7 @@ void BlenderSession::synchronize()
 
 	/* reset if needed */
 	if(scene->need_reset()) {
-		BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height);
+		BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height);
 		session->reset(buffer_params, session_params.samples);
 
 		/* reset time */
@@ -730,7 +795,7 @@ bool BlenderSession::draw(int w, int h)
 		/* reset if requested */
 		if(reset) {
 			SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
-			BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height);
+			BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height);
 			bool session_pause = BlenderSync::get_session_pause(b_scene, background);
 
 			if(session_pause == false) {
@@ -747,7 +812,7 @@ bool BlenderSession::draw(int w, int h)
 	update_status_progress();
 
 	/* draw */
-	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height);
+	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height);
 	DeviceDrawParams draw_params;
 
 	if(session->params.display_buffer_linear) {
@@ -763,19 +828,23 @@ void BlenderSession::get_status(string& status, string& substatus)
 	session->progress.get_status(status, substatus);
 }
 
-void BlenderSession::get_progress(float& progress, double& total_time)
+void BlenderSession::get_progress(float& progress, double& total_time, double& render_time)
 {
 	double tile_time;
 	int tile, sample, samples_per_tile;
 	int tile_total = session->tile_manager.state.num_tiles;
+	int samples = session->tile_manager.state.sample + 1;  /* presumably the 1-based current sample - confirm */
+	int total_samples = session->tile_manager.num_samples;  /* same source value as samples_per_tile below */
 
-	session->progress.get_tile(tile, total_time, tile_time);
+	session->progress.get_tile(tile, total_time, render_time, tile_time);
 
 	sample = session->progress.get_sample();
 	samples_per_tile = session->tile_manager.num_samples;
 
-	if(samples_per_tile && tile_total)
+	if(background && samples_per_tile && tile_total)  /* background: progress sample counter over (tile count * samples per tile) */
 		progress = ((float)sample / (float)(tile_total * samples_per_tile));
+	else if(!background && samples > 0 && total_samples != USHRT_MAX)  /* NOTE(review): USHRT_MAX presumably marks an unbounded sample count - confirm */
+		progress = ((float)samples) / total_samples;
 	else
 		progress = 0.0;
 }
@@ -805,40 +874,36 @@ void BlenderSession::update_status_progress()
 	string timestatus, status, substatus;
 	string scene = "";
 	float progress;
-	double total_time, remaining_time = 0;
+	double total_time, remaining_time = 0, render_time;
 	char time_str[128];
 	float mem_used = (float)session->stats.mem_used / 1024.0f / 1024.0f;
 	float mem_peak = (float)session->stats.mem_peak / 1024.0f / 1024.0f;
-	int samples = session->tile_manager.state.sample + 1;
-	int total_samples = session->tile_manager.num_samples;
 
 	get_status(status, substatus);
-	get_progress(progress, total_time);
+	get_progress(progress, total_time, render_time);
 
-	
+	if(progress > 0)
+		remaining_time = (1.0 - (double)progress) * (render_time / (double)progress);
 
 	if(background) {
-		if(progress>0)
-			remaining_time = (1-progress) * (total_time / progress);
-
 		scene += " | " + b_scene.name();
 		if(b_rlay_name != "")
 			scene += ", "  + b_rlay_name;
+
+		if(b_rview_name != "")
+			scene += ", " + b_rview_name;
 	}
 	else {
-		BLI_timestr(total_time, time_str, sizeof(time_str));
+		BLI_timecode_string_from_time_simple(time_str, sizeof(time_str), total_time);
 		timestatus = "Time:" + string(time_str) + " | ";
-
-		if(samples > 0 && total_samples != USHRT_MAX)
-			remaining_time = (total_samples - samples) * (total_time / samples);
 	}
-	
-	if(remaining_time>0) {
-		BLI_timestr(remaining_time, time_str, sizeof(time_str));
+
+	if(remaining_time > 0) {
+		BLI_timecode_string_from_time_simple(time_str, sizeof(time_str), remaining_time);
 		timestatus += "Remaining:" + string(time_str) + " | ";
 	}
-	
-	timestatus += string_printf("Mem:%.2fM, Peak:%.2fM", mem_used, mem_peak);
+
+	timestatus += string_printf("Mem:%.2fM, Peak:%.2fM", (double)mem_used, (double)mem_peak);
 
 	if(status.size() > 0)
 		status = " | " + status;
@@ -854,6 +919,21 @@ void BlenderSession::update_status_progress()
 		b_engine.update_progress(progress);
 		last_progress = progress;
 	}
+
+	if(session->progress.get_error()) {
+		string error = session->progress.get_error_message();
+		if(error != last_error) {  /* skip messages identical to the last one reported */
+			/* TODO(sergey): Currently the C++ RNA API doesn't let us
+			 * use a mnemonic name for the report type. Would be nice
+			 * to have this figured out.
+			 *
+			 * Until then, 1 << 5 means RPT_ERROR.
+			 */
+			b_engine.report(1 << 5, error.c_str());
+			b_engine.error_set(error.c_str());
+			last_error = error;
+		}
+	}
 }
 
 void BlenderSession::tag_update()
@@ -952,6 +1032,18 @@ void BlenderSession::builtin_image_info(const string &builtin_name, void *builti
 
 		is_float = true;
 	}
+	else {
+		/* TODO(sergey): Check we're indeed in shader node tree. */
+		PointerRNA ptr;
+		RNA_pointer_create(NULL, &RNA_Node, builtin_data, &ptr);  /* treat builtin_data as a node */
+		BL::Node b_node(ptr);
+		if(b_node.is_a(&RNA_ShaderNodeTexPointDensity)) {
+			BL::ShaderNodeTexPointDensity b_point_density_node(b_node);
+			channels = 4;
+			width = height = depth = b_point_density_node.resolution();  /* one resolution value is used for all three axes */
+			is_float = true;
+		}
+	}
 }
 
 bool BlenderSession::builtin_image_pixels(const string &builtin_name, void *builtin_data, unsigned char *pixels)
@@ -971,18 +1063,19 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name, void *buil
 
 	unsigned char *image_pixels;
 	image_pixels = image_get_pixels_for_frame(b_image, frame);
+	size_t num_pixels = ((size_t)width) * height;
 
 	if(image_pixels) {
-		memcpy(pixels, image_pixels, width * height * channels * sizeof(unsigned char));
+		memcpy(pixels, image_pixels, num_pixels * channels * sizeof(unsigned char));
 		MEM_freeN(image_pixels);
 	}
 	else {
 		if(channels == 1) {
-			memset(pixels, 0, width * height * sizeof(unsigned char));
+			memset(pixels, 0, num_pixels * sizeof(unsigned char));
 		}
 		else {
 			unsigned char *cp = pixels;
-			for(int i = 0; i < width * height; i++, cp += channels) {
+			for(size_t i = 0; i < num_pixels; i++, cp += channels) {
 				cp[0] = 255;
 				cp[1] = 0;
 				cp[2] = 255;
@@ -994,7 +1087,7 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name, void *buil
 
 	/* premultiply, byte images are always straight for blender */
 	unsigned char *cp = pixels;
-	for(int i = 0; i < width * height; i++, cp += channels) {
+	for(size_t i = 0; i < num_pixels; i++, cp += channels) {
 		cp[0] = (cp[0] * cp[3]) >> 8;
 		cp[1] = (cp[1] * cp[3]) >> 8;
 		cp[2] = (cp[2] * cp[3]) >> 8;
@@ -1023,18 +1116,19 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void
 
 		float *image_pixels;
 		image_pixels = image_get_float_pixels_for_frame(b_image, frame);
+		size_t num_pixels = ((size_t)width) * height;
 
 		if(image_pixels) {
-			memcpy(pixels, image_pixels, width * height * channels * sizeof(float));
+			memcpy(pixels, image_pixels, num_pixels * channels * sizeof(float));
 			MEM_freeN(image_pixels);
 		}
 		else {
 			if(channels == 1) {
-				memset(pixels, 0, width * height * sizeof(float));
+				memset(pixels, 0, num_pixels * sizeof(float));
 			}
 			else {
 				float *fp = pixels;
-				for(int i = 0; i < width * height; i++, fp += channels) {
+				for(size_t i = 0; i < num_pixels; i++, fp += channels) {  /* size_t index matches num_pixels: no signed/unsigned compare */
 					fp[0] = 1.0f;
 					fp[1] = 0.0f;
 					fp[2] = 1.0f;
@@ -1060,11 +1154,12 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void
 		int width = resolution.x * amplify;
 		int height = resolution.y * amplify;
 		int depth = resolution.z * amplify;
+		size_t num_pixels = ((size_t)width) * height * depth;
 
 		if(builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_DENSITY)) {
 			SmokeDomainSettings_density_grid_get_length(&b_domain.ptr, &length);
 
-			if(length == width*height*depth) {
+			if(length == num_pixels) {
 				SmokeDomainSettings_density_grid_get(&b_domain.ptr, pixels);
 				return true;
 			}
@@ -1074,7 +1169,7 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void
 			 * as 1500..3000 K with the first part faded to zero density */
 			SmokeDomainSettings_flame_grid_get_length(&b_domain.ptr, &length);
 
-			if(length == width*height*depth) {
+			if(length == num_pixels) {
 				SmokeDomainSettings_flame_grid_get(&b_domain.ptr, pixels);
 				return true;
 			}
@@ -1083,7 +1178,7 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void
 			/* the RGB is "premultiplied" by density for better interpolation results */
 			SmokeDomainSettings_color_grid_get_length(&b_domain.ptr, &length);
 
-			if(length == width*height*depth*4) {
+			if(length == num_pixels*4) {
 				SmokeDomainSettings_color_grid_get(&b_domain.ptr, pixels);
 				return true;
 			}
@@ -1091,6 +1186,17 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void
 
 		fprintf(stderr, "Cycles error: unexpected smoke volume resolution, skipping\n");
 	}
+	else {
+		/* TODO(sergey): Check we're indeed in shader node tree. */
+		PointerRNA ptr;
+		RNA_pointer_create(NULL, &RNA_Node, builtin_data, &ptr);  /* treat builtin_data as a node */
+		BL::Node b_node(ptr);
+		if(b_node.is_a(&RNA_ShaderNodeTexPointDensity)) {
+			BL::ShaderNodeTexPointDensity b_point_density_node(b_node);
+			int length;  /* receives the computed length; not read afterwards */
+			b_point_density_node.calc_point_density(b_scene, &length, &pixels);  /* NOTE(review): control still reaches 'return false' below - confirm intended */
+		}
+	}
 
 	return false;
 }
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index ac685118b3d..708776dc8ca 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __BLENDER_SESSION_H__
@@ -52,7 +52,7 @@ public:
 	/* offline render */
 	void render();
 
-	void bake(BL::Object b_object, const string& pass_type, BL::BakePixel pixel_array, const size_t num_pixels, const int depth, float pixels[]);
+	void bake(BL::Object b_object, const string& pass_type, const int object_id, BL::BakePixel pixel_array, const size_t num_pixels, const int depth, float pixels[]);
 
 	void write_render_result(BL::RenderResult b_rr, BL::RenderLayer b_rlay, RenderTile& rtile);
 	void write_render_tile(RenderTile& rtile);
@@ -70,12 +70,13 @@ public:
 	void tag_redraw();
 	void tag_update();
 	void get_status(string& status, string& substatus);
-	void get_progress(float& progress, double& total_time);
+	void get_progress(float& progress, double& total_time, double& render_time);
 	void test_cancel();
 	void update_status_progress();
 	void update_bake_progress();
 
 	bool background;
+	static bool headless;
 	Session *session;
 	Scene *scene;
 	BlenderSync *sync;
@@ -89,8 +90,10 @@ public:
 	BL::SpaceView3D b_v3d;
 	BL::RegionView3D b_rv3d;
 	string b_rlay_name;
+	string b_rview_name;
 
 	string last_status;
+	string last_error;
 	float last_progress;
 
 	int width, height;
diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp
index 33c7bf5f859..42aab52e294 100644
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "background.h"
@@ -22,6 +22,7 @@
 #include "scene.h"
 #include "shader.h"
 
+#include "blender_texture.h"
 #include "blender_sync.h"
 #include "blender_util.h"
 
@@ -106,7 +107,7 @@ static ShaderSocketType convert_socket_type(BL::NodeSocket b_socket)
 	}
 }
 
-static void set_default_value(ShaderInput *input, BL::Node b_node, BL::NodeSocket b_sock, BL::BlendData b_data, BL::ID b_id)
+static void set_default_value(ShaderInput *input, BL::NodeSocket b_sock, BL::BlendData b_data, BL::ID b_id)
 {
 	/* copy values for non linked inputs */
 	switch(input->type) {
@@ -179,53 +180,59 @@ static bool is_output_node(BL::Node b_node)
 		    || b_node.is_a(&RNA_ShaderNodeOutputLamp));
 }
 
-static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, ShaderGraph *graph, BL::ShaderNodeTree b_ntree, BL::ShaderNode b_node)
+static ShaderNode *add_node(Scene *scene,
+                            BL::RenderEngine b_engine,
+                            BL::BlendData b_data,
+                            BL::Scene b_scene,
+                            ShaderGraph *graph,
+                            BL::ShaderNodeTree b_ntree,
+                            BL::ShaderNode b_node)
 {
 	ShaderNode *node = NULL;
 
 	/* existing blender nodes */
-	if (b_node.is_a(&RNA_ShaderNodeRGBCurve)) {
+	if(b_node.is_a(&RNA_ShaderNodeRGBCurve)) {
 		BL::ShaderNodeRGBCurve b_curve_node(b_node);
 		RGBCurvesNode *curves = new RGBCurvesNode();
 		curvemapping_color_to_array(b_curve_node.mapping(), curves->curves, RAMP_TABLE_SIZE, true);
 		node = curves;
 	}
-	if (b_node.is_a(&RNA_ShaderNodeVectorCurve)) {
+	if(b_node.is_a(&RNA_ShaderNodeVectorCurve)) {
 		BL::ShaderNodeVectorCurve b_curve_node(b_node);
 		VectorCurvesNode *curves = new VectorCurvesNode();
 		curvemapping_color_to_array(b_curve_node.mapping(), curves->curves, RAMP_TABLE_SIZE, false);
 		node = curves;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeValToRGB)) {
+	else if(b_node.is_a(&RNA_ShaderNodeValToRGB)) {
 		RGBRampNode *ramp = new RGBRampNode();
 		BL::ShaderNodeValToRGB b_ramp_node(b_node);
 		colorramp_to_array(b_ramp_node.color_ramp(), ramp->ramp, RAMP_TABLE_SIZE);
 		ramp->interpolate = b_ramp_node.color_ramp().interpolation() != BL::ColorRamp::interpolation_CONSTANT;
 		node = ramp;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeRGB)) {
+	else if(b_node.is_a(&RNA_ShaderNodeRGB)) {
 		ColorNode *color = new ColorNode();
 		color->value = get_node_output_rgba(b_node, "Color");
 		node = color;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeValue)) {
+	else if(b_node.is_a(&RNA_ShaderNodeValue)) {
 		ValueNode *value = new ValueNode();
 		value->value = get_node_output_value(b_node, "Value");
 		node = value;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeCameraData)) {
+	else if(b_node.is_a(&RNA_ShaderNodeCameraData)) {
 		node = new CameraNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeInvert)) {
+	else if(b_node.is_a(&RNA_ShaderNodeInvert)) {
 		node = new InvertNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeGamma)) {
+	else if(b_node.is_a(&RNA_ShaderNodeGamma)) {
 		node = new GammaNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBrightContrast)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBrightContrast)) {
 		node = new BrightContrastNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeMixRGB)) {
+	else if(b_node.is_a(&RNA_ShaderNodeMixRGB)) {
 		BL::ShaderNodeMixRGB b_mix_node(b_node);
 		MixNode *mix = new MixNode();
 		mix->type = MixNode::type_enum[b_mix_node.blend_type()];
@@ -236,44 +243,44 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		mix->use_clamp = b_mix_node.use_clamp();
 		node = mix;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeSeparateRGB)) {
+	else if(b_node.is_a(&RNA_ShaderNodeSeparateRGB)) {
 		node = new SeparateRGBNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeCombineRGB)) {
+	else if(b_node.is_a(&RNA_ShaderNodeCombineRGB)) {
 		node = new CombineRGBNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeSeparateHSV)) {
+	else if(b_node.is_a(&RNA_ShaderNodeSeparateHSV)) {
 		node = new SeparateHSVNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeCombineHSV)) {
+	else if(b_node.is_a(&RNA_ShaderNodeCombineHSV)) {
 		node = new CombineHSVNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeSeparateXYZ)) {
+	else if(b_node.is_a(&RNA_ShaderNodeSeparateXYZ)) {
 		node = new SeparateXYZNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeCombineXYZ)) {
+	else if(b_node.is_a(&RNA_ShaderNodeCombineXYZ)) {
 		node = new CombineXYZNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeHueSaturation)) {
+	else if(b_node.is_a(&RNA_ShaderNodeHueSaturation)) {
 		node = new HSVNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeRGBToBW)) {
+	else if(b_node.is_a(&RNA_ShaderNodeRGBToBW)) {
 		node = new ConvertNode(SHADER_SOCKET_COLOR, SHADER_SOCKET_FLOAT);
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeMath)) {
+	else if(b_node.is_a(&RNA_ShaderNodeMath)) {
 		BL::ShaderNodeMath b_math_node(b_node);
 		MathNode *math = new MathNode();
 		math->type = MathNode::type_enum[b_math_node.operation()];
 		math->use_clamp = b_math_node.use_clamp();
 		node = math;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeVectorMath)) {
+	else if(b_node.is_a(&RNA_ShaderNodeVectorMath)) {
 		BL::ShaderNodeVectorMath b_vector_math_node(b_node);
 		VectorMathNode *vmath = new VectorMathNode();
 		vmath->type = VectorMathNode::type_enum[b_vector_math_node.operation()];
 		node = vmath;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeVectorTransform)) {
+	else if(b_node.is_a(&RNA_ShaderNodeVectorTransform)) {
 		BL::ShaderNodeVectorTransform b_vector_transform_node(b_node);
 		VectorTransformNode *vtransform = new VectorTransformNode();
 		vtransform->type = VectorTransformNode::type_enum[b_vector_transform_node.type()];
@@ -281,7 +288,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		vtransform->convert_to = VectorTransformNode::convert_space_enum[b_vector_transform_node.convert_to()];
 		node = vtransform;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeNormal)) {
+	else if(b_node.is_a(&RNA_ShaderNodeNormal)) {
 		BL::Node::outputs_iterator out_it;
 		b_node.outputs.begin(out_it);
 
@@ -289,7 +296,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		norm->direction = get_node_output_vector(b_node, "Normal");
 		node = norm;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeMapping)) {
+	else if(b_node.is_a(&RNA_ShaderNodeMapping)) {
 		BL::ShaderNodeMapping b_mapping_node(b_node);
 		MappingNode *mapping = new MappingNode();
 
@@ -297,31 +304,31 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 
 		node = mapping;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeFresnel)) {
+	else if(b_node.is_a(&RNA_ShaderNodeFresnel)) {
 		node = new FresnelNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeLayerWeight)) {
+	else if(b_node.is_a(&RNA_ShaderNodeLayerWeight)) {
 		node = new LayerWeightNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeAddShader)) {
+	else if(b_node.is_a(&RNA_ShaderNodeAddShader)) {
 		node = new AddClosureNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeMixShader)) {
+	else if(b_node.is_a(&RNA_ShaderNodeMixShader)) {
 		node = new MixClosureNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeAttribute)) {
+	else if(b_node.is_a(&RNA_ShaderNodeAttribute)) {
 		BL::ShaderNodeAttribute b_attr_node(b_node);
 		AttributeNode *attr = new AttributeNode();
 		attr->attribute = b_attr_node.attribute_name();
 		node = attr;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBackground)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBackground)) {
 		node = new BackgroundNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeHoldout)) {
+	else if(b_node.is_a(&RNA_ShaderNodeHoldout)) {
 		node = new HoldoutNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfAnisotropic)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfAnisotropic)) {
 		BL::ShaderNodeBsdfAnisotropic b_aniso_node(b_node);
 		AnisotropicBsdfNode *aniso = new AnisotropicBsdfNode();
 
@@ -340,10 +347,10 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 
 		node = aniso;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfDiffuse)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfDiffuse)) {
 		node = new DiffuseBsdfNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeSubsurfaceScattering)) {
+	else if(b_node.is_a(&RNA_ShaderNodeSubsurfaceScattering)) {
 		BL::ShaderNodeSubsurfaceScattering b_subsurface_node(b_node);
 
 		SubsurfaceScatteringNode *subsurface = new SubsurfaceScatteringNode();
@@ -359,7 +366,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 
 		node = subsurface;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfGlossy)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfGlossy)) {
 		BL::ShaderNodeBsdfGlossy b_glossy_node(b_node);
 		GlossyBsdfNode *glossy = new GlossyBsdfNode();
 		
@@ -379,7 +386,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		}
 		node = glossy;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfGlass)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfGlass)) {
 		BL::ShaderNodeBsdfGlass b_glass_node(b_node);
 		GlassBsdfNode *glass = new GlassBsdfNode();
 		switch(b_glass_node.distribution()) {
@@ -395,7 +402,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		}
 		node = glass;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfRefraction)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfRefraction)) {
 		BL::ShaderNodeBsdfRefraction b_refraction_node(b_node);
 		RefractionBsdfNode *refraction = new RefractionBsdfNode();
 		switch(b_refraction_node.distribution()) {
@@ -411,7 +418,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		}
 		node = refraction;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfToon)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfToon)) {
 		BL::ShaderNodeBsdfToon b_toon_node(b_node);
 		ToonBsdfNode *toon = new ToonBsdfNode();
 		switch(b_toon_node.component()) {
@@ -424,7 +431,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		}
 		node = toon;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfHair)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfHair)) {
 		BL::ShaderNodeBsdfHair b_hair_node(b_node);
 		HairBsdfNode *hair = new HairBsdfNode();
 		switch(b_hair_node.component()) {
@@ -437,64 +444,64 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		}
 		node = hair;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfTranslucent)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfTranslucent)) {
 		node = new TranslucentBsdfNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfTransparent)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfTransparent)) {
 		node = new TransparentBsdfNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfVelvet)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfVelvet)) {
 		node = new VelvetBsdfNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeEmission)) {
+	else if(b_node.is_a(&RNA_ShaderNodeEmission)) {
 		node = new EmissionNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeAmbientOcclusion)) {
+	else if(b_node.is_a(&RNA_ShaderNodeAmbientOcclusion)) {
 		node = new AmbientOcclusionNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeVolumeScatter)) {
+	else if(b_node.is_a(&RNA_ShaderNodeVolumeScatter)) {
 		node = new ScatterVolumeNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeVolumeAbsorption)) {
+	else if(b_node.is_a(&RNA_ShaderNodeVolumeAbsorption)) {
 		node = new AbsorptionVolumeNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeNewGeometry)) {
+	else if(b_node.is_a(&RNA_ShaderNodeNewGeometry)) {
 		node = new GeometryNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeWireframe)) {
+	else if(b_node.is_a(&RNA_ShaderNodeWireframe)) {
 		BL::ShaderNodeWireframe b_wireframe_node(b_node);
 		WireframeNode *wire = new WireframeNode();
 		wire->use_pixel_size = b_wireframe_node.use_pixel_size();
 		node = wire;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeWavelength)) {
+	else if(b_node.is_a(&RNA_ShaderNodeWavelength)) {
 		node = new WavelengthNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBlackbody)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBlackbody)) {
 		node = new BlackbodyNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeLightPath)) {
+	else if(b_node.is_a(&RNA_ShaderNodeLightPath)) {
 		node = new LightPathNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeLightFalloff)) {
+	else if(b_node.is_a(&RNA_ShaderNodeLightFalloff)) {
 		node = new LightFalloffNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeObjectInfo)) {
+	else if(b_node.is_a(&RNA_ShaderNodeObjectInfo)) {
 		node = new ObjectInfoNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeParticleInfo)) {
+	else if(b_node.is_a(&RNA_ShaderNodeParticleInfo)) {
 		node = new ParticleInfoNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeHairInfo)) {
+	else if(b_node.is_a(&RNA_ShaderNodeHairInfo)) {
 		node = new HairInfoNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBump)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBump)) {
 		BL::ShaderNodeBump b_bump_node(b_node);
 		BumpNode *bump = new BumpNode();
 		bump->invert = b_bump_node.invert();
 		node = bump;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeScript)) {
+	else if(b_node.is_a(&RNA_ShaderNodeScript)) {
 #ifdef WITH_OSL
 		if(scene->shader_manager->use_osl()) {
 			/* create script node */
@@ -510,16 +517,16 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 			 * Socket names must be stored in the extra lists instead. */
 			BL::Node::inputs_iterator b_input;
 
-			for (b_script_node.inputs.begin(b_input); b_input != b_script_node.inputs.end(); ++b_input) {
+			for(b_script_node.inputs.begin(b_input); b_input != b_script_node.inputs.end(); ++b_input) {
 				script_node->input_names.push_back(ustring(b_input->name()));
 				ShaderInput *input = script_node->add_input(script_node->input_names.back().c_str(),
 				                                            convert_socket_type(*b_input));
-				set_default_value(input, b_node, *b_input, b_data, b_ntree);
+				set_default_value(input, *b_input, b_data, b_ntree);
 			}
 
 			BL::Node::outputs_iterator b_output;
 
-			for (b_script_node.outputs.begin(b_output); b_output != b_script_node.outputs.end(); ++b_output) {
+			for(b_script_node.outputs.begin(b_output); b_output != b_script_node.outputs.end(); ++b_output) {
 				script_node->output_names.push_back(ustring(b_output->name()));
 				script_node->add_output(script_node->output_names.back().c_str(),
 				                        convert_socket_type(*b_output));
@@ -543,9 +550,12 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 
 			node = script_node;
 		}
+#else
+		(void)b_data;
+		(void)b_ntree;
 #endif
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexImage)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexImage)) {
 		BL::ShaderNodeTexImage b_image_node(b_node);
 		BL::Image b_image(b_image_node.image());
 		ImageTextureNode *image = new ImageTextureNode();
@@ -555,7 +565,8 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 			 */
 			bool is_builtin = b_image.packed_file() ||
 			                  b_image.source() == BL::Image::source_GENERATED ||
-			                  b_image.source() == BL::Image::source_MOVIE;
+			                  b_image.source() == BL::Image::source_MOVIE ||
+			                  b_engine.is_preview();
 
 			if(is_builtin) {
 				/* for builtin images we're using image datablock name to find an image to
@@ -578,27 +589,31 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 			image->use_alpha = b_image.use_alpha();
 
 			/* TODO(sergey): Does not work properly when we change builtin type. */
-			if (b_image.is_updated()) {
-				scene->image_manager->tag_reload_image(image->filename,
-				                                       image->builtin_data,
-				                                       (InterpolationType)b_image_node.interpolation());
+			if(b_image.is_updated()) {
+				scene->image_manager->tag_reload_image(
+				        image->filename,
+				        image->builtin_data,
+				        (InterpolationType)b_image_node.interpolation(),
+				        (ExtensionType)b_image_node.extension());
 			}
 		}
 		image->color_space = ImageTextureNode::color_space_enum[(int)b_image_node.color_space()];
 		image->projection = ImageTextureNode::projection_enum[(int)b_image_node.projection()];
 		image->interpolation = (InterpolationType)b_image_node.interpolation();
+		image->extension = (ExtensionType)b_image_node.extension();
 		image->projection_blend = b_image_node.projection_blend();
 		get_tex_mapping(&image->tex_mapping, b_image_node.texture_mapping());
 		node = image;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexEnvironment)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexEnvironment)) {
 		BL::ShaderNodeTexEnvironment b_env_node(b_node);
 		BL::Image b_image(b_env_node.image());
 		EnvironmentTextureNode *env = new EnvironmentTextureNode();
 		if(b_image) {
 			bool is_builtin = b_image.packed_file() ||
 			                  b_image.source() == BL::Image::source_GENERATED ||
-			                  b_image.source() == BL::Image::source_MOVIE;
+			                  b_image.source() == BL::Image::source_MOVIE ||
+			                  b_engine.is_preview();
 
 			if(is_builtin) {
 				int scene_frame = b_scene.frame_current();
@@ -615,10 +630,11 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 			env->use_alpha = b_image.use_alpha();
 
 			/* TODO(sergey): Does not work properly when we change builtin type. */
-			if (b_image.is_updated()) {
+			if(b_image.is_updated()) {
 				scene->image_manager->tag_reload_image(env->filename,
 				                                       env->builtin_data,
-				                                       INTERPOLATION_LINEAR);
+				                                       INTERPOLATION_LINEAR,
+				                                       EXTENSION_REPEAT);
 			}
 		}
 		env->color_space = EnvironmentTextureNode::color_space_enum[(int)b_env_node.color_space()];
@@ -626,41 +642,41 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		get_tex_mapping(&env->tex_mapping, b_env_node.texture_mapping());
 		node = env;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexGradient)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexGradient)) {
 		BL::ShaderNodeTexGradient b_gradient_node(b_node);
 		GradientTextureNode *gradient = new GradientTextureNode();
 		gradient->type = GradientTextureNode::type_enum[(int)b_gradient_node.gradient_type()];
 		get_tex_mapping(&gradient->tex_mapping, b_gradient_node.texture_mapping());
 		node = gradient;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexVoronoi)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexVoronoi)) {
 		BL::ShaderNodeTexVoronoi b_voronoi_node(b_node);
 		VoronoiTextureNode *voronoi = new VoronoiTextureNode();
 		voronoi->coloring = VoronoiTextureNode::coloring_enum[(int)b_voronoi_node.coloring()];
 		get_tex_mapping(&voronoi->tex_mapping, b_voronoi_node.texture_mapping());
 		node = voronoi;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexMagic)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexMagic)) {
 		BL::ShaderNodeTexMagic b_magic_node(b_node);
 		MagicTextureNode *magic = new MagicTextureNode();
 		magic->depth = b_magic_node.turbulence_depth();
 		get_tex_mapping(&magic->tex_mapping, b_magic_node.texture_mapping());
 		node = magic;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexWave)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexWave)) {
 		BL::ShaderNodeTexWave b_wave_node(b_node);
 		WaveTextureNode *wave = new WaveTextureNode();
 		wave->type = WaveTextureNode::type_enum[(int)b_wave_node.wave_type()];
 		get_tex_mapping(&wave->tex_mapping, b_wave_node.texture_mapping());
 		node = wave;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexChecker)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexChecker)) {
 		BL::ShaderNodeTexChecker b_checker_node(b_node);
 		CheckerTextureNode *checker = new CheckerTextureNode();
 		get_tex_mapping(&checker->tex_mapping, b_checker_node.texture_mapping());
 		node = checker;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexBrick)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexBrick)) {
 		BL::ShaderNodeTexBrick b_brick_node(b_node);
 		BrickTextureNode *brick = new BrickTextureNode();
 		brick->offset = b_brick_node.offset();
@@ -670,26 +686,30 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		get_tex_mapping(&brick->tex_mapping, b_brick_node.texture_mapping());
 		node = brick;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexNoise)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexNoise)) {
 		BL::ShaderNodeTexNoise b_noise_node(b_node);
 		NoiseTextureNode *noise = new NoiseTextureNode();
 		get_tex_mapping(&noise->tex_mapping, b_noise_node.texture_mapping());
 		node = noise;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexMusgrave)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexMusgrave)) {
 		BL::ShaderNodeTexMusgrave b_musgrave_node(b_node);
 		MusgraveTextureNode *musgrave = new MusgraveTextureNode();
 		musgrave->type = MusgraveTextureNode::type_enum[(int)b_musgrave_node.musgrave_type()];
 		get_tex_mapping(&musgrave->tex_mapping, b_musgrave_node.texture_mapping());
 		node = musgrave;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexCoord)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexCoord)) {
 		BL::ShaderNodeTexCoord b_tex_coord_node(b_node);
 		TextureCoordinateNode *tex_coord = new TextureCoordinateNode();
 		tex_coord->from_dupli = b_tex_coord_node.from_dupli();
+		if(b_tex_coord_node.object()) {
+			tex_coord->use_transform = true;
+			tex_coord->ob_tfm = get_transform(b_tex_coord_node.object().matrix_world());
+		}
 		node = tex_coord;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexSky)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexSky)) {
 		BL::ShaderNodeTexSky b_sky_node(b_node);
 		SkyTextureNode *sky = new SkyTextureNode();
 		sky->type = SkyTextureNode::type_enum[(int)b_sky_node.sky_type()];
@@ -699,14 +719,14 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		get_tex_mapping(&sky->tex_mapping, b_sky_node.texture_mapping());
 		node = sky;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeNormalMap)) {
+	else if(b_node.is_a(&RNA_ShaderNodeNormalMap)) {
 		BL::ShaderNodeNormalMap b_normal_map_node(b_node);
 		NormalMapNode *nmap = new NormalMapNode();
 		nmap->space = NormalMapNode::space_enum[(int)b_normal_map_node.space()];
 		nmap->attribute = b_normal_map_node.uv_map();
 		node = nmap;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTangent)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTangent)) {
 		BL::ShaderNodeTangent b_tangent_node(b_node);
 		TangentNode *tangent = new TangentNode();
 		tangent->direction_type = TangentNode::direction_type_enum[(int)b_tangent_node.direction_type()];
@@ -714,13 +734,43 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		tangent->attribute = b_tangent_node.uv_map();
 		node = tangent;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeUVMap)) {
+	else if(b_node.is_a(&RNA_ShaderNodeUVMap)) {
 		BL::ShaderNodeUVMap b_uvmap_node(b_node);
 		UVMapNode *uvm = new UVMapNode();
 		uvm->attribute = b_uvmap_node.uv_map();
 		uvm->from_dupli = b_uvmap_node.from_dupli();
 		node = uvm;
 	}
+	else if(b_node.is_a(&RNA_ShaderNodeTexPointDensity)) {
+		BL::ShaderNodeTexPointDensity b_point_density_node(b_node);
+		PointDensityTextureNode *point_density = new PointDensityTextureNode();
+		point_density->filename = b_point_density_node.name();
+		point_density->space =
+		        PointDensityTextureNode::space_enum[(int)b_point_density_node.space()];
+		point_density->interpolation =
+		        (InterpolationType)b_point_density_node.interpolation();
+		point_density->builtin_data = b_point_density_node.ptr.data;
+
+		/* Transformation from world space to texture space. */
+		BL::Object b_ob(b_point_density_node.object());
+		if(b_ob) {
+			float3 loc, size;
+			point_density_texture_space(b_point_density_node, loc, size);
+			point_density->tfm =
+			        transform_translate(-loc) * transform_scale(size) *
+			        transform_inverse(get_transform(b_ob.matrix_world()));
+		}
+
+		/* TODO(sergey): Use a more proper update flag. */
+		if(true) {
+			scene->image_manager->tag_reload_image(
+			        point_density->filename,
+			        point_density->builtin_data,
+			        point_density->interpolation,
+			        EXTENSION_REPEAT);
+		}
+		node = point_density;
+	}
 
 	if(node)
 		graph->add(node);
@@ -730,7 +780,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 
 static bool node_use_modified_socket_name(ShaderNode *node)
 {
-	if (node->special_type == SHADER_SPECIAL_TYPE_SCRIPT)
+	if(node->special_type == SHADER_SPECIAL_TYPE_SCRIPT)
 		return false;
 
 	return true;
@@ -740,14 +790,14 @@ static ShaderInput *node_find_input_by_name(ShaderNode *node, BL::Node b_node, B
 {
 	string name = b_socket.name();
 	
-	if (node_use_modified_socket_name(node)) {
+	if(node_use_modified_socket_name(node)) {
 		BL::Node::inputs_iterator b_input;
 		bool found = false;
 		int counter = 0, total = 0;
 
-		for (b_node.inputs.begin(b_input); b_input != b_node.inputs.end(); ++b_input) {
-			if (b_input->name() == name) {
-				if (!found)
+		for(b_node.inputs.begin(b_input); b_input != b_node.inputs.end(); ++b_input) {
+			if(b_input->name() == name) {
+				if(!found)
 					counter++;
 				total++;
 			}
@@ -757,10 +807,10 @@ static ShaderInput *node_find_input_by_name(ShaderNode *node, BL::Node b_node, B
 		}
 
 		/* rename if needed */
-		if (name == "Shader")
+		if(name == "Shader")
 			name = "Closure";
 
-		if (total > 1)
+		if(total > 1)
 			name = string_printf("%s%d", name.c_str(), counter);
 	}
 
@@ -771,14 +821,14 @@ static ShaderOutput *node_find_output_by_name(ShaderNode *node, BL::Node b_node,
 {
 	string name = b_socket.name();
 
-	if (node_use_modified_socket_name(node)) {
+	if(node_use_modified_socket_name(node)) {
 		BL::Node::outputs_iterator b_output;
 		bool found = false;
 		int counter = 0, total = 0;
 
-		for (b_node.outputs.begin(b_output); b_output != b_node.outputs.end(); ++b_output) {
-			if (b_output->name() == name) {
-				if (!found)
+		for(b_node.outputs.begin(b_output); b_output != b_node.outputs.end(); ++b_output) {
+			if(b_output->name() == name) {
+				if(!found)
 					counter++;
 				total++;
 			}
@@ -788,18 +838,24 @@ static ShaderOutput *node_find_output_by_name(ShaderNode *node, BL::Node b_node,
 		}
 
 		/* rename if needed */
-		if (name == "Shader")
+		if(name == "Shader")
 			name = "Closure";
 
-		if (total > 1)
+		if(total > 1)
 			name = string_printf("%s%d", name.c_str(), counter);
 	}
 
 	return node->output(name.c_str());
 }
 
-static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, ShaderGraph *graph, BL::ShaderNodeTree b_ntree,
-                      const ProxyMap &proxy_input_map, const ProxyMap &proxy_output_map)
+static void add_nodes(Scene *scene,
+                      BL::RenderEngine b_engine,
+                      BL::BlendData b_data,
+                      BL::Scene b_scene,
+                      ShaderGraph *graph,
+                      BL::ShaderNodeTree b_ntree,
+                      const ProxyMap &proxy_input_map,
+                      const ProxyMap &proxy_output_map)
 {
 	/* add nodes */
 	BL::ShaderNodeTree::nodes_iterator b_node;
@@ -814,7 +870,7 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
 	BL::ShaderNode output_node(PointerRNA_NULL);
 
 	for(b_ntree.nodes.begin(b_node); b_node != b_ntree.nodes.end(); ++b_node) {
-		if (is_output_node(*b_node)) {
+		if(is_output_node(*b_node)) {
 			BL::ShaderNodeOutputMaterial b_output_node(*b_node);
 
 			if(b_output_node.is_active_output()) {
@@ -830,10 +886,10 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
 
 	/* add nodes */
 	for(b_ntree.nodes.begin(b_node); b_node != b_ntree.nodes.end(); ++b_node) {
-		if (b_node->mute() || b_node->is_a(&RNA_NodeReroute)) {
+		if(b_node->mute() || b_node->is_a(&RNA_NodeReroute)) {
 			/* replace muted node with internal links */
 			BL::Node::internal_links_iterator b_link;
-			for (b_node->internal_links.begin(b_link); b_link != b_node->internal_links.end(); ++b_link) {
+			for(b_node->internal_links.begin(b_link); b_link != b_node->internal_links.end(); ++b_link) {
 				ProxyNode *proxy = new ProxyNode(convert_socket_type(b_link->to_socket()));
 
 				input_map[b_link->from_socket().ptr.data] = proxy->inputs[0];
@@ -842,10 +898,10 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
 				graph->add(proxy);
 			}
 		}
-		else if (b_node->is_a(&RNA_ShaderNodeGroup) || b_node->is_a(&RNA_NodeCustomGroup)) {
+		else if(b_node->is_a(&RNA_ShaderNodeGroup) || b_node->is_a(&RNA_NodeCustomGroup)) {
 			
 			BL::ShaderNodeTree b_group_ntree(PointerRNA_NULL);
-			if (b_node->is_a(&RNA_ShaderNodeGroup))
+			if(b_node->is_a(&RNA_ShaderNodeGroup))
 				b_group_ntree = BL::ShaderNodeTree(((BL::NodeGroup)(*b_node)).node_tree());
 			else
 				b_group_ntree = BL::ShaderNodeTree(((BL::NodeCustomGroup)(*b_node)).node_tree());
@@ -864,7 +920,7 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
 
 				input_map[b_input->ptr.data] = proxy->inputs[0];
 
-				set_default_value(proxy->inputs[0], *b_node, *b_input, b_data, b_ntree);
+				set_default_value(proxy->inputs[0], *b_input, b_data, b_ntree);
 			}
 			for(b_node->outputs.begin(b_output); b_output != b_node->outputs.end(); ++b_output) {
 				ProxyNode *proxy = new ProxyNode(convert_socket_type(*b_output));
@@ -876,33 +932,41 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
 				output_map[b_output->ptr.data] = proxy->outputs[0];
 			}
 			
-			if (b_group_ntree)
-				add_nodes(scene, b_data, b_scene, graph, b_group_ntree, group_proxy_input_map, group_proxy_output_map);
+			if(b_group_ntree) {
+				add_nodes(scene,
+				          b_engine,
+				          b_data,
+				          b_scene,
+				          graph,
+				          b_group_ntree,
+				          group_proxy_input_map,
+				          group_proxy_output_map);
+			}
 		}
-		else if (b_node->is_a(&RNA_NodeGroupInput)) {
+		else if(b_node->is_a(&RNA_NodeGroupInput)) {
 			/* map each socket to a proxy node */
 			for(b_node->outputs.begin(b_output); b_output != b_node->outputs.end(); ++b_output) {
 				ProxyMap::const_iterator proxy_it = proxy_input_map.find(b_output->identifier());
-				if (proxy_it != proxy_input_map.end()) {
+				if(proxy_it != proxy_input_map.end()) {
 					ProxyNode *proxy = proxy_it->second;
 
 					output_map[b_output->ptr.data] = proxy->outputs[0];
 				}
 			}
 		}
-		else if (b_node->is_a(&RNA_NodeGroupOutput)) {
+		else if(b_node->is_a(&RNA_NodeGroupOutput)) {
 			BL::NodeGroupOutput b_output_node(*b_node);
 			/* only the active group output is used */
-			if (b_output_node.is_active_output()) {
+			if(b_output_node.is_active_output()) {
 				/* map each socket to a proxy node */
 				for(b_node->inputs.begin(b_input); b_input != b_node->inputs.end(); ++b_input) {
 					ProxyMap::const_iterator proxy_it = proxy_output_map.find(b_input->identifier());
-					if (proxy_it != proxy_output_map.end()) {
+					if(proxy_it != proxy_output_map.end()) {
 						ProxyNode *proxy = proxy_it->second;
 
 						input_map[b_input->ptr.data] = proxy->inputs[0];
 
-						set_default_value(proxy->inputs[0], *b_node, *b_input, b_data, b_ntree);
+						set_default_value(proxy->inputs[0], *b_input, b_data, b_ntree);
 					}
 				}
 			}
@@ -910,30 +974,36 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
 		else {
 			ShaderNode *node = NULL;
 
-			if (is_output_node(*b_node)) {
-				if (b_node->ptr.data == output_node.ptr.data) {
+			if(is_output_node(*b_node)) {
+				if(b_node->ptr.data == output_node.ptr.data) {
 					node = graph->output();
 				}
 			}
 			else {
-				node = add_node(scene, b_data, b_scene, graph, b_ntree, BL::ShaderNode(*b_node));
+				node = add_node(scene,
+				                b_engine,
+				                b_data,
+				                b_scene,
+				                graph,
+				                b_ntree,
+				                BL::ShaderNode(*b_node));
 			}
 
 			if(node) {
 				/* map node sockets for linking */
 				for(b_node->inputs.begin(b_input); b_input != b_node->inputs.end(); ++b_input) {
 					ShaderInput *input = node_find_input_by_name(node, *b_node, *b_input);
-					if (!input) {
+					if(!input) {
 						/* XXX should not happen, report error? */
 						continue;
 					}
 					input_map[b_input->ptr.data] = input;
 
-					set_default_value(input, *b_node, *b_input, b_data, b_ntree);
+					set_default_value(input, *b_input, b_data, b_ntree);
 				}
 				for(b_node->outputs.begin(b_output); b_output != b_node->outputs.end(); ++b_output) {
 					ShaderOutput *output = node_find_output_by_name(node, *b_node, *b_output);
-					if (!output) {
+					if(!output) {
 						/* XXX should not happen, report error? */
 						continue;
 					}
@@ -947,6 +1017,10 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
 	BL::NodeTree::links_iterator b_link;
 
 	for(b_ntree.links.begin(b_link); b_link != b_ntree.links.end(); ++b_link) {
+		/* Ignore invalid links to avoid unwanted cycles created in graph. */
+		if(!b_link->is_valid()) {
+			continue;
+		}
 		/* get blender link data */
 		BL::NodeSocket b_from_sock = b_link->from_socket();
 		BL::NodeSocket b_to_sock = b_link->to_socket();
@@ -955,10 +1029,10 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
 		ShaderInput *input = 0;
 
 		PtrOutputMap::iterator output_it = output_map.find(b_from_sock.ptr.data);
-		if (output_it != output_map.end())
+		if(output_it != output_map.end())
 			output = output_it->second;
 		PtrInputMap::iterator input_it = input_map.find(b_to_sock.ptr.data);
-		if (input_it != input_map.end())
+		if(input_it != input_map.end())
 			input = input_it->second;
 
 		/* either node may be NULL when the node was not exported, typically
@@ -968,10 +1042,22 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
 	}
 }
 
-static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, ShaderGraph *graph, BL::ShaderNodeTree b_ntree)
+static void add_nodes(Scene *scene,
+                      BL::RenderEngine b_engine,
+                      BL::BlendData b_data,
+                      BL::Scene b_scene,
+                      ShaderGraph *graph,
+                      BL::ShaderNodeTree b_ntree)
 {
 	static const ProxyMap empty_proxy_map;
-	add_nodes(scene, b_data, b_scene, graph, b_ntree, empty_proxy_map, empty_proxy_map);
+	add_nodes(scene,
+	          b_engine,
+	          b_data,
+	          b_scene,
+	          graph,
+	          b_ntree,
+	          empty_proxy_map,
+	          empty_proxy_map);
 }
 
 /* Sync Materials */
@@ -997,7 +1083,7 @@ void BlenderSync::sync_materials(bool update_all)
 			if(b_mat->use_nodes() && b_mat->node_tree()) {
 				BL::ShaderNodeTree b_ntree(b_mat->node_tree());
 
-				add_nodes(scene, b_data, b_scene, graph, b_ntree);
+				add_nodes(scene, b_engine, b_data, b_scene, graph, b_ntree);
 			}
 			else {
 				ShaderNode *closure, *out;
@@ -1014,7 +1100,8 @@ void BlenderSync::sync_materials(bool update_all)
 			shader->use_mis = get_boolean(cmat, "sample_as_light");
 			shader->use_transparent_shadow = get_boolean(cmat, "use_transparent_shadow");
 			shader->heterogeneous_volume = !get_boolean(cmat, "homogeneous_volume");
-			shader->volume_sampling_method = RNA_enum_get(&cmat, "volume_sampling");
+			shader->volume_sampling_method = (VolumeSampling)RNA_enum_get(&cmat, "volume_sampling");
+			shader->volume_interpolation_method = (VolumeInterpolation)RNA_enum_get(&cmat, "volume_interpolation");
 
 			shader->set_graph(graph);
 			shader->tag_update(scene);
@@ -1039,12 +1126,13 @@ void BlenderSync::sync_world(bool update_all)
 		if(b_world && b_world.use_nodes() && b_world.node_tree()) {
 			BL::ShaderNodeTree b_ntree(b_world.node_tree());
 
-			add_nodes(scene, b_data, b_scene, graph, b_ntree);
+			add_nodes(scene, b_engine, b_data, b_scene, graph, b_ntree);
 
 			/* volume */
 			PointerRNA cworld = RNA_pointer_get(&b_world.ptr, "cycles");
 			shader->heterogeneous_volume = !get_boolean(cworld, "homogeneous_volume");
-			shader->volume_sampling_method = RNA_enum_get(&cworld, "volume_sampling");
+			shader->volume_sampling_method = (VolumeSampling)RNA_enum_get(&cworld, "volume_sampling");
+			shader->volume_interpolation_method = (VolumeInterpolation)RNA_enum_get(&cworld, "volume_interpolation");
 		}
 		else if(b_world) {
 			ShaderNode *closure, *out;
@@ -1088,7 +1176,7 @@ void BlenderSync::sync_world(bool update_all)
 	PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
 
 	/* when doing preview render check for BI's transparency settings,
-	 * this is so because bledner's preview render routines are not able
+	 * this is so because Blender's preview render routines are not able
 	 * to tweak all cycles's settings depending on different circumstances
 	 */
 	if(b_engine.is_preview() == false)
@@ -1124,7 +1212,7 @@ void BlenderSync::sync_lamps(bool update_all)
 
 				BL::ShaderNodeTree b_ntree(b_lamp->node_tree());
 
-				add_nodes(scene, b_data, b_scene, graph, b_ntree);
+				add_nodes(scene, b_engine, b_data, b_scene, graph, b_ntree);
 			}
 			else {
 				ShaderNode *closure, *out;
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 2ac90b34fd7..aed1b6de138 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "background.h"
@@ -30,11 +30,13 @@
 #include "device.h"
 
 #include "blender_sync.h"
+#include "blender_session.h"
 #include "blender_util.h"
 
 #include "util_debug.h"
 #include "util_foreach.h"
 #include "util_opengl.h"
+#include "util_hash.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -70,10 +72,18 @@ bool BlenderSync::sync_recalc()
 	 * so we can do it later on if doing it immediate is not suitable */
 
 	BL::BlendData::materials_iterator b_mat;
-
-	for(b_data.materials.begin(b_mat); b_mat != b_data.materials.end(); ++b_mat)
-		if(b_mat->is_updated() || (b_mat->node_tree() && b_mat->node_tree().is_updated()))
+	bool has_updated_objects = b_data.objects.is_updated();
+	for(b_data.materials.begin(b_mat); b_mat != b_data.materials.end(); ++b_mat) {
+		if(b_mat->is_updated() || (b_mat->node_tree() && b_mat->node_tree().is_updated())) {
 			shader_map.set_recalc(*b_mat);
+		}
+		else {
+			Shader *shader = shader_map.find(*b_mat);
+			if(has_updated_objects && shader != NULL && shader->has_object_dependency) {
+				shader_map.set_recalc(*b_mat);
+			}
+		}
+	}
 
 	BL::BlendData::lamps_iterator b_lamp;
 
@@ -102,7 +112,7 @@ bool BlenderSync::sync_recalc()
 		
 		if(b_ob->is_updated_data()) {
 			BL::Object::particle_systems_iterator b_psys;
-			for (b_ob->particle_systems.begin(b_psys); b_psys != b_ob->particle_systems.end(); ++b_psys)
+			for(b_ob->particle_systems.begin(b_psys); b_psys != b_ob->particle_systems.end(); ++b_psys)
 				particle_system_map.set_recalc(*b_ob);
 		}
 	}
@@ -135,18 +145,28 @@ bool BlenderSync::sync_recalc()
 	return recalc;
 }
 
-void BlenderSync::sync_data(BL::SpaceView3D b_v3d, BL::Object b_override, void **python_thread_state, const char *layer)
+void BlenderSync::sync_data(BL::RenderSettings b_render,
+                            BL::SpaceView3D b_v3d,
+                            BL::Object b_override,
+                            int width, int height,
+                            void **python_thread_state,
+                            const char *layer)
 {
 	sync_render_layers(b_v3d, layer);
 	sync_integrator();
 	sync_film();
 	sync_shaders();
+	sync_images();
 	sync_curve_settings();
 
 	mesh_synced.clear(); /* use for objects and motion sync */
 
 	sync_objects(b_v3d);
-	sync_motion(b_v3d, b_override, python_thread_state);
+	sync_motion(b_render,
+	            b_v3d,
+	            b_override,
+	            width, height,
+	            python_thread_state);
 
 	mesh_synced.clear();
 }
@@ -185,6 +205,9 @@ void BlenderSync::sync_integrator()
 	integrator->filter_glossy = get_float(cscene, "blur_glossy");
 
 	integrator->seed = get_int(cscene, "seed");
+	if(get_boolean(cscene, "use_animated_seed"))
+		integrator->seed = hash_int_2d(b_scene.frame_current(), get_int(cscene, "seed"));
+
 	integrator->sampling_pattern = (SamplingPattern)RNA_enum_get(&cscene, "sampling_pattern");
 
 	integrator->layer_flag = render_layer.layer;
@@ -351,9 +374,42 @@ void BlenderSync::sync_render_layers(BL::SpaceView3D b_v3d, const char *layer)
 	}
 }
 
+/* Images */
+void BlenderSync::sync_images()
+{
+	/* "Sync" is this API's naming convention; currently this only frees unused image buffers. */
+
+	const bool is_interface_locked = b_engine.render() &&
+	                                 b_engine.render().use_lock_interface();
+	if(is_interface_locked == false && BlenderSession::headless == false) {
+		/* If the interface is not locked, the image may still be needed
+		 * for display.
+		 */
+		return;
+	}
+	/* Free buffers used by images which are not needed for render. */
+	BL::BlendData::images_iterator b_image;
+	for(b_data.images.begin(b_image);
+	    b_image != b_data.images.end();
+	    ++b_image)
+	{
+		/* TODO(sergey): Consider making this a utility function to check
+		 * whether an image is considered builtin.
+		 */
+		const bool is_builtin = b_image->packed_file() ||
+		                        b_image->source() == BL::Image::source_GENERATED ||
+		                        b_image->source() == BL::Image::source_MOVIE ||
+		                        b_engine.is_preview();
+		if(is_builtin == false) {
+			b_image->buffers_free();
+		}
+		/* TODO(sergey): Free builtin images not used by any shader. */
+	}
+}
+
 /* Scene Parameters */
 
-SceneParams BlenderSync::get_scene_params(BL::Scene b_scene, bool background)
+SceneParams BlenderSync::get_scene_params(BL::Scene b_scene, bool background, bool is_cpu)
 {
 	BL::RenderSettings r = b_scene.render();
 	SceneParams params;
@@ -371,13 +427,22 @@ SceneParams BlenderSync::get_scene_params(BL::Scene b_scene, bool background)
 		params.bvh_type = (SceneParams::BVHType)RNA_enum_get(&cscene, "debug_bvh_type");
 
 	params.use_bvh_spatial_split = RNA_boolean_get(&cscene, "debug_use_spatial_splits");
-	params.use_bvh_cache = (background)? RNA_boolean_get(&cscene, "use_cache"): false;
 
 	if(background && params.shadingsystem != SHADINGSYSTEM_OSL)
 		params.persistent_data = r.use_persistent_data();
 	else
 		params.persistent_data = false;
 
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+	if(is_cpu) {
+		params.use_qbvh = system_cpu_support_sse2();
+	}
+	else
+#endif
+	{
+		params.use_qbvh = false;
+	}
+
 	return params;
 }
 
@@ -389,7 +454,10 @@ bool BlenderSync::get_session_pause(BL::Scene b_scene, bool background)
 	return (background)? false: get_boolean(cscene, "preview_pause");
 }
 
-SessionParams BlenderSync::get_session_params(BL::RenderEngine b_engine, BL::UserPreferences b_userpref, BL::Scene b_scene, bool background)
+SessionParams BlenderSync::get_session_params(BL::RenderEngine b_engine,
+                                              BL::UserPreferences b_userpref,
+                                              BL::Scene b_scene,
+                                              bool background)
 {
 	SessionParams params;
 	PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
@@ -478,8 +546,13 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine b_engine, BL::Use
 
 		params.tile_size = make_int2(tile_x, tile_y);
 	}
-	
-	params.tile_order = (TileOrder)RNA_enum_get(&cscene, "tile_order");
+
+	if(BlenderSession::headless == false) {
+		params.tile_order = (TileOrder)RNA_enum_get(&cscene, "tile_order");
+	}
+	else {
+		params.tile_order = TILE_BOTTOM_TO_TOP;
+	}
 
 	params.start_resolution = get_int(cscene, "preview_start_resolution");
 
@@ -515,7 +588,24 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine b_engine, BL::Use
 		params.shadingsystem = SHADINGSYSTEM_OSL;
 	
 	/* color managagement */
-	params.display_buffer_linear = GLEW_ARB_half_float_pixel && b_engine.support_display_space_shader(b_scene);
+#ifdef GLEW_MX
+	/* When using GLEW MX we need to check whether we've got an OpenGL
+	 * context for the current window. This is because command line
+	 * rendering does not actually have an OpenGL context.
+	 */
+	if(glewGetContext() != NULL)
+#endif
+	{
+		params.display_buffer_linear = GLEW_ARB_half_float_pixel &&
+		                               b_engine.support_display_space_shader(b_scene);
+	}
+
+	if(b_engine.is_preview()) {
+		/* For preview rendering we're using same timeout as
+		 * blender's job update.
+		 */
+		params.progressive_update_timeout = 0.1;
+	}
 
 	return params;
 }
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index bb8e9c31b1d..e99a64bd23e 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __BLENDER_SYNC_H__
@@ -53,7 +53,12 @@ public:
 
 	/* sync */
 	bool sync_recalc();
-	void sync_data(BL::SpaceView3D b_v3d, BL::Object b_override, void **python_thread_state, const char *layer = 0);
+	void sync_data(BL::RenderSettings b_render,
+	               BL::SpaceView3D b_v3d,
+	               BL::Object b_override,
+	               int width, int height,
+	               void **python_thread_state,
+	               const char *layer = 0);
 	void sync_render_layers(BL::SpaceView3D b_v3d, const char *layer);
 	void sync_integrator();
 	void sync_camera(BL::RenderSettings b_render, BL::Object b_override, int width, int height);
@@ -64,17 +69,24 @@ public:
 	int get_layer_bound_samples() { return render_layer.bound_samples; }
 
 	/* get parameters */
-	static SceneParams get_scene_params(BL::Scene b_scene, bool background);
-	static SessionParams get_session_params(BL::RenderEngine b_engine, BL::UserPreferences b_userpref, BL::Scene b_scene, bool background);
+	static SceneParams get_scene_params(BL::Scene b_scene, bool background, bool is_cpu);
+	static SessionParams get_session_params(BL::RenderEngine b_engine,
+	                                        BL::UserPreferences b_userpref,
+	                                        BL::Scene b_scene,
+	                                        bool background);
 	static bool get_session_pause(BL::Scene b_scene, bool background);
-	static BufferParams get_buffer_params(BL::RenderSettings b_render, BL::Scene b_scene, BL::SpaceView3D b_v3d, BL::RegionView3D b_rv3d, Camera *cam, int width, int height);
+	static BufferParams get_buffer_params(BL::RenderSettings b_render, BL::SpaceView3D b_v3d, BL::RegionView3D b_rv3d, Camera *cam, int width, int height);
 
 private:
 	/* sync */
 	void sync_lamps(bool update_all);
 	void sync_materials(bool update_all);
 	void sync_objects(BL::SpaceView3D b_v3d, float motion_time = 0.0f);
-	void sync_motion(BL::SpaceView3D b_v3d, BL::Object b_override, void **python_thread_state);
+	void sync_motion(BL::RenderSettings b_render,
+	                 BL::SpaceView3D b_v3d,
+	                 BL::Object b_override,
+	                 int width, int height,
+	                 void **python_thread_state);
 	void sync_film();
 	void sync_view();
 	void sync_world(bool update_all);
@@ -84,16 +96,30 @@ private:
 	void sync_nodes(Shader *shader, BL::ShaderNodeTree b_ntree);
 	Mesh *sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tris);
 	void sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, bool motion, int time_index = 0);
-	Object *sync_object(BL::Object b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::DupliObject b_dupli_ob,
-	                                 Transform& tfm, uint layer_flag, float motion_time, bool hide_tris);
-	void sync_light(BL::Object b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::Object b_ob, Transform& tfm);
-	void sync_background_light();
+	Object *sync_object(BL::Object b_parent,
+	                    int persistent_id[OBJECT_PERSISTENT_ID_SIZE],
+	                    BL::DupliObject b_dupli_ob,
+	                    Transform& tfm,
+	                    uint layer_flag,
+	                    float motion_time,
+	                    bool hide_tris,
+	                    bool use_camera_cull,
+	                    float camera_cull_margin,
+	                    bool *use_portal);
+	void sync_light(BL::Object b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::Object b_ob, Transform& tfm, bool *use_portal);
+	void sync_background_light(bool use_portal);
 	void sync_mesh_motion(BL::Object b_ob, Object *object, float motion_time);
-	void sync_camera_motion(BL::Object b_ob, float motion_time);
+	void sync_camera_motion(BL::RenderSettings b_render,
+	                        BL::Object b_ob,
+	                        int width, int height,
+	                        float motion_time);
 
 	/* particles */
 	bool sync_dupli_particle(BL::Object b_ob, BL::DupliObject b_dup, Object *object);
 
+	/* Images. */
+	void sync_images();
+
 	/* util */
 	void find_shader(BL::ID id, vector<uint>& used_shaders, int default_shader);
 	bool BKE_object_is_modified(BL::Object b_ob);
diff --git a/intern/cycles/blender/blender_texture.cpp b/intern/cycles/blender/blender_texture.cpp
new file mode 100644
index 00000000000..cb4dd1792d0
--- /dev/null
+++ b/intern/cycles/blender/blender_texture.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "blender_texture.h"
+
+CCL_NAMESPACE_BEGIN
+
+namespace {
+
+/* Point density helpers. */
+
+static void density_texture_space_invert(float3& loc,
+                                         float3& size)
+{
+	if(size.x != 0.0f) size.x = 0.5f/size.x;
+	if(size.y != 0.0f) size.y = 0.5f/size.y;
+	if(size.z != 0.0f) size.z = 0.5f/size.z;
+
+	loc = loc*size - make_float3(0.5f, 0.5f, 0.5f);
+}
+
+static void density_object_texture_space(BL::Object b_ob,
+                                         float radius,
+                                         float3& loc,
+                                         float3& size)
+{
+	if(b_ob.type() == BL::Object::type_MESH) {
+		BL::Mesh b_mesh(b_ob.data());
+		loc = get_float3(b_mesh.texspace_location());
+		size = get_float3(b_mesh.texspace_size());
+	}
+	else {
+		/* TODO(sergey): Not supported currently. */
+	}
+	/* Adjust texture space to include density points on the boundaries. */
+	size = size + make_float3(radius, radius, radius);
+	density_texture_space_invert(loc, size);
+}
+
+static void density_particle_system_texture_space(
+        BL::Object b_ob,
+        BL::ParticleSystem b_particle_system,
+        float radius,
+        float3& loc,
+        float3& size)
+{
+	if(b_particle_system.settings().type() == BL::ParticleSettings::type_HAIR) {
+		/* TODO(sergey): Not supported currently. */
+		return;
+	}
+	Transform tfm = get_transform(b_ob.matrix_world());
+	Transform itfm = transform_inverse(tfm);
+	float3 min = make_float3(FLT_MAX, FLT_MAX, FLT_MAX),
+	       max = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
+	float3 particle_size = make_float3(radius, radius, radius);
+	for(int i = 0; i < b_particle_system.particles.length(); ++i) {
+		BL::Particle particle = b_particle_system.particles[i];
+		float3 location = get_float3(particle.location());
+		location = transform_point(&itfm, location);
+		min = ccl::min(min, location - particle_size);
+		max = ccl::max(max, location + particle_size);
+	}
+	/* Calculate texture space from the particle bounds.  */
+	loc = (min + max) * 0.5f;
+	size = (max - min) * 0.5f;
+	density_texture_space_invert(loc, size);
+}
+
+}  /* namespace */
+
+void point_density_texture_space(BL::ShaderNodeTexPointDensity b_point_density_node,
+                                 float3& loc,
+                                 float3& size)
+{
+	/* Fallback values. */
+	loc = make_float3(0.0f, 0.0f, 0.0f);
+	size = make_float3(0.0f, 0.0f, 0.0f);
+	BL::Object b_ob(b_point_density_node.object());
+	if(!b_ob) {
+		return;
+	}
+	if(b_point_density_node.point_source() ==
+	   BL::ShaderNodeTexPointDensity::point_source_PARTICLE_SYSTEM)
+	{
+		BL::ParticleSystem b_particle_system(
+		        b_point_density_node.particle_system());
+		if(b_particle_system) {
+			density_particle_system_texture_space(b_ob,
+			                                      b_particle_system,
+			                                      b_point_density_node.radius(),
+			                                      loc,
+			                                      size);
+		}
+	}
+	else {
+		density_object_texture_space(b_ob,
+		                             b_point_density_node.radius(),
+		                             loc,
+		                             size);
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/blackbody.h b/intern/cycles/blender/blender_texture.h
index c3be0ebdf30..74fbca02a9e 100644
--- a/intern/cycles/render/blackbody.h
+++ b/intern/cycles/blender/blender_texture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011-2013 Blender Foundation
+ * Copyright 2011-2015 Blender Foundation
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -11,18 +11,21 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
-#ifndef __BLACKBODY_H__
-#define __BLACKBODY_H__
+#ifndef __BLENDER_TEXTURE_H__
+#define __BLENDER_TEXTURE_H__
 
-#include "util_vector.h"
+#include <stdlib.h>
+#include "blender_sync.h"
 
 CCL_NAMESPACE_BEGIN
 
-vector<float> blackbody_table();
+void point_density_texture_space(BL::ShaderNodeTexPointDensity b_point_density_node,
+                                 float3& loc,
+                                 float3& size);
 
 CCL_NAMESPACE_END
 
-#endif /* __BLACKBODY_H__ */
+#endif  /* __BLENDER_TEXTURE_H__ */
diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h
index 35e417d8069..165242d0dff 100644
--- a/intern/cycles/blender/blender_util.h
+++ b/intern/cycles/blender/blender_util.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __BLENDER_UTIL_H__
@@ -28,7 +28,7 @@
  * todo: clean this up ... */
 
 extern "C" {
-void BLI_timestr(double _time, char *str, size_t maxlen);
+size_t BLI_timecode_string_from_time_simple(char *str, size_t maxlen, double time_seconds);
 void BKE_image_user_frame_calc(void *iuser, int cfra, int fieldnr);
 void BKE_image_user_file_path(void *iuser, void *ima, char *path);
 unsigned char *BKE_image_get_pixels_for_frame(void *image, int frame);
@@ -43,11 +43,11 @@ void python_thread_state_restore(void **python_thread_state);
 static inline BL::Mesh object_to_mesh(BL::BlendData data, BL::Object object, BL::Scene scene, bool apply_modifiers, bool render, bool calc_undeformed)
 {
 	BL::Mesh me = data.meshes.new_from_object(scene, object, apply_modifiers, (render)? 2: 1, false, calc_undeformed);
-	if ((bool)me) {
-		if (me.use_auto_smooth()) {
-			me.calc_normals_split(me.auto_smooth_angle());
+	if((bool)me) {
+		if(me.use_auto_smooth()) {
+			me.calc_normals_split();
 		}
-		me.calc_tessface();
+		me.calc_tessface(true);
 	}
 	return me;
 }
@@ -310,7 +310,7 @@ static inline string get_string(PointerRNA& ptr, const char *name)
 	char cstrbuf[1024];
 	char *cstr = RNA_string_get_alloc(&ptr, name, cstrbuf, sizeof(cstrbuf));
 	string str(cstr);
-	if (cstr != cstrbuf)
+	if(cstr != cstrbuf)
 		MEM_freeN(cstr);
 	
 	return str;
@@ -354,11 +354,20 @@ static inline void mesh_texture_space(BL::Mesh b_mesh, float3& loc, float3& size
 }
 
 /* object used for motion blur */
-static inline bool object_use_motion(BL::Object b_ob)
+static inline bool object_use_motion(BL::Object b_parent, BL::Object b_ob)
 {
 	PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles");
 	bool use_motion = get_boolean(cobject, "use_motion_blur");
-	
+	/* If motion blur is enabled for the object, we also check
+	 * whether it's enabled for the parent object.
+	 *
+	 * This makes it much easier to control motion blur from the
+	 * dupligroup duplicator.
+	 */
+	if(use_motion && b_parent.ptr.data != b_ob.ptr.data) {
+		PointerRNA parent_cobject = RNA_pointer_get(&b_parent.ptr, "cycles");
+		use_motion &= get_boolean(parent_cobject, "use_motion_blur");
+	}
 	return use_motion;
 }
 
@@ -375,11 +384,20 @@ static inline uint object_motion_steps(BL::Object b_ob)
 }
 
 /* object uses deformation motion blur */
-static inline bool object_use_deform_motion(BL::Object b_ob)
+static inline bool object_use_deform_motion(BL::Object b_parent, BL::Object b_ob)
 {
 	PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles");
 	bool use_deform_motion = get_boolean(cobject, "use_deform_motion");
-	
+	/* If deformation motion blur is enabled for the object, we also
+	 * check whether it's enabled for the parent object.
+	 *
+	 * This makes it much easier to control deformation motion blur
+	 * from the dupligroup duplicator.
+	 */
+	if(use_deform_motion && b_parent.ptr.data != b_ob.ptr.data) {
+		PointerRNA parent_cobject = RNA_pointer_get(&b_parent.ptr, "cycles");
+		use_deform_motion &= get_boolean(parent_cobject, "use_deform_motion");
+	}
 	return use_deform_motion;
 }
 
@@ -388,7 +406,7 @@ static inline BL::SmokeDomainSettings object_smoke_domain_find(BL::Object b_ob)
 	BL::Object::modifiers_iterator b_mod;
 
 	for(b_ob.modifiers.begin(b_mod); b_mod != b_ob.modifiers.end(); ++b_mod) {
-		if (b_mod->is_a(&RNA_SmokeModifier)) {
+		if(b_mod->is_a(&RNA_SmokeModifier)) {
 			BL::SmokeModifier b_smd(*b_mod);
 
 			if(b_smd.smoke_type() == BL::SmokeModifier::smoke_type_DOMAIN)
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 15bd814b8d5..4a5f8b1bda6 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -25,9 +25,9 @@
 #include "bvh_node.h"
 #include "bvh_params.h"
 
-#include "util_cache.h"
 #include "util_debug.h"
 #include "util_foreach.h"
+#include "util_logging.h"
 #include "util_map.h"
 #include "util_progress.h"
 #include "util_system.h"
@@ -69,131 +69,19 @@ BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects)
 		return new RegularBVH(params, objects);
 }
 
-/* Cache */
-
-bool BVH::cache_read(CacheData& key)
-{
-	key.add(system_cpu_bits());
-	key.add(&params, sizeof(params));
-
-	foreach(Object *ob, objects) {
-		Mesh *mesh = ob->mesh;
-
-		key.add(mesh->verts);
-		key.add(mesh->triangles);
-		key.add(mesh->curve_keys);
-		key.add(mesh->curves);
-		key.add(&ob->bounds, sizeof(ob->bounds));
-		key.add(&ob->visibility, sizeof(ob->visibility));
-		key.add(&mesh->transform_applied, sizeof(bool));
-
-		if(mesh->use_motion_blur) {
-			Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-			if(attr)
-				key.add(attr->buffer);
-
-			attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-			if(attr)
-				key.add(attr->buffer);
-		}
-	}
-
-	CacheData value;
-
-	if(Cache::global.lookup(key, value)) {
-		cache_filename = key.get_filename();
-
-		if(!(value.read(pack.root_index) &&
-		     value.read(pack.SAH) &&
-		     value.read(pack.nodes) &&
-		     value.read(pack.object_node) &&
-		     value.read(pack.tri_woop) &&
-		     value.read(pack.prim_type) &&
-		     value.read(pack.prim_visibility) &&
-		     value.read(pack.prim_index) &&
-		     value.read(pack.prim_object) &&
-		     value.read(pack.is_leaf)))
-		{
-			/* Clear the pack if load failed. */
-			pack.root_index = 0;
-			pack.SAH = 0.0f;
-			pack.nodes.clear();
-			pack.object_node.clear();
-			pack.tri_woop.clear();
-			pack.prim_type.clear();
-			pack.prim_visibility.clear();
-			pack.prim_index.clear();
-			pack.prim_object.clear();
-			pack.is_leaf.clear();
-			return false;
-		}
-		return true;
-	}
-
-	return false;
-}
-
-void BVH::cache_write(CacheData& key)
-{
-	CacheData value;
-
-	value.add(pack.root_index);
-	value.add(pack.SAH);
-
-	value.add(pack.nodes);
-	value.add(pack.object_node);
-	value.add(pack.tri_woop);
-	value.add(pack.prim_type);
-	value.add(pack.prim_visibility);
-	value.add(pack.prim_index);
-	value.add(pack.prim_object);
-	value.add(pack.is_leaf);
-
-	Cache::global.insert(key, value);
-
-	cache_filename = key.get_filename();
-}
-
-void BVH::clear_cache_except()
-{
-	set<string> except;
-
-	if(!cache_filename.empty())
-		except.insert(cache_filename);
-
-	foreach(Object *ob, objects) {
-		Mesh *mesh = ob->mesh;
-		BVH *bvh = mesh->bvh;
-
-		if(bvh && !bvh->cache_filename.empty())
-			except.insert(bvh->cache_filename);
-	}
-
-	Cache::global.clear_except("bvh", except);
-}
-
 /* Building */
 
 void BVH::build(Progress& progress)
 {
 	progress.set_substatus("Building BVH");
 
-	/* cache read */
-	CacheData key("bvh");
-
-	if(params.use_cache) {
-		progress.set_substatus("Looking in BVH cache");
-
-		if(cache_read(key))
-			return;
-	}
-
 	/* build nodes */
-	vector<int> prim_type;
-	vector<int> prim_index;
-	vector<int> prim_object;
-
-	BVHBuild bvh_build(objects, prim_type, prim_index, prim_object, params, progress);
+	BVHBuild bvh_build(objects,
+	                   pack.prim_type,
+	                   pack.prim_index,
+	                   pack.prim_object,
+	                   params,
+	                   progress);
 	BVHNode *root = bvh_build.run();
 
 	if(progress.get_cancel()) {
@@ -201,11 +89,6 @@ void BVH::build(Progress& progress)
 		return;
 	}
 
-	/* todo: get rid of this copy */
-	pack.prim_type = prim_type;
-	pack.prim_index = prim_index;
-	pack.prim_object = prim_object;
-
 	/* compute SAH */
 	if(!params.top_level)
 		pack.SAH = root->computeSubtreeSAHCost(params);
@@ -226,23 +109,10 @@ void BVH::build(Progress& progress)
 
 	/* pack nodes */
 	progress.set_substatus("Packing BVH nodes");
-	array<int> tmp_prim_object = pack.prim_object;
-	pack_nodes(tmp_prim_object, root);
-	
+	pack_nodes(root);
+
 	/* free build nodes */
 	root->deleteSubtree();
-
-	if(progress.get_cancel()) return;
-
-	/* cache write */
-	if(params.use_cache) {
-		progress.set_substatus("Writing BVH cache");
-		cache_write(key);
-
-		/* clear other bvh files from cache */
-		if(params.top_level)
-			clear_cache_except();
-	}
 }
 
 /* Refitting */
@@ -263,11 +133,9 @@ void BVH::refit(Progress& progress)
 void BVH::pack_triangle(int idx, float4 woop[3])
 {
 	int tob = pack.prim_object[idx];
+	assert(tob >= 0 && tob < objects.size());
 	const Mesh *mesh = objects[tob]->mesh;
 
-	if(mesh->has_motion_blur())
-		return;
-
 	int tidx = pack.prim_index[idx];
 	const int *vidx = mesh->triangles[tidx].v;
 	const float3* vpos = &mesh->verts[0];
@@ -275,68 +143,13 @@ void BVH::pack_triangle(int idx, float4 woop[3])
 	float3 v1 = vpos[vidx[1]];
 	float3 v2 = vpos[vidx[2]];
 
-	float3 r0 = v0 - v2;
-	float3 r1 = v1 - v2;
-	float3 r2 = cross(r0, r1);
-
-	if(is_zero(r0) || is_zero(r1) || is_zero(r2)) {
-		/* degenerate */
-		woop[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-		woop[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-		woop[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-	}
-	else {
-		Transform t = make_transform(
-			r0.x, r1.x, r2.x, v2.x,
-			r0.y, r1.y, r2.y, v2.y,
-			r0.z, r1.z, r2.z, v2.z,
-			0.0f, 0.0f, 0.0f, 1.0f);
-
-		t = transform_inverse(t);
-
-		woop[0] = make_float4(t.z.x, t.z.y, t.z.z, -t.z.w);
-		woop[1] = make_float4(t.x.x, t.x.y, t.x.z, t.x.w);
-		woop[2] = make_float4(t.y.x, t.y.y, t.y.z, t.y.w);
-	}
+	woop[0] = float3_to_float4(v0);
+	woop[1] = float3_to_float4(v1);
+	woop[2] = float3_to_float4(v2);
 }
 
 /* Curves*/
 
-void BVH::pack_curve_segment(int idx, float4 woop[3])
-{
-	int tob = pack.prim_object[idx];
-	const Mesh *mesh = objects[tob]->mesh;
-	int tidx = pack.prim_index[idx];
-	int segment = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[idx]);
-	int k0 = mesh->curves[tidx].first_key + segment;
-	int k1 = mesh->curves[tidx].first_key + segment + 1;
-	float3 v0 = float4_to_float3(mesh->curve_keys[k0]);
-	float3 v1 = float4_to_float3(mesh->curve_keys[k1]);
-
-	float3 d0 = v1 - v0;
-	float l =  len(d0);
-	
-	/*Plan
-	*Transform tfm = make_transform(
-	*	location <3>    , l,
-	*	extra curve data <3>    , StrID,
-	*	nextkey, flags/tip?,    0, 0);
-	*/
-	float3 tg0 = make_float3(1.0f, 0.0f, 0.0f);
-	float3 tg1 = make_float3(1.0f, 0.0f, 0.0f);
-	
-	Transform tfm = make_transform(
-		tg0.x, tg0.y, tg0.z, l,
-		tg1.x, tg1.y, tg1.z, 0,
-		0, 0, 0, 0,
-		0, 0, 0, 1);
-
-	woop[0] = tfm.x;
-	woop[1] = tfm.y;
-	woop[2] = tfm.z;
-
-}
-
 void BVH::pack_primitives()
 {
 	int nsize = TRI_NODE_SIZE;
@@ -351,11 +164,14 @@ void BVH::pack_primitives()
 		if(pack.prim_index[i] != -1) {
 			float4 woop[3];
 
-			if(pack.prim_type[i] & PRIMITIVE_ALL_CURVE)
-				pack_curve_segment(i, woop);
-			else
+			if(pack.prim_type[i] & PRIMITIVE_TRIANGLE) {
 				pack_triangle(i, woop);
-			
+			}
+			else {
+				/* Avoid use of uninitialized memory. */
+				memset(&woop, 0, sizeof(woop));
+			}
+
 			memcpy(&pack.tri_woop[i * nsize], woop, sizeof(float4)*3);
 
 			int tob = pack.prim_object[i];
@@ -374,13 +190,14 @@ void BVH::pack_primitives()
 
 /* Pack Instances */
 
-void BVH::pack_instances(size_t nodes_size)
+void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 {
 	/* The BVH's for instances are built separately, but for traversal all
 	 * BVH's are stored in global arrays. This function merges them into the
 	 * top level BVH, adjusting indexes and offsets where appropriate. */
 	bool use_qbvh = params.use_qbvh;
 	size_t nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE;
+	size_t nsize_leaf = (use_qbvh)? BVH_QNODE_LEAF_SIZE: BVH_NODE_LEAF_SIZE;
 
 	/* adjust primitive index to point to the triangle in the global array, for
 	 * meshes with transform applied and already in the top level BVH */
@@ -395,6 +212,7 @@ void BVH::pack_instances(size_t nodes_size)
 	/* track offsets of instanced BVH data in global array */
 	size_t prim_offset = pack.prim_index.size();
 	size_t nodes_offset = nodes_size;
+	size_t nodes_leaf_offset = leaf_nodes_size;
 
 	/* clear array that gives the node indexes for instanced objects */
 	pack.object_node.clear();
@@ -406,6 +224,7 @@ void BVH::pack_instances(size_t nodes_size)
 	size_t pack_prim_index_offset = prim_index_size;
 	size_t pack_tri_woop_offset = tri_woop_size;
 	size_t pack_nodes_offset = nodes_size;
+	size_t pack_leaf_nodes_offset = leaf_nodes_size;
 	size_t object_offset = 0;
 
 	map<Mesh*, int> mesh_map;
@@ -418,7 +237,8 @@ void BVH::pack_instances(size_t nodes_size)
 			if(mesh_map.find(mesh) == mesh_map.end()) {
 				prim_index_size += bvh->pack.prim_index.size();
 				tri_woop_size += bvh->pack.tri_woop.size();
-				nodes_size += bvh->pack.nodes.size()*nsize;
+				nodes_size += bvh->pack.nodes.size();
+				leaf_nodes_size += bvh->pack.leaf_nodes.size();
 
 				mesh_map[mesh] = 1;
 			}
@@ -433,6 +253,7 @@ void BVH::pack_instances(size_t nodes_size)
 	pack.prim_visibility.resize(prim_index_size);
 	pack.tri_woop.resize(tri_woop_size);
 	pack.nodes.resize(nodes_size);
+	pack.leaf_nodes.resize(leaf_nodes_size);
 	pack.object_node.resize(objects.size());
 
 	int *pack_prim_index = (pack.prim_index.size())? &pack.prim_index[0]: NULL;
@@ -441,6 +262,7 @@ void BVH::pack_instances(size_t nodes_size)
 	uint *pack_prim_visibility = (pack.prim_visibility.size())? &pack.prim_visibility[0]: NULL;
 	float4 *pack_tri_woop = (pack.tri_woop.size())? &pack.tri_woop[0]: NULL;
 	int4 *pack_nodes = (pack.nodes.size())? &pack.nodes[0]: NULL;
+	int4 *pack_leaf_nodes = (pack.leaf_nodes.size())? &pack.leaf_nodes[0]: NULL;
 
 	/* merge */
 	foreach(Object *ob, objects) {
@@ -466,12 +288,13 @@ void BVH::pack_instances(size_t nodes_size)
 		BVH *bvh = mesh->bvh;
 
 		int noffset = nodes_offset/nsize;
+		int noffset_leaf = nodes_leaf_offset/nsize_leaf;
 		int mesh_tri_offset = mesh->tri_offset;
 		int mesh_curve_offset = mesh->curve_offset;
 
 		/* fill in node indexes for instances */
-		if((bvh->pack.is_leaf.size() != 0) && bvh->pack.is_leaf[0])
-			pack.object_node[object_offset++] = -noffset-1;
+		if(bvh->pack.root_index == -1)
+			pack.object_node[object_offset++] = -noffset_leaf-1;
 		else
 			pack.object_node[object_offset++] = noffset;
 
@@ -505,11 +328,25 @@ void BVH::pack_instances(size_t nodes_size)
 		}
 
 		/* merge nodes */
+		if(bvh->pack.leaf_nodes.size()) {
+			int4 *leaf_nodes_offset = &bvh->pack.leaf_nodes[0];
+			size_t leaf_nodes_offset_size = bvh->pack.leaf_nodes.size();
+			for(size_t i = 0, j = 0; i < leaf_nodes_offset_size; i+=nsize_leaf, j++) {
+				int4 data = leaf_nodes_offset[i];
+				data.x += prim_offset;
+				data.y += prim_offset;
+				pack_leaf_nodes[pack_leaf_nodes_offset] = data;
+				pack_leaf_nodes_offset += nsize_leaf;
+			}
+		}
+
 		if(bvh->pack.nodes.size()) {
-			size_t nsize_bbox = (use_qbvh)? nsize-2: nsize-1;
+			/* For QBVH we're packing a child bbox into 6 float4,
+			 * and for regular BVH they're packed into 3 float4.
+			 */
+			size_t nsize_bbox = (use_qbvh)? 6: 3;
 			int4 *bvh_nodes = &bvh->pack.nodes[0];
 			size_t bvh_nodes_size = bvh->pack.nodes.size(); 
-			int *bvh_is_leaf = (bvh->pack.is_leaf.size() != 0) ? &bvh->pack.is_leaf[0] : NULL;
 
 			for(size_t i = 0, j = 0; i < bvh_nodes_size; i+=nsize, j++) {
 				memcpy(pack_nodes + pack_nodes_offset, bvh_nodes + i, nsize_bbox*sizeof(int4));
@@ -517,30 +354,29 @@ void BVH::pack_instances(size_t nodes_size)
 				/* modify offsets into arrays */
 				int4 data = bvh_nodes[i + nsize_bbox];
 
-				if(bvh_is_leaf && bvh_is_leaf[j]) {
-					data.x += prim_offset;
-					data.y += prim_offset;
-				}
-				else {
-					data.x += (data.x < 0)? -noffset: noffset;
-					data.y += (data.y < 0)? -noffset: noffset;
+				data.x += (data.x < 0)? -noffset_leaf: noffset;
+				data.y += (data.y < 0)? -noffset_leaf: noffset;
 
-					if(use_qbvh) {
-						data.z += (data.z < 0)? -noffset: noffset;
-						data.w += (data.w < 0)? -noffset: noffset;
-					}
+				if(use_qbvh) {
+					data.z += (data.z < 0)? -noffset_leaf: noffset;
+					data.w += (data.w < 0)? -noffset_leaf: noffset;
 				}
 
 				pack_nodes[pack_nodes_offset + nsize_bbox] = data;
 
-				if(use_qbvh)
-					pack_nodes[pack_nodes_offset + nsize_bbox+1] = bvh_nodes[i + nsize_bbox+1];
+				/* Usually this copies nothing, but we'd better
+				 * be prepared for a possible node size extension.
+				 */
+				memcpy(&pack_nodes[pack_nodes_offset + nsize_bbox+1],
+				       &bvh_nodes[i + nsize_bbox+1],
+				       sizeof(int4) * (nsize - (nsize_bbox+1)));
 
 				pack_nodes_offset += nsize;
 			}
 		}
 
 		nodes_offset += bvh->pack.nodes.size();
+		nodes_leaf_offset += bvh->pack.leaf_nodes.size();
 		prim_offset += bvh->pack.prim_index.size();
 	}
 }
@@ -554,12 +390,24 @@ RegularBVH::RegularBVH(const BVHParams& params_, const vector<Object*>& objects_
 
 void RegularBVH::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf)
 {
-	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->m_lo] == -1)
+	float4 data[BVH_NODE_LEAF_SIZE];
+	memset(data, 0, sizeof(data));
+	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->m_lo] == -1) {
 		/* object */
-		pack_node(e.idx, leaf->m_bounds, leaf->m_bounds, ~(leaf->m_lo), 0, leaf->m_visibility, leaf->m_visibility);
-	else
+		data[0].x = __int_as_float(~(leaf->m_lo));
+		data[0].y = __int_as_float(0);
+	}
+	else {
 		/* triangle */
-		pack_node(e.idx, leaf->m_bounds, leaf->m_bounds, leaf->m_lo, leaf->m_hi, leaf->m_visibility, leaf->m_visibility);
+		data[0].x = __int_as_float(leaf->m_lo);
+		data[0].y = __int_as_float(leaf->m_hi);
+	}
+	data[0].z = __uint_as_float(leaf->m_visibility);
+	if(leaf->num_triangles() != 0) {
+		data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]);
+	}
+
+	memcpy(&pack.leaf_nodes[e.idx * BVH_NODE_LEAF_SIZE], data, sizeof(float4)*BVH_NODE_LEAF_SIZE);
 }
 
 void RegularBVH::pack_inner(const BVHStackEntry& e, const BVHStackEntry& e0, const BVHStackEntry& e1)
@@ -580,33 +428,38 @@ void RegularBVH::pack_node(int idx, const BoundBox& b0, const BoundBox& b1, int
 	memcpy(&pack.nodes[idx * BVH_NODE_SIZE], data, sizeof(int4)*BVH_NODE_SIZE);
 }
 
-void RegularBVH::pack_nodes(const array<int>& prims, const BVHNode *root)
+void RegularBVH::pack_nodes(const BVHNode *root)
 {
-	size_t node_size = root->getSubtreeSize(BVH_STAT_NODE_COUNT);
+	size_t tot_node_size = root->getSubtreeSize(BVH_STAT_NODE_COUNT);
+	size_t leaf_node_size = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
+	size_t node_size = tot_node_size - leaf_node_size;
 
 	/* resize arrays */
 	pack.nodes.clear();
-	pack.is_leaf.clear();
-	pack.is_leaf.resize(node_size);
 
 	/* for top level BVH, first merge existing BVH's so we know the offsets */
-	if(params.top_level)
-		pack_instances(node_size*BVH_NODE_SIZE);
-	else
+	if(params.top_level) {
+		pack_instances(node_size*BVH_NODE_SIZE,
+		               leaf_node_size*BVH_NODE_LEAF_SIZE);
+	}
+	else {
 		pack.nodes.resize(node_size*BVH_NODE_SIZE);
+		pack.leaf_nodes.resize(leaf_node_size*BVH_NODE_LEAF_SIZE);
+	}
 
-	int nextNodeIdx = 0;
+	int nextNodeIdx = 0, nextLeafNodeIdx = 0;
 
 	vector<BVHStackEntry> stack;
 	stack.reserve(BVHParams::MAX_DEPTH*2);
-	stack.push_back(BVHStackEntry(root, nextNodeIdx++));
+	if(root->is_leaf())
+		stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
+	else
+		stack.push_back(BVHStackEntry(root, nextNodeIdx++));
 
 	while(stack.size()) {
 		BVHStackEntry e = stack.back();
 		stack.pop_back();
 
-		pack.is_leaf[e.idx] = e.node->is_leaf();
-
 		if(e.node->is_leaf()) {
 			/* leaf node */
 			const LeafNode* leaf = reinterpret_cast<const LeafNode*>(e.node);
@@ -614,15 +467,17 @@ void RegularBVH::pack_nodes(const array<int>& prims, const BVHNode *root)
 		}
 		else {
 			/* innner node */
-			stack.push_back(BVHStackEntry(e.node->get_child(0), nextNodeIdx++));
-			stack.push_back(BVHStackEntry(e.node->get_child(1), nextNodeIdx++));
+			int idx0 = (e.node->get_child(0)->is_leaf())? (nextLeafNodeIdx++) : (nextNodeIdx++);
+			int idx1 = (e.node->get_child(1)->is_leaf())? (nextLeafNodeIdx++) : (nextNodeIdx++);
+			stack.push_back(BVHStackEntry(e.node->get_child(0), idx0));
+			stack.push_back(BVHStackEntry(e.node->get_child(1), idx1));
 
 			pack_inner(e, stack[stack.size()-2], stack[stack.size()-1]);
 		}
 	}
 
 	/* root index to start traversal at, to handle case of single leaf node */
-	pack.root_index = (pack.is_leaf[0])? -1: 0;
+	pack.root_index = (root->is_leaf())? -1: 0;
 }
 
 void RegularBVH::refit_nodes()
@@ -631,17 +486,15 @@ void RegularBVH::refit_nodes()
 
 	BoundBox bbox = BoundBox::empty;
 	uint visibility = 0;
-	refit_node(0, (pack.is_leaf[0])? true: false, bbox, visibility);
+	refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
 }
 
 void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
 {
-	int4 *data = &pack.nodes[idx*4];
-
-	int c0 = data[3].x;
-	int c1 = data[3].y;
-
 	if(leaf) {
+		int4 *data = &pack.leaf_nodes[idx*BVH_NODE_LEAF_SIZE];
+		int c0 = data[0].x;
+		int c1 = data[0].y;
 		/* refit leaf node */
 		for(int prim = c0; prim < c1; prim++) {
 			int pidx = pack.prim_index[prim];
@@ -675,7 +528,7 @@ void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility
 							size_t steps = mesh->motion_steps - 1;
 							float4 *key_steps = attr->data_float4();
 
-							for (size_t i = 0; i < steps; i++)
+							for(size_t i = 0; i < steps; i++)
 								curve.bounds_grow(k, key_steps + i*mesh_size, bbox);
 						}
 					}
@@ -697,7 +550,7 @@ void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility
 							size_t steps = mesh->motion_steps - 1;
 							float3 *vert_steps = attr->data_float3();
 
-							for (size_t i = 0; i < steps; i++)
+							for(size_t i = 0; i < steps; i++)
 								triangle.bounds_grow(vert_steps + i*mesh_size, bbox);
 						}
 					}
@@ -707,9 +560,20 @@ void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility
 			visibility |= ob->visibility;
 		}
 
-		pack_node(idx, bbox, bbox, c0, c1, visibility, visibility);
+		/* TODO(sergey): De-duplicate with pack_leaf(). */
+		float4 leaf_data[BVH_NODE_LEAF_SIZE];
+		leaf_data[0].x = __int_as_float(c0);
+		leaf_data[0].y = __int_as_float(c1);
+		leaf_data[0].z = __uint_as_float(visibility);
+		leaf_data[0].w = __uint_as_float(data[0].w);
+		memcpy(&pack.leaf_nodes[idx * BVH_NODE_LEAF_SIZE],
+		       leaf_data,
+		       sizeof(float4)*BVH_NODE_LEAF_SIZE);
 	}
 	else {
+		int4 *data = &pack.nodes[idx*BVH_NODE_SIZE];
+		int c0 = data[3].x;
+		int c1 = data[3].y;
 		/* refit inner node, set bbox from children */
 		BoundBox bbox0 = BoundBox::empty, bbox1 = BoundBox::empty;
 		uint visibility0 = 0, visibility1 = 0;
@@ -731,28 +595,28 @@ QBVH::QBVH(const BVHParams& params_, const vector<Object*>& objects_)
 : BVH(params_, objects_)
 {
 	params.use_qbvh = true;
-
-	/* todo: use visibility */
 }
 
 void QBVH::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf)
 {
-	float4 data[BVH_QNODE_SIZE];
-
+	float4 data[BVH_QNODE_LEAF_SIZE];
 	memset(data, 0, sizeof(data));
-
 	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->m_lo] == -1) {
 		/* object */
-		data[6].x = __int_as_float(~(leaf->m_lo));
-		data[6].y = __int_as_float(0);
+		data[0].x = __int_as_float(~(leaf->m_lo));
+		data[0].y = __int_as_float(0);
 	}
 	else {
 		/* triangle */
-		data[6].x = __int_as_float(leaf->m_lo);
-		data[6].y = __int_as_float(leaf->m_hi);
+		data[0].x = __int_as_float(leaf->m_lo);
+		data[0].y = __int_as_float(leaf->m_hi);
+	}
+	data[0].z = __uint_as_float(leaf->m_visibility);
+	if(leaf->num_triangles() != 0) {
+		data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]);
 	}
 
-	memcpy(&pack.nodes[e.idx * BVH_QNODE_SIZE], data, sizeof(float4)*BVH_QNODE_SIZE);
+	memcpy(&pack.leaf_nodes[e.idx * BVH_QNODE_LEAF_SIZE], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE);
 }
 
 void QBVH::pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num)
@@ -771,20 +635,22 @@ void QBVH::pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num)
 		data[5][i] = bb_max.z;
 
 		data[6][i] = __int_as_float(en[i].encodeIdx());
-		data[7][i] = 0.0f;
 	}
 
 	for(int i = num; i < 4; i++) {
-		data[0][i] = 0.0f;
-		data[1][i] = 0.0f;
-		data[2][i] = 0.0f;
+		/* We store BB which would never be recorded as intersection
+		 * so kernel might safely assume there are always 4 child nodes.
+		 */
+		data[0][i] = FLT_MAX;
+		data[1][i] = -FLT_MAX;
+
+		data[2][i] = FLT_MAX;
+		data[3][i] = -FLT_MAX;
 
-		data[3][i] = 0.0f;
-		data[4][i] = 0.0f;
-		data[5][i] = 0.0f;
+		data[4][i] = FLT_MAX;
+		data[5][i] = -FLT_MAX;
 
 		data[6][i] = __int_as_float(0);
-		data[7][i] = 0.0f;
 	}
 
 	memcpy(&pack.nodes[e.idx * BVH_QNODE_SIZE], data, sizeof(float4)*BVH_QNODE_SIZE);
@@ -792,33 +658,41 @@ void QBVH::pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num)
 
 /* Quad SIMD Nodes */
 
-void QBVH::pack_nodes(const array<int>& prims, const BVHNode *root)
+void QBVH::pack_nodes(const BVHNode *root)
 {
-	size_t node_size = root->getSubtreeSize(BVH_STAT_NODE_COUNT);
+	size_t tot_node_size = root->getSubtreeSize(BVH_STAT_QNODE_COUNT);
+	size_t leaf_node_size = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
+	size_t node_size = tot_node_size - leaf_node_size;
 
 	/* resize arrays */
 	pack.nodes.clear();
-	pack.is_leaf.clear();
-	pack.is_leaf.resize(node_size);
+	pack.leaf_nodes.clear();
 
 	/* for top level BVH, first merge existing BVH's so we know the offsets */
-	if(params.top_level)
-		pack_instances(node_size*BVH_QNODE_SIZE);
-	else
+	if(params.top_level) {
+		pack_instances(node_size*BVH_QNODE_SIZE,
+		               leaf_node_size*BVH_QNODE_LEAF_SIZE);
+	}
+	else {
 		pack.nodes.resize(node_size*BVH_QNODE_SIZE);
+		pack.leaf_nodes.resize(leaf_node_size*BVH_QNODE_LEAF_SIZE);
+	}
 
-	int nextNodeIdx = 0;
+	int nextNodeIdx = 0, nextLeafNodeIdx = 0;
 
 	vector<BVHStackEntry> stack;
 	stack.reserve(BVHParams::MAX_DEPTH*2);
-	stack.push_back(BVHStackEntry(root, nextNodeIdx++));
+	if(root->is_leaf()) {
+		stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
+	}
+	else {
+		stack.push_back(BVHStackEntry(root, nextNodeIdx++));
+	}
 
 	while(stack.size()) {
 		BVHStackEntry e = stack.back();
 		stack.pop_back();
 
-		pack.is_leaf[e.idx] = e.node->is_leaf();
-
 		if(e.node->is_leaf()) {
 			/* leaf node */
 			const LeafNode* leaf = reinterpret_cast<const LeafNode*>(e.node);
@@ -851,8 +725,16 @@ void QBVH::pack_nodes(const array<int>& prims, const BVHNode *root)
 			}
 
 			/* push entries on the stack */
-			for(int i = 0; i < numnodes; i++)
-				stack.push_back(BVHStackEntry(nodes[i], nextNodeIdx++));
+			for(int i = 0; i < numnodes; i++) {
+				int idx;
+				if(nodes[i]->is_leaf()) {
+					idx = nextLeafNodeIdx++;
+				}
+				else {
+					idx = nextNodeIdx++;
+				}
+				stack.push_back(BVHStackEntry(nodes[i], idx));
+			}
 
 			/* set node */
 			pack_inner(e, &stack[stack.size()-numnodes], numnodes);
@@ -860,13 +742,144 @@ void QBVH::pack_nodes(const array<int>& prims, const BVHNode *root)
 	}
 
 	/* root index to start traversal at, to handle case of single leaf node */
-	pack.root_index = (pack.is_leaf[0])? -1: 0;
+	pack.root_index = (root->is_leaf())? -1: 0;
 }
 
 void QBVH::refit_nodes()
 {
-	assert(0); /* todo */
+	assert(!params.top_level);
+
+	BoundBox bbox = BoundBox::empty;
+	uint visibility = 0;
+	refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
 }
 
-CCL_NAMESPACE_END
+/* Refit packed node idx in place (leaf selects pack.leaf_nodes over pack.nodes),
+ * growing bbox and OR-ing visibility with what the node's subtree contains. */
+void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
+{
+	if(leaf) {
+		int4 *data = &pack.leaf_nodes[idx*BVH_QNODE_LEAF_SIZE];
+		int4 c = data[0];
+		/* Refit leaf node; [c.x, c.y) is its primitive range. */
+		for(int prim = c.x; prim < c.y; prim++) {
+			int pidx = pack.prim_index[prim];
+			int tob = pack.prim_object[prim];
+			Object *ob = objects[tob];
+
+			if(pidx == -1) {
+				/* Object instance. */
+				bbox.grow(ob->bounds);
+			}
+			else {
+				/* Primitives. */
+				const Mesh *mesh = ob->mesh;
+
+				if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
+					/* Curves. */
+					int str_offset = (params.top_level)? mesh->curve_offset: 0;
+					const Mesh::Curve& curve = mesh->curves[pidx - str_offset];
+					int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);
+
+					curve.bounds_grow(k, &mesh->curve_keys[0], bbox);
+
+					visibility |= PATH_RAY_CURVE;
+
+					/* Motion curves. */
+					if(mesh->use_motion_blur) {
+						Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+						if(attr) {
+							size_t mesh_size = mesh->curve_keys.size();
+							size_t steps = mesh->motion_steps - 1;
+							float4 *key_steps = attr->data_float4();
+
+							for(size_t i = 0; i < steps; i++)
+								curve.bounds_grow(k, key_steps + i*mesh_size, bbox);
+						}
+					}
+				}
+				else {
+					/* Triangles. */
+					int tri_offset = (params.top_level)? mesh->tri_offset: 0;
+					const Mesh::Triangle& triangle = mesh->triangles[pidx - tri_offset];
+					const float3 *vpos = &mesh->verts[0];
+
+					triangle.bounds_grow(vpos, bbox);
+
+					/* Motion triangles. */
+					if(mesh->use_motion_blur) {
+						Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+						if(attr) {
+							size_t mesh_size = mesh->verts.size();
+							size_t steps = mesh->motion_steps - 1;
+							float3 *vert_steps = attr->data_float3();
+
+							for(size_t i = 0; i < steps; i++)
+								triangle.bounds_grow(vert_steps + i*mesh_size, bbox);
+						}
+					}
+				}
+			}
+
+			visibility |= ob->visibility;
+		}
+
+		/* TODO(sergey): This is actually a copy of pack_leaf(),
+		 * but this chunk of code only knows actual data and has
+		 * no idea about BVHNode.
+		 *
+		 * Would be nice to de-duplicate code, but trying to make
+		 * the code more general ends up in much nastier code
+		 * in my opinion so far.
+		 *
+		 * Same applies to the inner nodes case below.
+		 */
+		float4 leaf_data[BVH_QNODE_LEAF_SIZE];
+		leaf_data[0].x = __int_as_float(c.x);
+		leaf_data[0].y = __int_as_float(c.y);
+		leaf_data[0].z = __uint_as_float(visibility);
+		leaf_data[0].w = __uint_as_float(c.w);
+		memcpy(&pack.leaf_nodes[idx * BVH_QNODE_LEAF_SIZE],
+		       leaf_data,
+		       sizeof(float4)*BVH_QNODE_LEAF_SIZE);
+	}
+	else {
+		int4 *data = &pack.nodes[idx*BVH_QNODE_SIZE];
+		int4 c = data[6];
+		/* Refit inner node, set bbox from children; c[i] == 0 is skipped, c[i] < 0 is a leaf. */
+		BoundBox child_bbox[4] = {BoundBox::empty,
+		                          BoundBox::empty,
+		                          BoundBox::empty,
+		                          BoundBox::empty};
+		uint child_visibility[4] = {0};
+
+		for(int i = 0; i < 4; ++i) {
+			if(c[i] != 0) {
+				refit_node((c[i] < 0)? -c[i]-1: c[i], (c[i] < 0),
+				           child_bbox[i], child_visibility[i]);
+				bbox.grow(child_bbox[i]);
+				visibility |= child_visibility[i];
+			}
+		}
 
+		float4 inner_data[BVH_QNODE_SIZE];
+		for(int i = 0; i < 4; ++i) {
+			float3 bb_min = child_bbox[i].min;
+			float3 bb_max = child_bbox[i].max;
+			inner_data[0][i] = bb_min.x;
+			inner_data[1][i] = bb_max.x;
+			inner_data[2][i] = bb_min.y;
+			inner_data[3][i] = bb_max.y;
+			inner_data[4][i] = bb_min.z;
+			inner_data[5][i] = bb_max.z;
+			inner_data[6][i] = __int_as_float(c[i]);
+		}
+		memcpy(&pack.nodes[idx * BVH_QNODE_SIZE],
+		       inner_data,
+		       sizeof(float4)*BVH_QNODE_SIZE);
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h
index 5fcaaaa988c..272a3fa1514 100644
--- a/intern/cycles/bvh/bvh.h
+++ b/intern/cycles/bvh/bvh.h
@@ -20,7 +20,6 @@
 
 #include "bvh_params.h"
 
-#include "util_string.h"
 #include "util_types.h"
 #include "util_vector.h"
 
@@ -30,13 +29,14 @@ class BVHNode;
 struct BVHStackEntry;
 class BVHParams;
 class BoundBox;
-class CacheData;
 class LeafNode;
 class Object;
 class Progress;
 
 #define BVH_NODE_SIZE	4
-#define BVH_QNODE_SIZE	8
+#define BVH_NODE_LEAF_SIZE	1
+#define BVH_QNODE_SIZE	7
+#define BVH_QNODE_LEAF_SIZE	1
 #define BVH_ALIGN		4096
 #define TRI_NODE_SIZE	3
 
@@ -47,7 +47,9 @@ class Progress;
 struct PackedBVH {
 	/* BVH nodes storage, one node is 4x int4, and contains two bounding boxes,
 	 * and child, triangle or object indexes depending on the node type */
-	array<int4> nodes; 
+	array<int4> nodes;
+	/* BVH leaf nodes storage. */
+	array<int4> leaf_nodes;
 	/* object index to BVH node index mapping for instances */
 	array<int> object_node; 
 	/* precomputed triangle intersection data, one triangle is 4x float4 */
@@ -61,9 +63,6 @@ struct PackedBVH {
 	array<int> prim_index;
 	/* mapping from BVH primitive index, to the object id of that primitive. */
 	array<int> prim_object;
-	/* quick array to lookup if a node is a leaf, not used for traversal, only
-	 * for instance BVH merging  */
-	array<int> is_leaf;
 
 	/* index of the root node. */
 	int root_index;
@@ -86,7 +85,6 @@ public:
 	PackedBVH pack;
 	BVHParams params;
 	vector<Object*> objects;
-	string cache_filename;
 
 	static BVH *create(const BVHParams& params, const vector<Object*>& objects);
 	virtual ~BVH() {}
@@ -94,25 +92,18 @@ public:
 	void build(Progress& progress);
 	void refit(Progress& progress);
 
-	void clear_cache_except();
-
 protected:
 	BVH(const BVHParams& params, const vector<Object*>& objects);
 
-	/* cache */
-	bool cache_read(CacheData& key);
-	void cache_write(CacheData& key);
-
 	/* triangles and strands*/
 	void pack_primitives();
 	void pack_triangle(int idx, float4 woop[3]);
-	void pack_curve_segment(int idx, float4 woop[3]);
 
 	/* merge instance BVH's */
-	void pack_instances(size_t nodes_size);
+	void pack_instances(size_t nodes_size, size_t leaf_nodes_size);
 
 	/* for subclasses to implement */
-	virtual void pack_nodes(const array<int>& prims, const BVHNode *root) = 0;
+	virtual void pack_nodes(const BVHNode *root) = 0;
 	virtual void refit_nodes() = 0;
 };
 
@@ -127,7 +118,7 @@ protected:
 	RegularBVH(const BVHParams& params, const vector<Object*>& objects);
 
 	/* pack */
-	void pack_nodes(const array<int>& prims, const BVHNode *root);
+	void pack_nodes(const BVHNode *root);
 	void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf);
 	void pack_inner(const BVHStackEntry& e, const BVHStackEntry& e0, const BVHStackEntry& e1);
 	void pack_node(int idx, const BoundBox& b0, const BoundBox& b1, int c0, int c1, uint visibility0, uint visibility1);
@@ -148,12 +139,13 @@ protected:
 	QBVH(const BVHParams& params, const vector<Object*>& objects);
 
 	/* pack */
-	void pack_nodes(const array<int>& prims, const BVHNode *root);
+	void pack_nodes(const BVHNode *root);
 	void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf);
 	void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num);
 
 	/* refit */
 	void refit_nodes();
+	void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh_binning.cpp b/intern/cycles/bvh/bvh_binning.cpp
index bd37ffbcf38..8745e39c21e 100644
--- a/intern/cycles/bvh/bvh_binning.cpp
+++ b/intern/cycles/bvh/bvh_binning.cpp
@@ -29,10 +29,10 @@ CCL_NAMESPACE_BEGIN
 
 /* SSE replacements */
 
-__forceinline void prefetch_L1 (const void* ptr) { }
-__forceinline void prefetch_L2 (const void* ptr) { }
-__forceinline void prefetch_L3 (const void* ptr) { }
-__forceinline void prefetch_NTA(const void* ptr) { }
+__forceinline void prefetch_L1 (const void* /*ptr*/) { }
+__forceinline void prefetch_L2 (const void* /*ptr*/) { }
+__forceinline void prefetch_L3 (const void* /*ptr*/) { }
+__forceinline void prefetch_NTA(const void* /*ptr*/) { }
 
 template<size_t src> __forceinline float extract(const int4& b)
 { return b[src]; }
@@ -76,8 +76,8 @@ BVHObjectBinning::BVHObjectBinning(const BVHRange& job, BVHReference *prims)
 			prefetch_L2(&prims[start() + i + 8]);
 
 			/* map even and odd primitive to bin */
-			BVHReference prim0 = prims[start() + i + 0];
-			BVHReference prim1 = prims[start() + i + 1];
+			const BVHReference& prim0 = prims[start() + i + 0];
+			const BVHReference& prim1 = prims[start() + i + 1];
 
 			int4 bin0 = get_bin(prim0.bounds());
 			int4 bin1 = get_bin(prim1.bounds());
@@ -96,7 +96,7 @@ BVHObjectBinning::BVHObjectBinning(const BVHRange& job, BVHReference *prims)
 		/* for uneven number of primitives */
 		if(i < ssize_t(size())) {
 			/* map primitive to bin */
-			BVHReference prim0 = prims[start() + i];
+			const BVHReference& prim0 = prims[start() + i];
 			int4 bin0 = get_bin(prim0.bounds());
 
 			/* increase bounds of bins */
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index eb4cca92b6b..45b0aaa2d63 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -28,11 +28,27 @@
 
 #include "util_debug.h"
 #include "util_foreach.h"
+#include "util_logging.h"
 #include "util_progress.h"
 #include "util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
+#if !defined(__KERNEL_SSE2__)
+/* Non-SSE fallback: returns the index of the lowest set bit of a non-zero
+ * value. Scans an unsigned copy so negative inputs terminate. TODO(sergey):
+ * Move to some generic header so all code can use bitscan on non-SSE processors. */
+ccl_device_inline int bitscan(int value)
+{
+	assert(value != 0);
+	int bit = 0;
+	for(unsigned int bits = (unsigned int)value; bits != 0 && (bits & 1u) == 0; bits >>= 1) {
+		++bit;
+	}
+	return bit;
+}
+#endif
+
 /* BVH Build Task */
 
 class BVHBuildTask : public Task {
@@ -49,15 +65,18 @@ public:
 /* Constructor / Destructor */
 
 BVHBuild::BVHBuild(const vector<Object*>& objects_,
-	vector<int>& prim_type_, vector<int>& prim_index_, vector<int>& prim_object_,
-	const BVHParams& params_, Progress& progress_)
-: objects(objects_),
-  prim_type(prim_type_),
-  prim_index(prim_index_),
-  prim_object(prim_object_),
-  params(params_),
-  progress(progress_),
-  progress_start_time(0.0)
+                   array<int>& prim_type_,
+                   array<int>& prim_index_,
+                   array<int>& prim_object_,
+                   const BVHParams& params_,
+                   Progress& progress_)
+ : objects(objects_),
+   prim_type(prim_type_),
+   prim_index(prim_index_),
+   prim_object(prim_object_),
+   params(params_),
+   progress(progress_),
+   progress_start_time(0.0)
 {
 	spatial_min_overlap = 0.0f;
 }
@@ -120,7 +139,7 @@ void BVHBuild::add_reference_mesh(BoundBox& root, BoundBox& center, Mesh *mesh,
 				size_t steps = mesh->motion_steps - 1;
 				float4 *key_steps = curve_attr_mP->data_float4();
 
-				for (size_t i = 0; i < steps; i++)
+				for(size_t i = 0; i < steps; i++)
 					curve.bounds_grow(k, key_steps + i*mesh_size, bounds);
 
 				type = PRIMITIVE_MOTION_CURVE;
@@ -215,15 +234,22 @@ BVHNode* BVHBuild::run()
 		return NULL;
 
 	/* init spatial splits */
-	if(params.top_level) /* todo: get rid of this */
+	if(params.top_level) {
+		/* NOTE: Technically it is supported by the builder but it's not really
+		 * optimized for speed yet and not really clear yet if it has measurable
+		 * improvement on render time. Needs some extra investigation before
+		 * enabling spatial split for top level BVH.
+		 */
 		params.use_spatial_split = false;
+	}
 
 	spatial_min_overlap = root.bounds().safe_area() * params.spatial_split_alpha;
 	spatial_right_bounds.clear();
 	spatial_right_bounds.resize(max(root.size(), (int)BVHParams::NUM_SPATIAL_BINS) - 1);
 
 	/* init progress updates */
-	progress_start_time = time_dt();
+	double build_start_time;
+	build_start_time = progress_start_time = time_dt();
 	progress_count = 0;
 	progress_total = references.size();
 	progress_original_total = progress_total;
@@ -251,13 +277,25 @@ BVHNode* BVHBuild::run()
 		if(progress.get_cancel()) {
 			rootnode->deleteSubtree();
 			rootnode = NULL;
+			VLOG(1) << "BVH build cancelled.";
 		}
 		else if(!params.use_spatial_split) {
 			/*rotate(rootnode, 4, 5);*/
 			rootnode->update_visibility();
 		}
+		if(rootnode != NULL) {
+			VLOG(1) << "BVH build statistics:\n"
+			        << "  Build time: " << time_dt() - build_start_time << "\n"
+			        << "  Total number of nodes: "
+			        << rootnode->getSubtreeSize(BVH_STAT_NODE_COUNT) << "\n"
+			        << "  Number of inner nodes: "
+			        << rootnode->getSubtreeSize(BVH_STAT_INNER_COUNT)  << "\n"
+			        << "  Number of leaf nodes: "
+			        << rootnode->getSubtreeSize(BVH_STAT_LEAF_COUNT)  << "\n";
+		}
 	}
 
+
 	return rootnode;
 }
 
@@ -308,17 +346,22 @@ bool BVHBuild::range_within_max_leaf_size(const BVHRange& range)
 	
 	size_t num_triangles = 0;
 	size_t num_curves = 0;
+	size_t num_motion_curves = 0;
 
 	for(int i = 0; i < size; i++) {
 		BVHReference& ref = references[range.start() + i];
 
-		if(ref.prim_type() & PRIMITIVE_ALL_CURVE)
+		if(ref.prim_type() & PRIMITIVE_CURVE)
 			num_curves++;
+		if(ref.prim_type() & PRIMITIVE_MOTION_CURVE)
+			num_motion_curves++;
 		else if(ref.prim_type() & PRIMITIVE_ALL_TRIANGLE)
 			num_triangles++;
 	}
 
-	return (num_triangles < params.max_triangle_leaf_size) && (num_curves < params.max_curve_leaf_size);
+	return (num_triangles < params.max_triangle_leaf_size) &&
+	       (num_curves < params.max_curve_leaf_size) &&
+	       (num_motion_curves < params.max_curve_leaf_size);
 }
 
 /* multithreaded binning builder */
@@ -394,7 +437,7 @@ BVHNode* BVHBuild::build_node(const BVHRange& range, int level)
 	progress_total += left.size() + right.size() - range.size();
 	size_t total = progress_total;
 
-	/* leaft node */
+	/* left node */
 	BVHNode *leftnode = build_node(left, level + 1);
 
 	/* right node (modify start for splits) */
@@ -414,18 +457,10 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start,
 		return new LeafNode(bounds, 0, 0, 0);
 	}
 	else if(num == 1) {
-		if(start == prim_index.size()) {
-			assert(params.use_spatial_split);
-
-			prim_type.push_back(ref->prim_type());
-			prim_index.push_back(ref->prim_index());
-			prim_object.push_back(ref->prim_object());
-		}
-		else {
-			prim_type[start] = ref->prim_type();
-			prim_index[start] = ref->prim_index();
-			prim_object[start] = ref->prim_object();
-		}
+		assert(start < prim_type.size());
+		prim_type[start] = ref->prim_type();
+		prim_index[start] = ref->prim_index();
+		prim_object[start] = ref->prim_object();
 
 		uint visibility = objects[ref->prim_object()]->visibility;
 		return new LeafNode(ref->bounds(), visibility, start, start+1);
@@ -443,61 +478,128 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start,
 	}
 }
 
-BVHNode* BVHBuild::create_leaf_node(const BVHRange& range)
+BVHNode *BVHBuild::create_primitive_leaf_node(const int *p_type,
+                                              const int *p_index,
+                                              const int *p_object,
+                                              const BoundBox& bounds,
+                                              uint visibility,
+                                              int start,
+                                              int num)
 {
-	vector<int>& p_type = prim_type;
-	vector<int>& p_index = prim_index;
-	vector<int>& p_object = prim_object;
-	BoundBox bounds = BoundBox::empty;
-	int num = 0, ob_num = 0;
-	uint visibility = 0;
+	for(int i = 0; i < num; ++i) {
+		prim_type[start + i] = p_type[i];
+		prim_index[start + i] = p_index[i];
+		prim_object[start + i] = p_object[i];
+	}
+	return new LeafNode(bounds, visibility, start, start + num);
+}
 
+BVHNode* BVHBuild::create_leaf_node(const BVHRange& range)
+{
+	/* TODO(sergey): Consider writing own allocator which would
+	 * not do heap allocation if number of elements is relatively small.
+	 */
+	vector<int> p_type[PRIMITIVE_NUM_TOTAL];
+	vector<int> p_index[PRIMITIVE_NUM_TOTAL];
+	vector<int> p_object[PRIMITIVE_NUM_TOTAL];
+	uint visibility[PRIMITIVE_NUM_TOTAL] = {0};
+	/* NOTE: Keep initialization in sync with actual number of primitives. */
+	BoundBox bounds[PRIMITIVE_NUM_TOTAL] = {BoundBox::empty,
+	                                        BoundBox::empty,
+	                                        BoundBox::empty,
+	                                        BoundBox::empty};
+	int ob_num = 0;
+
+	/* Fill in per-type type/index array. */
 	for(int i = 0; i < range.size(); i++) {
 		BVHReference& ref = references[range.start() + i];
-
 		if(ref.prim_index() != -1) {
-			if(range.start() + num == prim_index.size()) {
-				assert(params.use_spatial_split);
-
-				p_type.push_back(ref.prim_type());
-				p_index.push_back(ref.prim_index());
-				p_object.push_back(ref.prim_object());
-			}
-			else {
-				p_type[range.start() + num] = ref.prim_type();
-				p_index[range.start() + num] = ref.prim_index();
-				p_object[range.start() + num] = ref.prim_object();
-			}
+			int type_index = bitscan(ref.prim_type() & PRIMITIVE_ALL);
+			p_type[type_index].push_back(ref.prim_type());
+			p_index[type_index].push_back(ref.prim_index());
+			p_object[type_index].push_back(ref.prim_object());
 
-			bounds.grow(ref.bounds());
-			visibility |= objects[ref.prim_object()]->visibility;
-			num++;
+			bounds[type_index].grow(ref.bounds());
+			visibility[type_index] |= objects[ref.prim_object()]->visibility;
 		}
 		else {
-			if(ob_num < i)
+			if(ob_num < i) {
 				references[range.start() + ob_num] = ref;
+			}
 			ob_num++;
 		}
 	}
 
-	BVHNode *leaf = NULL;
-	
-	if(num > 0) {
-		leaf = new LeafNode(bounds, visibility, range.start(), range.start() + num);
+	/* Extend an array when needed. */
+	if(prim_type.size() < range.end()) {
+		assert(params.use_spatial_split);
+		prim_type.reserve(references.size());
+		prim_index.reserve(references.size());
+		prim_object.reserve(references.size());
+		prim_type.resize(range.end());
+		prim_index.resize(range.end());
+		prim_object.resize(range.end());
+	}
 
-		if(num == range.size())
-			return leaf;
+	/* Create leaf nodes for every existing primitive. */
+	BVHNode *leaves[PRIMITIVE_NUM_TOTAL + 1] = {NULL};
+	int num_leaves = 0;
+	int start = range.start();
+	for(int i = 0; i < PRIMITIVE_NUM_TOTAL; ++i) {
+		int num = (int)p_type[i].size();
+		if(num != 0) {
+			assert(p_type[i].size() == p_index[i].size());
+			assert(p_type[i].size() == p_object[i].size());
+			leaves[num_leaves] = create_primitive_leaf_node(&p_type[i][0],
+			                                                &p_index[i][0],
+			                                                &p_object[i][0],
+			                                                bounds[i],
+			                                                visibility[i],
+			                                                start,
+			                                                num);
+			++num_leaves;
+			start += num;
+		}
 	}
 
-	/* while there may be multiple triangles in a leaf, for object primitives
-	 * we want there to be the only one, so we keep splitting */
-	const BVHReference *ref = (ob_num)? &references[range.start()]: NULL;
-	BVHNode *oleaf = create_object_leaf_nodes(ref, range.start() + num, ob_num);
-	
-	if(leaf)
-		return new InnerNode(range.bounds(), leaf, oleaf);
-	else
-		return oleaf;
+	/* Create leaf node for object. */
+	if(num_leaves == 0 || ob_num) {
+		/* Only create object leaf nodes if there are objects or no other
+		 * nodes created.
+		 */
+		const BVHReference *ref = (ob_num)? &references[range.start()]: NULL;
+		leaves[num_leaves] = create_object_leaf_nodes(ref, start, ob_num);
+		++num_leaves;
+	}
+
+	if(num_leaves == 1) {
+		/* Simplest case: single leaf, just return it.
+		 * In all the rest cases we'll be creating intermediate inner node with
+		 * an appropriate bounding box.
+		 */
+		return leaves[0];
+	}
+	else if(num_leaves == 2) {
+		return new InnerNode(range.bounds(), leaves[0], leaves[1]);
+	}
+	else if(num_leaves == 3) {
+		BoundBox inner_bounds = merge(leaves[1]->m_bounds, leaves[2]->m_bounds);
+		BVHNode *inner = new InnerNode(inner_bounds, leaves[1], leaves[2]);
+		return new InnerNode(range.bounds(), leaves[0], inner);
+	} else {
+		/* Should be doing more branches if more primitive types are added. */
+		assert(num_leaves <= 5);
+		BoundBox inner_bounds_a = merge(leaves[0]->m_bounds, leaves[1]->m_bounds);
+		BoundBox inner_bounds_b = merge(leaves[2]->m_bounds, leaves[3]->m_bounds);
+		BVHNode *inner_a = new InnerNode(inner_bounds_a, leaves[0], leaves[1]);
+		BVHNode *inner_b = new InnerNode(inner_bounds_b, leaves[2], leaves[3]);
+		BoundBox inner_bounds_c = merge(inner_a->m_bounds, inner_b->m_bounds);
+		BVHNode *inner_c = new InnerNode(inner_bounds_c, inner_a, inner_b);
+		if(num_leaves == 5) {
+			return new InnerNode(range.bounds(), inner_c, leaves[4]);
+		}
+		return inner_c;
+	}
 }
 
 /* Tree Rotations */
@@ -582,4 +684,3 @@ void BVHBuild::rotate(BVHNode *node, int max_depth)
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h
index a6b9916de9b..eefb7b60f7c 100644
--- a/intern/cycles/bvh/bvh_build.h
+++ b/intern/cycles/bvh/bvh_build.h
@@ -42,13 +42,12 @@ class BVHBuild
 {
 public:
 	/* Constructor/Destructor */
-	BVHBuild(
-		const vector<Object*>& objects,
-		vector<int>& prim_type,
-		vector<int>& prim_index,
-		vector<int>& prim_object,
-		const BVHParams& params,
-		Progress& progress);
+	BVHBuild(const vector<Object*>& objects,
+	         array<int>& prim_type,
+	         array<int>& prim_index,
+	         array<int>& prim_object,
+	         const BVHParams& params,
+	         Progress& progress);
 	~BVHBuild();
 
 	BVHNode *run();
@@ -70,6 +69,15 @@ protected:
 	BVHNode *create_leaf_node(const BVHRange& range);
 	BVHNode *create_object_leaf_nodes(const BVHReference *ref, int start, int num);
 
+	/* Leaf node type splitting: stores num primitives at start and wraps them in a leaf. */
+	BVHNode *create_primitive_leaf_node(const int *p_type,
+	                                    const int *p_index,
+	                                    const int *p_object,
+	                                    const BoundBox& bounds,
+	                                    uint visibility,
+	                                    int start,
+	                                    int num);
+
 	bool range_within_max_leaf_size(const BVHRange& range);
 
 	/* threads */
@@ -90,9 +98,9 @@ protected:
 	int num_original_references;
 
 	/* output primitive indexes and objects */
-	vector<int>& prim_type;
-	vector<int>& prim_index;
-	vector<int>& prim_object;
+	array<int>& prim_type;
+	array<int>& prim_index;
+	array<int>& prim_object;
 
 	/* build parameters */
 	BVHParams params;
@@ -116,4 +124,3 @@ protected:
 CCL_NAMESPACE_END
 
 #endif /* __BVH_BUILD_H__ */
-
diff --git a/intern/cycles/bvh/bvh_node.cpp b/intern/cycles/bvh/bvh_node.cpp
index 7cc9bd333b0..8294690da7d 100644
--- a/intern/cycles/bvh/bvh_node.cpp
+++ b/intern/cycles/bvh/bvh_node.cpp
@@ -47,6 +47,20 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 		case BVH_STAT_CHILDNODE_COUNT:
 			cnt = num_children();
 			break;
+		case BVH_STAT_QNODE_COUNT:
+			cnt = 1;
+			for(int i = 0; i < num_children(); i++) {
+				BVHNode *node = get_child(i);
+				if(node->is_leaf()) {
+					cnt += 1;
+				}
+				else {
+					for(int j = 0; j < node->num_children(); j++) {
+						cnt += node->get_child(j)->getSubtreeSize(stat);
+					}
+				}
+			}
+			return cnt;
 		default:
 			assert(0); /* unknown mode */
 	}
diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h
index a0d10a46bfc..44f5518229b 100644
--- a/intern/cycles/bvh/bvh_node.h
+++ b/intern/cycles/bvh/bvh_node.h
@@ -24,13 +24,13 @@
 
 CCL_NAMESPACE_BEGIN
 
-enum BVH_STAT
-{
+enum BVH_STAT {
 	BVH_STAT_NODE_COUNT,
 	BVH_STAT_INNER_COUNT,
 	BVH_STAT_LEAF_COUNT,
 	BVH_STAT_TRIANGLE_COUNT,
-	BVH_STAT_CHILDNODE_COUNT
+	BVH_STAT_CHILDNODE_COUNT,
+	BVH_STAT_QNODE_COUNT,
 };
 
 class BVHParams;
diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h
index e073b69472e..faa995c3f29 100644
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@@ -28,7 +28,7 @@ class BVHParams
 {
 public:
 	/* spatial split area threshold */
-	int use_spatial_split;
+	bool use_spatial_split;
 	float spatial_split_alpha;
 
 	/* SAH costs */
@@ -41,15 +41,10 @@ public:
 	int max_curve_leaf_size;
 
 	/* object or mesh level bvh */
-	int top_level;
-
-	/* disk cache */
-	int use_cache;
+	bool top_level;
 
 	/* QBVH */
-	int use_qbvh;
-
-	int pad;
+	bool use_qbvh;
 
 	/* fixed parameters */
 	enum {
@@ -73,9 +68,7 @@ public:
 		max_curve_leaf_size = 2;
 
 		top_level = false;
-		use_cache = false;
 		use_qbvh = false;
-		pad = false;
 	}
 
 	/* SAH costs */
@@ -115,6 +108,13 @@ public:
 	__forceinline int prim_object() const { return __float_as_int(rbounds.max.w); }
 	__forceinline int prim_type() const { return type; }
 
+	BVHReference& operator=(const BVHReference &arg) {
+		if(&arg != this) {
+			memcpy(this, &arg, sizeof(BVHReference));
+		}
+		return *this;
+	}
+
 protected:
 	BoundBox rbounds;
 	uint type;
diff --git a/intern/cycles/bvh/bvh_split.cpp b/intern/cycles/bvh/bvh_split.cpp
index 07c35c08c18..534c1aa73b5 100644
--- a/intern/cycles/bvh/bvh_split.cpp
+++ b/intern/cycles/bvh/bvh_split.cpp
@@ -191,11 +191,16 @@ void BVHSpatialSplit::split(BVHBuild *builder, BVHRange& left, BVHRange& right,
 		}
 	}
 
-	/* duplicate or unsplit references intersecting both sides. */
+	/* Duplicate or unsplit references intersecting both sides.
+	 *
+	 * Duplication happens into a temporary pre-allocated vector in order to
+	 * reduce number of memmove() calls happening in vector.insert().
+	 */
+	vector<BVHReference> new_refs;
+	new_refs.reserve(right_start - left_end);
 	while(left_end < right_start) {
 		/* split reference. */
 		BVHReference lref, rref;
-
 		split_reference(builder, lref, rref, refs[left_end], this->dim, this->pos);
 
 		/* compute SAH for duplicate/unsplit candidates. */
@@ -234,61 +239,36 @@ void BVHSpatialSplit::split(BVHBuild *builder, BVHRange& left, BVHRange& right,
 			left_bounds = ldb;
 			right_bounds = rdb;
 			refs[left_end++] = lref;
-			refs.insert(refs.begin() + right_end, rref);
+			new_refs.push_back(rref);
 			right_end++;
 		}
 	}
-
+	/* Insert duplicated references into actual array in one go. */
+	if(new_refs.size() != 0) {
+		refs.insert(refs.begin() + right_end - new_refs.size(),
+		            new_refs.begin(),
+		            new_refs.end());
+	}
 	left = BVHRange(left_bounds, left_start, left_end - left_start);
 	right = BVHRange(right_bounds, right_start, right_end - right_start);
 }
 
-void BVHSpatialSplit::split_reference(BVHBuild *builder, BVHReference& left, BVHReference& right, const BVHReference& ref, int dim, float pos)
+void BVHSpatialSplit::split_triangle_primitive(const Mesh *mesh,
+                                               const Transform *tfm,
+                                               int prim_index,
+                                               int dim,
+                                               float pos,
+                                               BoundBox& left_bounds,
+                                               BoundBox& right_bounds)
 {
-	/* initialize boundboxes */
-	BoundBox left_bounds = BoundBox::empty;
-	BoundBox right_bounds = BoundBox::empty;
-
-	/* loop over vertices/edges. */
-	Object *ob = builder->objects[ref.prim_object()];
-	const Mesh *mesh = ob->mesh;
-
-	if (ref.prim_type() & PRIMITIVE_ALL_TRIANGLE) {
-		const int *inds = mesh->triangles[ref.prim_index()].v;
-		const float3 *verts = &mesh->verts[0];
-		const float3* v1 = &verts[inds[2]];
-
-		for(int i = 0; i < 3; i++) {
-			const float3* v0 = v1;
-			int vindex = inds[i];
-			v1 = &verts[vindex];
-			float v0p = (*v0)[dim];
-			float v1p = (*v1)[dim];
-
-			/* insert vertex to the boxes it belongs to. */
-			if(v0p <= pos)
-				left_bounds.grow(*v0);
-
-			if(v0p >= pos)
-				right_bounds.grow(*v0);
-
-			/* edge intersects the plane => insert intersection to both boxes. */
-			if((v0p < pos && v1p > pos) || (v0p > pos && v1p < pos)) {
-				float3 t = lerp(*v0, *v1, clamp((pos - v0p) / (v1p - v0p), 0.0f, 1.0f));
-				left_bounds.grow(t);
-				right_bounds.grow(t);
-			}
-		}
-	}
-	else {
-		/* curve split: NOTE - Currently ignores curve width and needs to be fixed.*/
-		const int k0 = mesh->curves[ref.prim_index()].first_key + PRIMITIVE_UNPACK_SEGMENT(ref.prim_type());
-		const int k1 = k0 + 1;
-		const float4 key0 = mesh->curve_keys[k0];
-		const float4 key1 = mesh->curve_keys[k1];
-		const float3 v0 = float4_to_float3(key0);
-		const float3 v1 = float4_to_float3(key1);
-
+	const int *inds = mesh->triangles[prim_index].v;
+	const float3 *verts = &mesh->verts[0];
+	float3 v1 = tfm ? transform_point(tfm, verts[inds[2]]) : verts[inds[2]];
+
+	for(int i = 0; i < 3; i++) {
+		float3 v0 = v1;
+		int vindex = inds[i];
+		v1 = tfm ? transform_point(tfm, verts[vindex]) : verts[vindex];
 		float v0p = v0[dim];
 		float v1p = v1[dim];
 
@@ -299,12 +279,6 @@ void BVHSpatialSplit::split_reference(BVHBuild *builder, BVHReference& left, BVH
 		if(v0p >= pos)
 			right_bounds.grow(v0);
 
-		if(v1p <= pos)
-			left_bounds.grow(v1);
-
-		if(v1p >= pos)
-			right_bounds.grow(v1);
-
 		/* edge intersects the plane => insert intersection to both boxes. */
 		if((v0p < pos && v1p > pos) || (v0p > pos && v1p < pos)) {
 			float3 t = lerp(v0, v1, clamp((pos - v0p) / (v1p - v0p), 0.0f, 1.0f));
@@ -312,6 +286,159 @@ void BVHSpatialSplit::split_reference(BVHBuild *builder, BVHReference& left, BVH
 			right_bounds.grow(t);
 		}
 	}
+}
+
+void BVHSpatialSplit::split_curve_primitive(const Mesh *mesh,
+                                            const Transform *tfm,
+                                            int prim_index,
+                                            int segment_index,
+                                            int dim,
+                                            float pos,
+                                            BoundBox& left_bounds,
+                                            BoundBox& right_bounds)
+{
+	/* curve split: NOTE - Currently ignores curve width and needs to be fixed.*/
+	const int k0 = mesh->curves[prim_index].first_key + segment_index;
+	const int k1 = k0 + 1;
+	const float4& key0 = mesh->curve_keys[k0];
+	const float4& key1 = mesh->curve_keys[k1];
+	float3 v0 = float4_to_float3(key0);
+	float3 v1 = float4_to_float3(key1);
+
+	if(tfm != NULL) {
+		v0 = transform_point(tfm, v0);
+		v1 = transform_point(tfm, v1);
+	}
+
+	float v0p = v0[dim];
+	float v1p = v1[dim];
+
+	/* insert vertex to the boxes it belongs to. */
+	if(v0p <= pos)
+		left_bounds.grow(v0);
+
+	if(v0p >= pos)
+		right_bounds.grow(v0);
+
+	if(v1p <= pos)
+		left_bounds.grow(v1);
+
+	if(v1p >= pos)
+		right_bounds.grow(v1);
+
+	/* edge intersects the plane => insert intersection to both boxes. */
+	if((v0p < pos && v1p > pos) || (v0p > pos && v1p < pos)) {
+		float3 t = lerp(v0, v1, clamp((pos - v0p) / (v1p - v0p), 0.0f, 1.0f));
+		left_bounds.grow(t);
+		right_bounds.grow(t);
+	}
+}
+
+void BVHSpatialSplit::split_triangle_reference(const BVHReference& ref,
+                                               const Mesh *mesh,
+                                               int dim,
+                                               float pos,
+                                               BoundBox& left_bounds,
+                                               BoundBox& right_bounds)
+{
+	split_triangle_primitive(mesh,
+	                         NULL,
+	                         ref.prim_index(),
+	                         dim,
+	                         pos,
+	                         left_bounds,
+	                         right_bounds);
+}
+
+void BVHSpatialSplit::split_curve_reference(const BVHReference& ref,
+                                            const Mesh *mesh,
+                                            int dim,
+                                            float pos,
+                                            BoundBox& left_bounds,
+                                            BoundBox& right_bounds)
+{
+	split_curve_primitive(mesh,
+	                      NULL,
+	                      ref.prim_index(),
+	                      PRIMITIVE_UNPACK_SEGMENT(ref.prim_type()),
+	                      dim,
+	                      pos,
+	                      left_bounds,
+	                      right_bounds);
+}
+
+void BVHSpatialSplit::split_object_reference(const Object *object,
+                                             int dim,
+                                             float pos,
+                                             BoundBox& left_bounds,
+                                             BoundBox& right_bounds)
+{
+	Mesh *mesh = object->mesh;
+	for(int tri_idx = 0; tri_idx < mesh->triangles.size(); ++tri_idx) {
+		split_triangle_primitive(mesh,
+		                         &object->tfm,
+		                         tri_idx,
+		                         dim,
+		                         pos,
+		                         left_bounds,
+		                         right_bounds);
+	}
+	for(int curve_idx = 0; curve_idx < mesh->curves.size(); ++curve_idx) {
+		Mesh::Curve &curve = mesh->curves[curve_idx];
+		for(int segment_idx = 0;
+		    segment_idx < curve.num_keys - 1;
+		    ++segment_idx)
+		{
+			split_curve_primitive(mesh,
+			                      &object->tfm,
+			                      curve_idx,
+			                      segment_idx,
+			                      dim,
+			                      pos,
+			                      left_bounds,
+			                      right_bounds);
+		}
+	}
+}
+
+void BVHSpatialSplit::split_reference(BVHBuild *builder,
+                                      BVHReference& left,
+                                      BVHReference& right,
+                                      const BVHReference& ref,
+                                      int dim,
+                                      float pos)
+{
+	/* initialize boundboxes */
+	BoundBox left_bounds = BoundBox::empty;
+	BoundBox right_bounds = BoundBox::empty;
+
+	/* loop over vertices/edges. */
+	Object *ob = builder->objects[ref.prim_object()];
+	const Mesh *mesh = ob->mesh;
+
+	if(ref.prim_type() & PRIMITIVE_ALL_TRIANGLE) {
+		split_triangle_reference(ref,
+		                         mesh,
+		                         dim,
+		                         pos,
+		                         left_bounds,
+		                         right_bounds);
+	}
+	else if(ref.prim_type() & PRIMITIVE_ALL_CURVE) {
+		split_curve_reference(ref,
+		                      mesh,
+		                      dim,
+		                      pos,
+		                      left_bounds,
+		                      right_bounds);
+	}
+	else {
+		split_object_reference(ob,
+		                       dim,
+		                       pos,
+		                       left_bounds,
+		                       right_bounds);
+	}
 
 	/* intersect with original bounds. */
 	left_bounds.max[dim] = pos;
diff --git a/intern/cycles/bvh/bvh_split.h b/intern/cycles/bvh/bvh_split.h
index 5b739311e5f..1e46bb66203 100644
--- a/intern/cycles/bvh/bvh_split.h
+++ b/intern/cycles/bvh/bvh_split.h
@@ -55,7 +55,58 @@ public:
 	BVHSpatialSplit(BVHBuild *builder, const BVHRange& range, float nodeSAH);
 
 	void split(BVHBuild *builder, BVHRange& left, BVHRange& right, const BVHRange& range);
-	void split_reference(BVHBuild *builder, BVHReference& left, BVHReference& right, const BVHReference& ref, int dim, float pos);
+	void split_reference(BVHBuild *builder,
+	                     BVHReference& left,
+	                     BVHReference& right,
+	                     const BVHReference& ref,
+	                     int dim,
+	                     float pos);
+
+protected:
+	/* Lower-level functions which calculate boundaries of left and right nodes
+	 * needed for spatial split.
+	 *
+	 * Operate directly on a primitive specified by its index, reused by higher
+	 * level splitting functions.
+	 */
+	void split_triangle_primitive(const Mesh *mesh,
+	                              const Transform *tfm,
+	                              int prim_index,
+	                              int dim,
+	                              float pos,
+	                              BoundBox& left_bounds,
+	                              BoundBox& right_bounds);
+	void split_curve_primitive(const Mesh *mesh,
+	                           const Transform *tfm,
+	                           int prim_index,
+	                           int segment_index,
+	                           int dim,
+	                           float pos,
+	                           BoundBox& left_bounds,
+	                           BoundBox& right_bounds);
+
+	/* Lower-level functions which calculate boundaries of left and right nodes
+	 * needed for spatial split.
+	 *
+	 * Operate on a BVHReference, internally use the lower level API functions.
+	 */
+	void split_triangle_reference(const BVHReference& ref,
+	                              const Mesh *mesh,
+	                              int dim,
+	                              float pos,
+	                              BoundBox& left_bounds,
+	                              BoundBox& right_bounds);
+	void split_curve_reference(const BVHReference& ref,
+	                           const Mesh *mesh,
+	                           int dim,
+	                           float pos,
+	                           BoundBox& left_bounds,
+	                           BoundBox& right_bounds);
+	void split_object_reference(const Object *object,
+	                            int dim,
+	                            float pos,
+	                            BoundBox& left_bounds,
+	                            BoundBox& right_bounds);
 };
 
 /* Mixed Object-Spatial Split */
diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake
index bb174d6df26..10a166b6e44 100644
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -1,3 +1,11 @@
+###########################################################################
+# Precompiled libraries tips and hints, for find_package().
+
+if(CYCLES_STANDALONE_REPOSITORY)
+	if(APPLE OR WIN32)
+		include(precompiled_libs)
+	endif()
+endif()
 
 ###########################################################################
 # GLUT
@@ -14,10 +22,16 @@ if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI)
 	)
 endif()
 
-if(WITH_SYSTEM_GLEW)
-	set(CYCLES_GLEW_LIBRARY ${GLEW_LIBRARY})
-else()
-	set(CYCLES_GLEW_LIBRARY extern_glew)
+###########################################################################
+# GLEW
+
+# Workaround for unconventional variable name use in Blender.
+if(NOT CYCLES_STANDALONE_REPOSITORY)
+	set(GLEW_INCLUDE_DIR "${GLEW_INCLUDE_PATH}")
+endif()
+
+if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI)
+	set(CYCLES_APP_GLEW_LIBRARY ${BLENDER_GLEW_LIBRARIES})
 endif()
 
 ###########################################################################
@@ -33,3 +47,97 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	endif()
 endif()
 
+# Packages which are found by Blender when building from inside the Blender
+# source code, but which we need to take care of when building Cycles from a
+# standalone repository.
+if(CYCLES_STANDALONE_REPOSITORY)
+	# PThreads
+	# TODO(sergey): Bloody exception, handled in precompiled_libs.cmake.
+	if(NOT WIN32)
+		set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
+		find_package(Threads REQUIRED)
+		set(PTHREADS_LIBRARIES ${CMAKE_THREAD_LIBS_INIT})
+	endif()
+
+	####
+	# OpenGL
+
+	# TODO(sergey): We currently re-use the same variable name as we use
+	# in Blender. Ideally we need to make it CYCLES_GL_LIBRARIES.
+	find_package(OpenGL REQUIRED)
+	find_package(GLEW REQUIRED)
+	list(APPEND BLENDER_GL_LIBRARIES
+		"${OPENGL_gl_LIBRARY}"
+		"${OPENGL_glu_LIBRARY}"
+		"${GLEW_LIBRARY}"
+	)
+
+	####
+	# OpenImageIO (when it bundles PugiXML, use the headers in its OpenImageIO/ subdirectory)
+	find_package(OpenImageIO REQUIRED)
+	if(OPENIMAGEIO_PUGIXML_FOUND)
+		set(PUGIXML_INCLUDE_DIR "${OPENIMAGEIO_INCLUDE_DIR}/OpenImageIO")
+		set(PUGIXML_LIBRARIES "")
+	else()
+		find_package(PugiXML REQUIRED)
+	endif()
+
+	# OIIO usually depends on OpenEXR, so find this library
+	# but don't make it required.
+	find_package(OpenEXR)
+
+	####
+	# OpenShadingLanguage
+	if(WITH_CYCLES_OSL)
+		find_package(OpenShadingLanguage REQUIRED)
+		find_package(LLVM REQUIRED)
+	endif()
+
+	####
+	# Boost
+	set(__boost_packages filesystem regex system thread date_time)
+	if(WITH_CYCLES_NETWORK)
+		list(APPEND __boost_packages serialization)
+	endif()
+	if(WITH_CYCLES_OSL)
+		# TODO(sergey): This is because of the way our precompiled
+		# libraries work, could be different for someone else's libs.
+		if(APPLE OR MSVC)
+			list(APPEND __boost_packages wave)
+		elseif(NOT (${OSL_LIBRARY_VERSION_MAJOR} EQUAL "1" AND ${OSL_LIBRARY_VERSION_MINOR} LESS "6"))
+			list(APPEND __boost_packages wave)
+		endif()
+	endif()
+	find_package(Boost 1.48 COMPONENTS ${__boost_packages})
+	if(NOT Boost_FOUND)
+		# Retry without the -mt suffix: it says nothing about thread safety,
+		# so either flavour works. REQUIRED lives on this last attempt only;
+		# on the first call it would abort before the fallback could run.
+		set(Boost_USE_MULTITHREADED OFF)
+		find_package(Boost 1.48 COMPONENTS ${__boost_packages} REQUIRED)
+	endif()
+	unset(__boost_packages)
+	set(BOOST_INCLUDE_DIR ${Boost_INCLUDE_DIRS})
+	set(BOOST_LIBRARIES ${Boost_LIBRARIES})
+	set(BOOST_LIBPATH ${Boost_LIBRARY_DIRS})
+	set(BOOST_DEFINITIONS "-DBOOST_ALL_NO_LIB")
+
+	####
+	# Logging
+	if(WITH_CYCLES_LOGGING)
+		find_package(Glog REQUIRED)
+		find_package(Gflags REQUIRED)
+	endif()
+
+	unset(_lib_DIR)
+else()
+	if(WIN32)
+		set(GLOG_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/libmv/third_party/glog/src/windows)
+		set(GFLAGS_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/libmv/third_party/gflags)
+	else()
+		set(GLOG_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/libmv/third_party/glog/src)
+		set(GFLAGS_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/libmv/third_party/gflags)
+	endif()
+	set(GFLAGS_NAMESPACE "gflags")
+	set(LLVM_LIBRARIES ${LLVM_LIBRARY})
+endif()
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index a62ce29f722..b0fa283c1d8 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -6,11 +6,11 @@ set(INC
 	../kernel/osl
 	../util
 	../render
+	../../glew-mx
 )
 
 set(INC_SYS
-	${OPENGL_INCLUDE_DIR}
-	${GLEW_INCLUDE_PATH}
+	${GLEW_INCLUDE_DIR}
 	../../../extern/cuew/include
 	../../../extern/clew/include
 )
@@ -38,7 +38,7 @@ set(SRC_HEADERS
 	device_task.h
 )
 
-add_definitions(-DGLEW_STATIC)
+add_definitions(${GL_DEFINITIONS})
 
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index efdfa98cfb5..fc9959e0b48 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include <stdlib.h>
@@ -20,9 +20,6 @@
 #include "device.h"
 #include "device_intern.h"
 
-#include "cuew.h"
-#include "clew.h"
-
 #include "util_debug.h"
 #include "util_foreach.h"
 #include "util_half.h"
@@ -34,8 +31,38 @@
 
 CCL_NAMESPACE_BEGIN
 
+/* Device Requested Features */
+
+std::ostream& operator <<(std::ostream &os,
+                          const DeviceRequestedFeatures& requested_features)
+{
+	os << "Experimental features: "
+	   << (requested_features.experimental ? "On" : "Off") << std::endl;
+	os << "Max closure count: " << requested_features.max_closure << std::endl;
+	os << "Max nodes group: " << requested_features.max_nodes_group << std::endl;
+	/* TODO(sergey): Decode bitflag into list of names. */
+	os << "Nodes features: " << requested_features.nodes_features << std::endl;
+	/* TODO(sergey): Make it utility function to convert bool to string. */
+	os << "Use hair: "
+	   << (requested_features.use_hair ? "True" : "False")  << std::endl;
+	os << "Use object motion: "
+	   << (requested_features.use_object_motion ? "True" : "False")  << std::endl;
+	os << "Use camera motion: "
+	   << (requested_features.use_camera_motion ? "True" : "False")  << std::endl;
+	os << "Use Baking: "
+	   << (requested_features.use_baking ? "True" : "False")  << std::endl;
+	return os;
+}
+
 /* Device */
 
+Device::~Device()
+{
+	if(!background && vertex_buffer != 0) {
+		glDeleteBuffers(1, &vertex_buffer);
+	}
+}
+
 void Device::pixels_alloc(device_memory& mem)
 {
 	mem_alloc(mem, MEM_READ_WRITE);
@@ -54,7 +81,7 @@ void Device::pixels_free(device_memory& mem)
 	mem_free(mem);
 }
 
-void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int width, int height, bool transparent,
+void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dx, int dy, int width, int height, bool transparent,
 	const DeviceDrawParams &draw_params)
 {
 	pixels_copy_from(rgba, y, w, h);
@@ -70,6 +97,9 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int w
 		/* for multi devices, this assumes the inefficient method that we allocate
 		 * all pixels on the device even though we only render to a subset */
 		GLhalf *data_pointer = (GLhalf*)rgba.data_pointer;
+		float vbuffer[16], *basep;
+		float *vp = NULL;
+
 		data_pointer += 4*y*w;
 
 		/* draw half float texture, GLSL shader for display transform assumed to be bound */
@@ -86,23 +116,63 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int w
 			draw_params.bind_display_space_shader_cb();
 		}
 
-		glPushMatrix();
-		glTranslatef(0.0f, (float)dy, 0.0f);
+		if(GLEW_VERSION_1_5) {
+			if(!vertex_buffer)
+				glGenBuffers(1, &vertex_buffer);
+
+			glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
+			/* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */
+			glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
+
+			vp = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
+
+			basep = NULL;
+		}
+		else {
+			basep = vbuffer;
+			vp = vbuffer;
+		}
+
+		if(vp) {
+			/* texture coordinate - vertex pair */
+			vp[0] = 0.0f;
+			vp[1] = 0.0f;
+			vp[2] = dx;
+			vp[3] = dy;
+
+			vp[4] = 1.0f;
+			vp[5] = 0.0f;
+			vp[6] = (float)width + dx;
+			vp[7] = dy;
+
+			vp[8] = 1.0f;
+			vp[9] = 1.0f;
+			vp[10] = (float)width + dx;
+			vp[11] = (float)height + dy;
+
+			vp[12] = 0.0f;
+			vp[13] = 1.0f;
+			vp[14] = dx;
+			vp[15] = (float)height + dy;
+
+			if(vertex_buffer)
+				glUnmapBuffer(GL_ARRAY_BUFFER);
+		}
+
+		glTexCoordPointer(2, GL_FLOAT, 4 * sizeof(float), basep);
+		glVertexPointer(2, GL_FLOAT, 4 * sizeof(float), ((char *)basep) + 2 * sizeof(float));
 
-		glBegin(GL_QUADS);
-		
-		glTexCoord2f(0.0f, 0.0f);
-		glVertex2f(0.0f, 0.0f);
-		glTexCoord2f(1.0f, 0.0f);
-		glVertex2f((float)width, 0.0f);
-		glTexCoord2f(1.0f, 1.0f);
-		glVertex2f((float)width, (float)height);
-		glTexCoord2f(0.0f, 1.0f);
-		glVertex2f(0.0f, (float)height);
+		glEnableClientState(GL_VERTEX_ARRAY);
+		glEnableClientState(GL_TEXTURE_COORD_ARRAY);
 
-		glEnd();
+		glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
 
-		glPopMatrix();
+		glDisableClientState(GL_TEXTURE_COORD_ARRAY);
+		glDisableClientState(GL_VERTEX_ARRAY);
+
+		if(vertex_buffer) {
+			glBindBuffer(GL_ARRAY_BUFFER, 0);
+		}
 
 		if(draw_params.unbind_display_space_shader_cb) {
 			draw_params.unbind_display_space_shader_cb();
@@ -116,7 +186,7 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int w
 		/* fallback for old graphics cards that don't support GLSL, half float,
 		 * and non-power-of-two textures */
 		glPixelZoom((float)width/(float)w, (float)height/(float)h);
-		glRasterPos2f(0, dy);
+		glRasterPos2f(dx, dy);
 
 		uint8_t *pixels = (uint8_t*)rgba.data_pointer;
 
@@ -268,5 +338,25 @@ vector<DeviceInfo>& Device::available_devices()
 	return devices;
 }
 
-CCL_NAMESPACE_END
+string Device::device_capabilities()
+{
+	string capabilities = "CPU device capabilities: ";
+	capabilities += device_cpu_capabilities() + "\n";
+#ifdef WITH_CUDA
+	if(device_cuda_init()) {
+		capabilities += "\nCUDA device capabilities:\n";
+		capabilities += device_cuda_capabilities();
+	}
+#endif
+
+#ifdef WITH_OPENCL
+	if(device_opencl_init()) {
+		capabilities += "\nOpenCL device capabilities:\n";
+		capabilities += device_opencl_capabilities();
+	}
+#endif
+
+	return capabilities;
+}
 
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 20ebfd391d6..3c0fb880948 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __DEVICE_H__
@@ -55,6 +55,7 @@ public:
 	bool advanced_shading;
 	bool pack_images;
 	bool extended_images; /* flag for GPU and Multi device */
+	bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */
 	vector<DeviceInfo> multi_devices;
 
 	DeviceInfo()
@@ -66,25 +67,88 @@ public:
 		advanced_shading = true;
 		pack_images = false;
 		extended_images = false;
+		use_split_kernel = false;
 	}
 };
 
+class DeviceRequestedFeatures {
+public:
+	/* Use experimental feature set. */
+	bool experimental;
+
+	/* Maximum number of closures in shader trees. */
+	int max_closure;
+
+	/* Selective nodes compilation. */
+
+	/* Identifier of a node group up to which all the nodes need to be
+	 * compiled in. Nodes from higher group indices will be ignored.
+	 */
+	int max_nodes_group;
+
+	/* Features bitfield indicating which features from the requested group
+	 * will be compiled in. Nodes which correspond to features which are not
+	 * in this bitfield will be ignored even if they're in the requested group.
+	 */
+	int nodes_features;
+
+	/* BVH/sampling kernel features. */
+	bool use_hair;
+	bool use_object_motion;
+	bool use_camera_motion;
+
+	/* Denotes whether baking functionality is needed. */
+	bool use_baking;
+
+	DeviceRequestedFeatures()
+	{
+		/* TODO(sergey): Find more meaningful defaults. */
+		experimental = false;
+		max_closure = 0;
+		max_nodes_group = 0;
+		nodes_features = 0;
+		use_hair = false;
+		use_object_motion = false;
+		use_camera_motion = false;
+		use_baking = false;
+	}
+
+	bool modified(const DeviceRequestedFeatures& requested_features)
+	{
+		return !(experimental == requested_features.experimental &&
+		         max_closure == requested_features.max_closure &&
+		         max_nodes_group == requested_features.max_nodes_group &&
+		         nodes_features == requested_features.nodes_features &&
+		         use_hair == requested_features.use_hair &&
+		         use_object_motion == requested_features.use_object_motion &&
+		         use_camera_motion == requested_features.use_camera_motion &&
+		         use_baking == requested_features.use_baking);
+	}
+
+};
+
+std::ostream& operator <<(std::ostream &os,
+                          const DeviceRequestedFeatures& requested_features);
+
 /* Device */
 
 struct DeviceDrawParams {
-	boost::function<void(void)> bind_display_space_shader_cb;
-	boost::function<void(void)> unbind_display_space_shader_cb;
+	function<void(void)> bind_display_space_shader_cb;
+	function<void(void)> unbind_display_space_shader_cb;
 };
 
 class Device {
 protected:
-	Device(DeviceInfo& info_, Stats &stats_, bool background) : background(background), info(info_), stats(stats_) {}
+	Device(DeviceInfo& info_, Stats &stats_, bool background) : background(background), vertex_buffer(0), info(info_), stats(stats_) {}
 
 	bool background;
 	string error_msg;
 
+	/* used for real time display */
+	unsigned int vertex_buffer;
+
 public:
-	virtual ~Device() {}
+	virtual ~Device();
 
 	/* info */
 	DeviceInfo info;
@@ -106,9 +170,15 @@ public:
 	virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
 
 	/* texture memory */
-	virtual void tex_alloc(const char *name, device_memory& mem,
-		InterpolationType interpolation = INTERPOLATION_NONE, bool periodic = false) {};
-	virtual void tex_free(device_memory& mem) {};
+	virtual void tex_alloc(const char * /*name*/,
+	                       device_memory& /*mem*/,
+	                       InterpolationType interpolation = INTERPOLATION_NONE,
+	                       ExtensionType extension = EXTENSION_REPEAT)
+	{
+		(void)interpolation;  /* Ignored. */
+		(void)extension;  /* Ignored. */
+	};
+	virtual void tex_free(device_memory& /*mem*/) {};
 
 	/* pixel memory */
 	virtual void pixels_alloc(device_memory& mem);
@@ -119,7 +189,9 @@ public:
 	virtual void *osl_memory() { return NULL; }
 
 	/* load/compile kernels, must be called before adding tasks */ 
-	virtual bool load_kernels(bool experimental) { return true; }
+	virtual bool load_kernels(
+	        const DeviceRequestedFeatures& /*requested_features*/)
+	{ return true; }
 
 	/* tasks */
 	virtual int get_split_task_count(DeviceTask& task) = 0;
@@ -129,7 +201,7 @@ public:
 	
 	/* opengl drawing */
 	virtual void draw_pixels(device_memory& mem, int y, int w, int h,
-		int dy, int width, int height, bool transparent,
+		int dx, int dy, int width, int height, bool transparent,
 		const DeviceDrawParams &draw_params);
 
 #ifdef WITH_NETWORK
@@ -138,8 +210,8 @@ public:
 #endif
 
 	/* multi device */
-	virtual void map_tile(Device *sub_device, RenderTile& tile) {}
-	virtual int device_number(Device *sub_device) { return 0; }
+	virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {}
+	virtual int device_number(Device * /*sub_device*/) { return 0; }
 
 	/* static */
 	static Device *create(DeviceInfo& info, Stats &stats, bool background = true);
@@ -148,6 +220,7 @@ public:
 	static string string_from_type(DeviceType type);
 	static vector<DeviceType>& available_types();
 	static vector<DeviceInfo>& available_devices();
+	static string device_capabilities();
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 4623764d210..f06963c146e 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -11,12 +11,26 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include <stdlib.h>
 #include <string.h>
 
+/* So ImathMath is included before our kernel_cpu_compat. */
+#ifdef WITH_OSL
+#  if defined(_MSC_VER)
+/* Prevent OSL from polluting the context with weird macros from windows.h.
+ * TODO(sergey): Ideally it's only enough to have class/struct declarations in
+ * the header and skip header include here.
+ */
+#    define NOGDI
+#    define NOMINMAX
+#    define WIN32_LEAN_AND_MEAN
+#  endif
+#  include <OSL/oslexec.h>
+#endif
+
 #include "device.h"
 #include "device_intern.h"
 
@@ -33,6 +47,7 @@
 #include "util_debug.h"
 #include "util_foreach.h"
 #include "util_function.h"
+#include "util_logging.h"
 #include "util_opengl.h"
 #include "util_progress.h"
 #include "util_system.h"
@@ -70,19 +85,21 @@ public:
 		task_pool.stop();
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType type)
+	void mem_alloc(device_memory& mem, MemoryType /*type*/)
 	{
 		mem.device_pointer = mem.data_pointer;
 		mem.device_size = mem.memory_size();
 		stats.mem_alloc(mem.device_size);
 	}
 
-	void mem_copy_to(device_memory& mem)
+	void mem_copy_to(device_memory& /*mem*/)
 	{
 		/* no-op */
 	}
 
-	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
+	void mem_copy_from(device_memory& /*mem*/,
+	                   int /*y*/, int /*w*/, int /*h*/,
+	                   int /*elem*/)
 	{
 		/* no-op */
 	}
@@ -106,9 +123,20 @@ public:
 		kernel_const_copy(&kernel_globals, name, host, size);
 	}
 
-	void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic)
+	void tex_alloc(const char *name,
+	               device_memory& mem,
+	               InterpolationType interpolation,
+	               ExtensionType extension)
 	{
-		kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height, mem.data_depth, interpolation);
+		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
+		kernel_tex_copy(&kernel_globals,
+		                name,
+		                mem.data_pointer,
+		                mem.data_width,
+		                mem.data_height,
+		                mem.data_depth,
+		                interpolation,
+		                extension);
 		mem.device_pointer = mem.data_pointer;
 		mem.device_size = mem.memory_size();
 		stats.mem_alloc(mem.device_size);
@@ -165,141 +193,58 @@ public:
 #endif
 
 		RenderTile tile;
-		
-		while(task.acquire_tile(this, tile)) {
-			float *render_buffer = (float*)tile.buffer;
-			uint *rng_state = (uint*)tile.rng_state;
-			int start_sample = tile.start_sample;
-			int end_sample = tile.start_sample + tile.num_samples;
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-			if(system_cpu_support_avx2()) {
-				for(int sample = start_sample; sample < end_sample; sample++) {
-					if (task.get_cancel() || task_pool.canceled()) {
-						if(task.need_finish_queue == false)
-							break;
-					}
-
-					for(int y = tile.y; y < tile.y + tile.h; y++) {
-						for(int x = tile.x; x < tile.x + tile.w; x++) {
-							kernel_cpu_avx2_path_trace(&kg, render_buffer, rng_state,
-													  sample, x, y, tile.offset, tile.stride);
-						}
-					}
 
-					tile.sample = sample + 1;
+		void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);
 
-					task.update_progress(&tile);
-				}
-			}
-			else
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+		if(system_cpu_support_avx2())
+			path_trace_kernel = kernel_cpu_avx2_path_trace;
+		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-			if(system_cpu_support_avx()) {
-				for(int sample = start_sample; sample < end_sample; sample++) {
-					if (task.get_cancel() || task_pool.canceled()) {
-						if(task.need_finish_queue == false)
-							break;
-					}
-
-					for(int y = tile.y; y < tile.y + tile.h; y++) {
-						for(int x = tile.x; x < tile.x + tile.w; x++) {
-							kernel_cpu_avx_path_trace(&kg, render_buffer, rng_state,
-								sample, x, y, tile.offset, tile.stride);
-						}
-					}
-
-					tile.sample = sample + 1;
-
-					task.update_progress(&tile);
-				}
-			}
-			else
+		if(system_cpu_support_avx())
+			path_trace_kernel = kernel_cpu_avx_path_trace;
+		else
 #endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
-			if(system_cpu_support_sse41()) {
-				for(int sample = start_sample; sample < end_sample; sample++) {
-					if (task.get_cancel() || task_pool.canceled()) {
-						if(task.need_finish_queue == false)
-							break;
-					}
-
-					for(int y = tile.y; y < tile.y + tile.h; y++) {
-						for(int x = tile.x; x < tile.x + tile.w; x++) {
-							kernel_cpu_sse41_path_trace(&kg, render_buffer, rng_state,
-								sample, x, y, tile.offset, tile.stride);
-						}
-					}
-
-					tile.sample = sample + 1;
-
-					task.update_progress(&tile);
-				}
-			}
-			else
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+		if(system_cpu_support_sse41())
+			path_trace_kernel = kernel_cpu_sse41_path_trace;
+		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-			if(system_cpu_support_sse3()) {
-				for(int sample = start_sample; sample < end_sample; sample++) {
-					if (task.get_cancel() || task_pool.canceled()) {
-						if(task.need_finish_queue == false)
-							break;
-					}
-
-					for(int y = tile.y; y < tile.y + tile.h; y++) {
-						for(int x = tile.x; x < tile.x + tile.w; x++) {
-							kernel_cpu_sse3_path_trace(&kg, render_buffer, rng_state,
-								sample, x, y, tile.offset, tile.stride);
-						}
-					}
-
-					tile.sample = sample + 1;
-
-					task.update_progress(&tile);
-				}
-			}
-			else
+		if(system_cpu_support_sse3())
+			path_trace_kernel = kernel_cpu_sse3_path_trace;
+		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-			if(system_cpu_support_sse2()) {
-				for(int sample = start_sample; sample < end_sample; sample++) {
-					if (task.get_cancel() || task_pool.canceled()) {
-						if(task.need_finish_queue == false)
-							break;
-					}
-
-					for(int y = tile.y; y < tile.y + tile.h; y++) {
-						for(int x = tile.x; x < tile.x + tile.w; x++) {
-							kernel_cpu_sse2_path_trace(&kg, render_buffer, rng_state,
-								sample, x, y, tile.offset, tile.stride);
-						}
-					}
-
-					tile.sample = sample + 1;
+		if(system_cpu_support_sse2())
+			path_trace_kernel = kernel_cpu_sse2_path_trace;
+		else
+#endif
+			path_trace_kernel = kernel_cpu_path_trace;
+		
+		while(task.acquire_tile(this, tile)) {
+			float *render_buffer = (float*)tile.buffer;
+			uint *rng_state = (uint*)tile.rng_state;
+			int start_sample = tile.start_sample;
+			int end_sample = tile.start_sample + tile.num_samples;
 
-					task.update_progress(&tile);
+			for(int sample = start_sample; sample < end_sample; sample++) {
+				if(task.get_cancel() || task_pool.canceled()) {
+					if(task.need_finish_queue == false)
+						break;
 				}
-			}
-			else
-#endif
-			{
-				for(int sample = start_sample; sample < end_sample; sample++) {
-					if (task.get_cancel() || task_pool.canceled()) {
-						if(task.need_finish_queue == false)
-							break;
-					}
 
-					for(int y = tile.y; y < tile.y + tile.h; y++) {
-						for(int x = tile.x; x < tile.x + tile.w; x++) {
-							kernel_cpu_path_trace(&kg, render_buffer, rng_state,
-								sample, x, y, tile.offset, tile.stride);
-						}
+				for(int y = tile.y; y < tile.y + tile.h; y++) {
+					for(int x = tile.x; x < tile.x + tile.w; x++) {
+						path_trace_kernel(&kg, render_buffer, rng_state,
+						                  sample, x, y, tile.offset, tile.stride);
 					}
+				}
 
-					tile.sample = sample + 1;
+				tile.sample = sample + 1;
 
-					task.update_progress(&tile);
-				}
+				task.update_progress(&tile);
 			}
 
 			task.release_tile(tile);
@@ -320,110 +265,74 @@ public:
 		float sample_scale = 1.0f/(task.sample + 1);
 
 		if(task.rgba_half) {
+			void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-			if(system_cpu_support_avx2()) {
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_avx2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
-															 sample_scale, x, y, task.offset, task.stride);
-			}
+			if(system_cpu_support_avx2())
+				convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float;
 			else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-			if(system_cpu_support_avx()) {
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_avx_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+			/* Only select the AVX kernel here; the shared pixel loop below invokes it. */
+			if(system_cpu_support_avx())
+				convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float;
 			else
 #endif	
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
-			if(system_cpu_support_sse41()) {
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_sse41_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+			if(system_cpu_support_sse41())
+				convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float;
 			else
 #endif		
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3		
-			if(system_cpu_support_sse3()) {
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_sse3_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+			if(system_cpu_support_sse3())
+				convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float;
 			else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-			if(system_cpu_support_sse2()) {
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_sse2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+			if(system_cpu_support_sse2())
+				convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float;
 			else
 #endif
-			{
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+				convert_to_half_float_kernel = kernel_cpu_convert_to_half_float;
+
+			for(int y = task.y; y < task.y + task.h; y++)
+				for(int x = task.x; x < task.x + task.w; x++)
+					convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+						sample_scale, x, y, task.offset, task.stride);
 		}
 		else {
+			void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-			if(system_cpu_support_avx2()) {
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_avx2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
-													   sample_scale, x, y, task.offset, task.stride);
-			}
+			if(system_cpu_support_avx2())
+				convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte;
 			else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-			if(system_cpu_support_avx()) {
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_avx_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+			if(system_cpu_support_avx())
+				convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte;
 			else
 #endif		
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
-			if(system_cpu_support_sse41()) {
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_sse41_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+			if(system_cpu_support_sse41())
+				convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte;
 			else
 #endif			
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-			if(system_cpu_support_sse3()) {
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_sse3_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+			if(system_cpu_support_sse3())
+				convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte;
 			else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-			if(system_cpu_support_sse2()) {
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_sse2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+			if(system_cpu_support_sse2())
+				convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte;
 			else
 #endif
-			{
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+				convert_to_byte_kernel = kernel_cpu_convert_to_byte;
+
+			for(int y = task.y; y < task.y + task.h; y++)
+				for(int x = task.x; x < task.x + task.w; x++)
+					convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+						sample_scale, x, y, task.offset, task.stride);
+
 		}
 	}
 
@@ -434,93 +343,45 @@ public:
 #ifdef WITH_OSL
 		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
 #endif
+		void(*shader_kernel)(KernelGlobals*, uint4*, float4*, int, int, int, int);
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-		if(system_cpu_support_avx2()) {
-			for(int sample = 0; sample < task.num_samples; sample++) {
-				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_avx2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
-					    task.shader_eval_type, x, task.offset, sample);
-
-				if(task.get_cancel() || task_pool.canceled())
-					break;
-
-				task.update_progress(NULL);
-			}
-		}
+		if(system_cpu_support_avx2())
+			shader_kernel = kernel_cpu_avx2_shader;
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-		if(system_cpu_support_avx()) {
-			for(int sample = 0; sample < task.num_samples; sample++) {
-				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
-					    task.shader_eval_type, x, task.offset, sample);
-
-				if(task.get_cancel() || task_pool.canceled())
-					break;
-
-				task.update_progress(NULL);
-			}
-		}
+		if(system_cpu_support_avx())
+			shader_kernel = kernel_cpu_avx_shader;
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
-		if(system_cpu_support_sse41()) {
-			for(int sample = 0; sample < task.num_samples; sample++) {
-				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
-					    task.shader_eval_type, x, task.offset, sample);
-
-				if(task.get_cancel() || task_pool.canceled())
-					break;
-
-				task.update_progress(NULL);
-			}
-		}
+		if(system_cpu_support_sse41())
+			shader_kernel = kernel_cpu_sse41_shader;
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-		if(system_cpu_support_sse3()) {
-			for(int sample = 0; sample < task.num_samples; sample++) {
-				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
-					    task.shader_eval_type, x, task.offset, sample);
-
-				if(task.get_cancel() || task_pool.canceled())
-					break;
-
-				task.update_progress(NULL);
-			}
-		}
+		if(system_cpu_support_sse3())
+			shader_kernel = kernel_cpu_sse3_shader;
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-		if(system_cpu_support_sse2()) {
-			for(int sample = 0; sample < task.num_samples; sample++) {
-				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
-					    task.shader_eval_type, x, task.offset, sample);
-
-				if(task.get_cancel() || task_pool.canceled())
-					break;
-
-				task.update_progress(NULL);
-			}
-		}
+		if(system_cpu_support_sse2())
+			shader_kernel = kernel_cpu_sse2_shader;
 		else
 #endif
-		{
-			for(int sample = 0; sample < task.num_samples; sample++) {
-				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
-					    task.shader_eval_type, x, task.offset, sample);
+			shader_kernel = kernel_cpu_shader;
 
-				if(task.get_cancel() || task_pool.canceled())
-					break;
+		for(int sample = 0; sample < task.num_samples; sample++) {
+			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
+				shader_kernel(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+					task.shader_eval_type, x, task.offset, sample);
+
+			if(task.get_cancel() || task_pool.canceled())
+				break;
+
+			task.update_progress(NULL);
 
-				task.update_progress(NULL);
-			}
 		}
 
 #ifdef WITH_OSL
@@ -530,7 +391,7 @@ public:
 
 	int get_split_task_count(DeviceTask& task)
 	{
-		if (task.type == DeviceTask::SHADER)
+		if(task.type == DeviceTask::SHADER)
 			return task.get_subtask_count(TaskScheduler::num_threads(), 256);
 		else
 			return task.get_subtask_count(TaskScheduler::num_threads());
@@ -580,5 +441,22 @@ void device_cpu_info(vector<DeviceInfo>& devices)
 	devices.insert(devices.begin(), info);
 }
 
-CCL_NAMESPACE_END
+/* Build a space-separated list of the SIMD instruction sets reported by the
+ * system_cpu_support_*() queries, e.g. "SSE2 SSE3 SSE41".
+ * The returned string is empty when none of the queries succeeds. */
+string device_cpu_capabilities(void)
+{
+	string capabilities = "";
+	capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
+	capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
+	capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
+	capabilities += system_cpu_support_avx() ? "AVX " : "";
+	capabilities += system_cpu_support_avx2() ? "AVX2" : "";
+	/* Strip the trailing separator. The emptiness check is required:
+	 * size() - 1 wraps around on an empty string. */
+	if(!capabilities.empty() && capabilities[capabilities.size() - 1] == ' ')
+		capabilities.resize(capabilities.size() - 1);
+	return capabilities;
+}
 
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 5de2efab8be..a47d4edeb56 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include <stdio.h>
@@ -25,9 +25,11 @@
 
 #include "cuew.h"
 #include "util_debug.h"
+#include "util_logging.h"
 #include "util_map.h"
 #include "util_opengl.h"
 #include "util_path.h"
+#include "util_string.h"
 #include "util_system.h"
 #include "util_types.h"
 #include "util_time.h"
@@ -76,7 +78,7 @@ public:
 	{
 		if(first_error) {
 			fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
-			fprintf(stderr, "http://wiki.blender.org/index.php/Doc:2.6/Manual/Render/Cycles/GPU_Rendering\n\n");
+			fprintf(stderr, "http://www.blender.org/manual/render/cycles/gpu_rendering.html\n\n");
 			first_error = false;
 		}
 	}
@@ -183,7 +185,7 @@ public:
 		cuda_assert(cuCtxDestroy(cuContext));
 	}
 
-	bool support_device(bool experimental)
+	bool support_device(bool /*experimental*/)
 	{
 		int major, minor;
 		cuDeviceComputeCapability(&major, &minor, cuDevId);
@@ -202,15 +204,18 @@ public:
 		/* compute cubin name */
 		int major, minor;
 		cuDeviceComputeCapability(&major, &minor, cuDevId);
+		string cubin;
 
 		/* attempt to use kernel provided with blender */
-		string cubin;
 		if(experimental)
 			cubin = path_get(string_printf("lib/kernel_experimental_sm_%d%d.cubin", major, minor));
 		else
 			cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
-		if(path_exists(cubin))
+		VLOG(1) << "Testing for pre-compiled kernel " << cubin;
+		if(path_exists(cubin)) {
+			VLOG(1) << "Using precompiled kernel";
 			return cubin;
+		}
 
 		/* not found, try to use locally compiled kernel */
 		string kernel_path = path_get("kernel");
@@ -221,10 +226,12 @@ public:
 		else
 			cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
 		cubin = path_user_get(path_join("cache", cubin));
-
+		VLOG(1) << "Testing for locally compiled kernel " << cubin;
 		/* if exists already, use it */
-		if(path_exists(cubin))
+		if(path_exists(cubin)) {
+			VLOG(1) << "Using locally compiled kernel";
 			return cubin;
+		}
 
 #ifdef _WIN32
 		if(have_precompiled_kernels()) {
@@ -245,6 +252,7 @@ public:
 		}
 
 		int cuda_version = cuewCompilerVersion();
+		VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << cuda_version;
 
 		if(cuda_version == 0) {
 			cuda_error_message("CUDA nvcc compiler version could not be parsed.");
@@ -258,7 +266,7 @@ public:
 			printf("CUDA version %d.%d detected, build may succeed but only CUDA 6.5 is officially supported.\n", cuda_version/10, cuda_version%10);
 
 		/* compile */
-		string kernel = path_join(kernel_path, "kernel.cu");
+		string kernel = path_join(kernel_path, path_join("kernels", path_join("cuda", "kernel.cu")));
 		string include = kernel_path;
 		const int machine = system_cpu_bits();
 
@@ -268,11 +276,20 @@ public:
 		path_create_directories(cubin);
 
 		string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
-			"-o \"%s\" --ptxas-options=\"-v\" -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
+			"-o \"%s\" --ptxas-options=\"-v\" --use_fast_math -I\"%s\" "
+			"-DNVCC -D__KERNEL_CUDA_VERSION__=%d",
 			nvcc, major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version);
 		
 		if(experimental)
-			command += " -D__KERNEL_CUDA_EXPERIMENTAL__";
+			command += " -D__KERNEL_EXPERIMENTAL__";
+
+		if(getenv("CYCLES_CUDA_EXTRA_CFLAGS")) {
+			command += string(" ") + getenv("CYCLES_CUDA_EXTRA_CFLAGS");
+		}
+
+#ifdef WITH_CYCLES_DEBUG
+		command += " -D__KERNEL_DEBUG__";
+#endif
 
 		printf("%s\n", command.c_str());
 
@@ -292,18 +309,18 @@ public:
 		return cubin;
 	}
 
-	bool load_kernels(bool experimental)
+	bool load_kernels(const DeviceRequestedFeatures& requested_features)
 	{
 		/* check if cuda init succeeded */
 		if(cuContext == 0)
 			return false;
 		
 		/* check if GPU is supported */
-		if(!support_device(experimental))
+		if(!support_device(requested_features.experimental))
 			return false;
 
 		/* get kernel */
-		string cubin = compile_kernel(experimental);
+		string cubin = compile_kernel(requested_features.experimental);
 
 		if(cubin == "")
 			return false;
@@ -314,7 +331,7 @@ public:
 		string cubin_data;
 		CUresult result;
 
-		if (path_read_text(cubin, cubin_data))
+		if(path_read_text(cubin, cubin_data))
 			result = cuModuleLoadData(&cuModule, cubin_data.c_str());
 		else
 			result = CUDA_ERROR_FILE_NOT_FOUND;
@@ -327,7 +344,7 @@ public:
 		return (result == CUDA_SUCCESS);
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType type)
+	void mem_alloc(device_memory& mem, MemoryType /*type*/)
 	{
 		cuda_push_context();
 		CUdeviceptr device_pointer;
@@ -355,7 +372,7 @@ public:
 		cuda_push_context();
 		if(mem.device_pointer) {
 			cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
-			                         (CUdeviceptr)((uchar*)mem.device_pointer + offset), size));
+			                         (CUdeviceptr)(mem.device_pointer + offset), size));
 		}
 		else {
 			memset((char*)mem.data_pointer + offset, 0, size);
@@ -399,9 +416,13 @@ public:
 		cuda_pop_context();
 	}
 
-	void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic)
+	void tex_alloc(const char *name,
+	               device_memory& mem,
+	               InterpolationType interpolation,
+	               ExtensionType extension)
 	{
 		/* todo: support 3D textures, only CPU for now */
+		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
 
 		/* determine format */
 		CUarray_format_enum format;
@@ -466,7 +487,7 @@ public:
 				if(interpolation == INTERPOLATION_CLOSEST) {
 					cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
 				}
-				else if (interpolation == INTERPOLATION_LINEAR) {
+				else if(interpolation == INTERPOLATION_LINEAR) {
 					cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
 				}
 				else {/* CUBIC and SMART are unsupported for CUDA */
@@ -492,13 +513,19 @@ public:
 				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
 			}
 
-			if(periodic) {
-				cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP));
-				cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP));
-			}
-			else {
-				cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP));
-				cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP));
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP));
+					cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP));
+					break;
+				case EXTENSION_EXTEND:
+					cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP));
+					cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP));
+					break;
+				case EXTENSION_CLIP:
+					cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_BORDER));
+					cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_BORDER));
+					break;
 			}
 			cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
 
@@ -862,11 +889,12 @@ public:
 		}
 	}
 
-	void draw_pixels(device_memory& mem, int y, int w, int h, int dy, int width, int height, bool transparent,
+	void draw_pixels(device_memory& mem, int y, int w, int h, int dx, int dy, int width, int height, bool transparent,
 		const DeviceDrawParams &draw_params)
 	{
 		if(!background) {
 			PixelMem pmem = pixel_mem_map[mem.device_pointer];
+			float *vpointer;
 
 			cuda_push_context();
 
@@ -900,23 +928,52 @@ public:
 				draw_params.bind_display_space_shader_cb();
 			}
 
-			glPushMatrix();
-			glTranslatef(0.0f, (float)dy, 0.0f);
-				
-			glBegin(GL_QUADS);
-			
-			glTexCoord2f(0.0f, 0.0f);
-			glVertex2f(0.0f, 0.0f);
-			glTexCoord2f((float)w/(float)pmem.w, 0.0f);
-			glVertex2f((float)width, 0.0f);
-			glTexCoord2f((float)w/(float)pmem.w, (float)h/(float)pmem.h);
-			glVertex2f((float)width, (float)height);
-			glTexCoord2f(0.0f, (float)h/(float)pmem.h);
-			glVertex2f(0.0f, (float)height);
+			if(!vertex_buffer)
+				glGenBuffers(1, &vertex_buffer);
+
+			glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
+			/* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */
+			glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
 
-			glEnd();
+			vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
 
-			glPopMatrix();
+			if(vpointer) {
+				/* texture coordinate - vertex pair */
+				vpointer[0] = 0.0f;
+				vpointer[1] = 0.0f;
+				vpointer[2] = dx;
+				vpointer[3] = dy;
+
+				vpointer[4] = (float)w/(float)pmem.w;
+				vpointer[5] = 0.0f;
+				vpointer[6] = (float)width + dx;
+				vpointer[7] = dy;
+
+				vpointer[8] = (float)w/(float)pmem.w;
+				vpointer[9] = (float)h/(float)pmem.h;
+				vpointer[10] = (float)width + dx;
+				vpointer[11] = (float)height + dy;
+
+				vpointer[12] = 0.0f;
+				vpointer[13] = (float)h/(float)pmem.h;
+				vpointer[14] = dx;
+				vpointer[15] = (float)height + dy;
+
+				glUnmapBuffer(GL_ARRAY_BUFFER);
+			}
+
+			glTexCoordPointer(2, GL_FLOAT, 4 * sizeof(float), 0);
+			glVertexPointer(2, GL_FLOAT, 4 * sizeof(float), (char *)NULL + 2 * sizeof(float));
+
+			glEnableClientState(GL_VERTEX_ARRAY);
+			glEnableClientState(GL_TEXTURE_COORD_ARRAY);
+
+			glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+			glDisableClientState(GL_TEXTURE_COORD_ARRAY);
+			glDisableClientState(GL_VERTEX_ARRAY);
+
+			glBindBuffer(GL_ARRAY_BUFFER, 0);
 
 			if(draw_params.unbind_display_space_shader_cb) {
 				draw_params.unbind_display_space_shader_cb();
@@ -933,7 +990,7 @@ public:
 			return;
 		}
 
-		Device::draw_pixels(mem, y, w, h, dy, width, height, transparent, draw_params);
+		Device::draw_pixels(mem, y, w, h, dx, dy, width, height, transparent, draw_params);
 	}
 
 	void thread_run(DeviceTask *task)
@@ -949,7 +1006,7 @@ public:
 				int end_sample = tile.start_sample + tile.num_samples;
 
 				for(int sample = start_sample; sample < end_sample; sample++) {
-					if (task->get_cancel()) {
+					if(task->get_cancel()) {
 						if(task->need_finish_queue == false)
 							break;
 					}
@@ -982,7 +1039,7 @@ public:
 		}
 	};
 
-	int get_split_task_count(DeviceTask& task)
+	int get_split_task_count(DeviceTask& /*task*/)
 	{
 		return 1;
 	}
@@ -1018,19 +1075,34 @@ bool device_cuda_init(void)
 	static bool initialized = false;
 	static bool result = false;
 
-	if (initialized)
+	if(initialized)
 		return result;
 
 	initialized = true;
-
-	if (cuewInit() == CUEW_SUCCESS) {
-		if(CUDADevice::have_precompiled_kernels())
+	int cuew_result = cuewInit();
+	if(cuew_result == CUEW_SUCCESS) {
+		VLOG(1) << "CUEW initialization succeeded";
+		if(CUDADevice::have_precompiled_kernels()) {
+			VLOG(1) << "Found precompiled kernels";
 			result = true;
+		}
 #ifndef _WIN32
-		else if(cuewCompilerPath() != NULL)
+		else if(cuewCompilerPath() != NULL) {
+			VLOG(1) << "Found CUDA compiler " << cuewCompilerPath();
 			result = true;
+		}
+		else {
+			VLOG(1) << "Neither precompiled kernels nor CUDA compiler was found,"
+			        << " unable to use CUDA";
+		}
 #endif
 	}
+	else {
+		VLOG(1) << "CUEW initialization failed: "
+		        << ((cuew_result == CUEW_ERROR_ATEXIT_FAILED)
+		            ? "Error setting up atexit() handler"
+		            : "Error opening the library");
+	}
 
 	return result;
 }
@@ -1059,14 +1131,20 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 	}
 	
 	vector<DeviceInfo> display_devices;
-	
+
 	for(int num = 0; num < count; num++) {
 		char name[256];
 		int attr;
-		
+
 		if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS)
 			continue;
 
+		int major, minor;
+		cuDeviceComputeCapability(&major, &minor, num);
+		if(major < 2) {
+			continue;
+		}
+
 		DeviceInfo info;
 
 		info.type = DEVICE_CUDA;
@@ -1074,8 +1152,6 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 		info.id = string_printf("CUDA_%d", num);
 		info.num = num;
 
-		int major, minor;
-		cuDeviceComputeCapability(&major, &minor, num);
 		info.advanced_shading = (major >= 2);
 		info.extended_images = (major >= 3);
 		info.pack_images = false;
@@ -1093,5 +1169,135 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 		devices.insert(devices.end(), display_devices.begin(), display_devices.end());
 }
 
-CCL_NAMESPACE_END
+string device_cuda_capabilities(void)
+{
+	CUresult result = cuInit(0);
+	if(result != CUDA_SUCCESS) {
+		if(result != CUDA_ERROR_NO_DEVICE) {
+			return string("Error initializing CUDA: ") + cuewErrorString(result) + "\n";
+		}
+		return "No CUDA device found\n";
+	}
+	/* Error reports are newline-terminated like the per-device output. */
+	int count = 0;
+	result = cuDeviceGetCount(&count);
+	if(result != CUDA_SUCCESS) {
+		return string("Error getting devices: ") + cuewErrorString(result) + "\n";
+	}
 
+	string capabilities = "";
+	for(int num = 0; num < count; num++) {
+		char name[256];
+		if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS) {
+			continue;
+		}
+		capabilities += string("\t") + name + "\n";
+		int value;
+#define GET_ATTR(attr) \
+		{ \
+			if(cuDeviceGetAttribute(&value, \
+			                        CU_DEVICE_ATTRIBUTE_##attr, \
+			                        num) == CUDA_SUCCESS) \
+			{ \
+				capabilities += string_printf("\t\tCU_DEVICE_ATTRIBUTE_" #attr "\t\t\t%d\n", \
+				                              value); \
+			} \
+		} (void)0
+		/* TODO(sergey): Strip all attributes which are not useful for us
+		 * or do not depend on the driver.
+		 */
+		GET_ATTR(MAX_THREADS_PER_BLOCK);
+		GET_ATTR(MAX_BLOCK_DIM_X);
+		GET_ATTR(MAX_BLOCK_DIM_Y);
+		GET_ATTR(MAX_BLOCK_DIM_Z);
+		GET_ATTR(MAX_GRID_DIM_X);
+		GET_ATTR(MAX_GRID_DIM_Y);
+		GET_ATTR(MAX_GRID_DIM_Z);
+		GET_ATTR(MAX_SHARED_MEMORY_PER_BLOCK);
+		GET_ATTR(SHARED_MEMORY_PER_BLOCK);
+		GET_ATTR(TOTAL_CONSTANT_MEMORY);
+		GET_ATTR(WARP_SIZE);
+		GET_ATTR(MAX_PITCH);
+		GET_ATTR(MAX_REGISTERS_PER_BLOCK);
+		GET_ATTR(REGISTERS_PER_BLOCK);
+		GET_ATTR(CLOCK_RATE);
+		GET_ATTR(TEXTURE_ALIGNMENT);
+		GET_ATTR(GPU_OVERLAP);
+		GET_ATTR(MULTIPROCESSOR_COUNT);
+		GET_ATTR(KERNEL_EXEC_TIMEOUT);
+		GET_ATTR(INTEGRATED);
+		GET_ATTR(CAN_MAP_HOST_MEMORY);
+		GET_ATTR(COMPUTE_MODE);
+		GET_ATTR(MAXIMUM_TEXTURE1D_WIDTH);
+		GET_ATTR(MAXIMUM_TEXTURE2D_WIDTH);
+		GET_ATTR(MAXIMUM_TEXTURE2D_HEIGHT);
+		GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH);
+		GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT);
+		GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH);
+		GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_WIDTH);
+		GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT);
+		GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_LAYERS);
+		GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_WIDTH);
+		GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_HEIGHT);
+		GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES);
+		GET_ATTR(SURFACE_ALIGNMENT);
+		GET_ATTR(CONCURRENT_KERNELS);
+		GET_ATTR(ECC_ENABLED);
+		GET_ATTR(TCC_DRIVER);
+		GET_ATTR(MEMORY_CLOCK_RATE);
+		GET_ATTR(GLOBAL_MEMORY_BUS_WIDTH);
+		GET_ATTR(L2_CACHE_SIZE);
+		GET_ATTR(MAX_THREADS_PER_MULTIPROCESSOR);
+		GET_ATTR(ASYNC_ENGINE_COUNT);
+		GET_ATTR(UNIFIED_ADDRESSING);
+		GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_WIDTH);
+		GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_LAYERS);
+		GET_ATTR(CAN_TEX2D_GATHER);
+		GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_WIDTH);
+		GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_HEIGHT);
+		GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE);
+		GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE);
+		GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE);
+		GET_ATTR(TEXTURE_PITCH_ALIGNMENT);
+		GET_ATTR(MAXIMUM_TEXTURECUBEMAP_WIDTH);
+		GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH);
+		GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS);
+		GET_ATTR(MAXIMUM_SURFACE1D_WIDTH);
+		GET_ATTR(MAXIMUM_SURFACE2D_WIDTH);
+		GET_ATTR(MAXIMUM_SURFACE2D_HEIGHT);
+		GET_ATTR(MAXIMUM_SURFACE3D_WIDTH);
+		GET_ATTR(MAXIMUM_SURFACE3D_HEIGHT);
+		GET_ATTR(MAXIMUM_SURFACE3D_DEPTH);
+		GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_WIDTH);
+		GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_LAYERS);
+		GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_WIDTH);
+		GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_HEIGHT);
+		GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_LAYERS);
+		GET_ATTR(MAXIMUM_SURFACECUBEMAP_WIDTH);
+		GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH);
+		GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS);
+		GET_ATTR(MAXIMUM_TEXTURE1D_LINEAR_WIDTH);
+		GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_WIDTH);
+		GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_HEIGHT);
+		GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_PITCH);
+		GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH);
+		GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT);
+		GET_ATTR(COMPUTE_CAPABILITY_MAJOR);
+		GET_ATTR(COMPUTE_CAPABILITY_MINOR);
+		GET_ATTR(MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH);
+		GET_ATTR(STREAM_PRIORITIES_SUPPORTED);
+		GET_ATTR(GLOBAL_L1_CACHE_SUPPORTED);
+		GET_ATTR(LOCAL_L1_CACHE_SUPPORTED);
+		GET_ATTR(MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
+		GET_ATTR(MAX_REGISTERS_PER_MULTIPROCESSOR);
+		GET_ATTR(MANAGED_MEMORY);
+		GET_ATTR(MULTI_GPU_BOARD);
+		GET_ATTR(MULTI_GPU_BOARD_GROUP_ID);
+#undef GET_ATTR
+		capabilities += "\n";
+	}
+
+	return capabilities;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h
index 80f1e2441a5..47584ae6d22 100644
--- a/intern/cycles/device/device_intern.h
+++ b/intern/cycles/device/device_intern.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __DEVICE_INTERN_H__
@@ -35,6 +35,10 @@ void device_cuda_info(vector<DeviceInfo>& devices);
 void device_network_info(vector<DeviceInfo>& devices);
 void device_multi_info(vector<DeviceInfo>& devices);
 
+string device_cpu_capabilities(void);
+string device_opencl_capabilities(void);
+string device_cuda_capabilities(void);
+
 CCL_NAMESPACE_END
 
 #endif /* __DEVICE_INTERN_H__ */
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 8eee6a2c79e..ba79f8c88ae 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __DEVICE_MEMORY_H__
@@ -212,11 +212,14 @@ public:
 	{
 		data_size = width * ((height == 0)? 1: height) * ((depth == 0)? 1: depth);
 		data.resize(data_size);
-		data_pointer = (device_ptr)&data[0];
 		data_width = width;
 		data_height = height;
 		data_depth = depth;
-
+		if(data_size == 0) {
+			data_pointer = 0;
+			return NULL;
+		}
+		data_pointer = (device_ptr)&data[0];
 		return &data[0];
 	}
 
@@ -260,6 +263,11 @@ public:
 		return data.size();
 	}
 
+	T* get_data()
+	{
+		return (data.size() == 0)? NULL: &data[0];  /* No storage to point at when empty. */
+	}
+
 private:
 	array<T> data;
 };
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 7f055c79491..8fb841b2b0d 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include <stdlib.h>
@@ -25,6 +25,7 @@
 
 #include "util_foreach.h"
 #include "util_list.h"
+#include "util_logging.h"
 #include "util_map.h"
 #include "util_time.h"
 
@@ -88,10 +89,10 @@ public:
 		return error_msg;
 	}
 
-	bool load_kernels(bool experimental)
+	bool load_kernels(const DeviceRequestedFeatures& requested_features)
 	{
 		foreach(SubDevice& sub, devices)
-			if(!sub.device->load_kernels(experimental))
+			if(!sub.device->load_kernels(requested_features))
 				return false;
 
 		return true;
@@ -168,11 +169,17 @@ public:
 			sub.device->const_copy_to(name, host, size);
 	}
 
-	void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic)
+	void tex_alloc(const char *name,
+	               device_memory& mem,
+	               InterpolationType
+	               interpolation,
+	               ExtensionType extension)
 	{
+		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
+
 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = 0;
-			sub.device->tex_alloc(name, mem, interpolation, periodic);
+			sub.device->tex_alloc(name, mem, interpolation, extension);
 			sub.ptr_map[unique_ptr] = mem.device_pointer;
 		}
 
@@ -233,7 +240,7 @@ public:
 		mem.device_pointer = tmp;
 	}
 
-	void draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int width, int height, bool transparent,
+	void draw_pixels(device_memory& rgba, int y, int w, int h, int dx, int dy, int width, int height, bool transparent,
 		const DeviceDrawParams &draw_params)
 	{
 		device_ptr tmp = rgba.device_pointer;
@@ -248,7 +255,7 @@ public:
 			/* adjust math for w/width */
 
 			rgba.device_pointer = sub.ptr_map[tmp];
-			sub.device->draw_pixels(rgba, sy, w, sh, sdy, width, sheight, transparent, draw_params);
+			sub.device->draw_pixels(rgba, sy, w, sh, dx, sdy, width, sheight, transparent, draw_params);
 			i++;
 		}
 
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index dca9bf29e70..afa35224aba 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "device.h"
@@ -19,6 +19,7 @@
 #include "device_network.h"
 
 #include "util_foreach.h"
+#include "util_logging.h"
 
 #if defined(WITH_NETWORK)
 
@@ -162,8 +163,13 @@ public:
 		snd.write_buffer(host, size);
 	}
 
-	void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic)
+	void tex_alloc(const char *name,
+	               device_memory& mem,
+	               InterpolationType interpolation,
+	               ExtensionType extension)
 	{
+		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
+
 		thread_scoped_lock lock(rpc_lock);
 
 		mem.device_pointer = ++mem_counter;
@@ -175,7 +181,7 @@ public:
 		snd.add(name_string);
 		snd.add(mem);
 		snd.add(interpolation);
-		snd.add(periodic);
+		snd.add(extension);
 		snd.write();
 		snd.write_buffer((void*)mem.data_pointer, mem.memory_size());
 	}
@@ -194,7 +200,7 @@ public:
 		}
 	}
 
-	bool load_kernels(bool experimental)
+	bool load_kernels(const DeviceRequestedFeatures& requested_features)
 	{
 		if(error_func.have_error())
 			return false;
@@ -202,7 +208,10 @@ public:
 		thread_scoped_lock lock(rpc_lock);
 
 		RPCSend snd(socket, &error_func, "load_kernels");
-		snd.add(experimental);
+		snd.add(requested_features.experimental);
+		snd.add(requested_features.max_closure);
+		snd.add(requested_features.max_nodes_group);
+		snd.add(requested_features.nodes_features);
 		snd.write();
 
 		bool result;
@@ -269,7 +278,7 @@ public:
 				lock.unlock();
 
 				TileList::iterator it = tile_list_find(the_tiles, tile);
-				if (it != the_tiles.end()) {
+				if(it != the_tiles.end()) {
 					tile.buffers = it->buffers;
 					the_tiles.erase(it);
 				}
@@ -565,13 +574,13 @@ protected:
 			network_device_memory mem;
 			string name;
 			InterpolationType interpolation;
-			bool periodic;
+			ExtensionType extension_type;
 			device_ptr client_pointer;
 
 			rcv.read(name);
 			rcv.read(mem);
 			rcv.read(interpolation);
-			rcv.read(periodic);
+			rcv.read(extension_type);
 			lock.unlock();
 
 			client_pointer = mem.device_pointer;
@@ -587,7 +596,7 @@ protected:
 
 			rcv.read_buffer((uint8_t*)mem.data_pointer, data_size);
 
-			device->tex_alloc(name.c_str(), mem, interpolation, periodic);
+			device->tex_alloc(name.c_str(), mem, interpolation, extension_type);
 
 			pointer_mapping_insert(client_pointer, mem.device_pointer);
 		}
@@ -605,11 +614,14 @@ protected:
 			device->tex_free(mem);
 		}
 		else if(rcv.name == "load_kernels") {
-			bool experimental;
-			rcv.read(experimental);
+			DeviceRequestedFeatures requested_features;
+			rcv.read(requested_features.experimental);
+			rcv.read(requested_features.max_closure);
+			rcv.read(requested_features.max_nodes_group);
+			rcv.read(requested_features.nodes_features);
 
 			bool result;
-			result = device->load_kernels(experimental);
+			result = device->load_kernels(requested_features);
 			RPCSend snd(socket, &error_func, "load_kernels");
 			snd.add(result);
 			snd.write();
diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h
index 893841d1da7..2e751f6697f 100644
--- a/intern/cycles/device/device_network.h
+++ b/intern/cycles/device/device_network.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __DEVICE_NETWORK_H__
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index d950d084cd4..a7157e2b041 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifdef WITH_OPENCL
@@ -20,14 +20,15 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "clew.h"
+
 #include "device.h"
 #include "device_intern.h"
 
 #include "buffers.h"
 
-#include "clew.h"
-
 #include "util_foreach.h"
+#include "util_logging.h"
 #include "util_map.h"
 #include "util_math.h"
 #include "util_md5.h"
@@ -39,11 +40,55 @@ CCL_NAMESPACE_BEGIN
 
 #define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
 
-static cl_device_type opencl_device_type()
+/* Macro declarations used with split kernel */
+
+/* Macro to enable/disable work-stealing */
+#define __WORK_STEALING__
+
+#define SPLIT_KERNEL_LOCAL_SIZE_X 64
+#define SPLIT_KERNEL_LOCAL_SIZE_Y 1
+
+/* This value may be tuned according to the scene we are rendering.
+ *
+ * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected
+ * ray-bounces will improve performance.
+ */
+#define PATH_ITER_INC_FACTOR 8
+
+/* When allocating global memory in chunks, we may not be able to
+ * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks,
+ * since some bytes may be needed for aligning chunks of memory.
+ * This is the amount of memory that we dedicate for that purpose.
+ */
+#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
+
+struct OpenCLPlatformDevice {
+	OpenCLPlatformDevice(cl_platform_id platform_id,
+	                     const string& platform_name,
+	                     cl_device_id device_id,
+	                     cl_device_type device_type,
+	                     const string& device_name)
+	  : platform_id(platform_id),
+	    platform_name(platform_name),
+	    device_id(device_id),
+	    device_type(device_type),
+	    device_name(device_name) {}
+	cl_platform_id platform_id;
+	string platform_name;
+	cl_device_id device_id;
+	cl_device_type device_type;
+	string device_name;
+};
+
+namespace {
+
+cl_device_type opencl_device_type()
 {
 	char *device = getenv("CYCLES_OPENCL_TEST");
 
 	if(device) {
+		if(strcmp(device, "NONE") == 0)
+			return 0;
 		if(strcmp(device, "ALL") == 0)
 			return CL_DEVICE_TYPE_ALL;
 		else if(strcmp(device, "DEFAULT") == 0)
@@ -59,68 +104,277 @@ static cl_device_type opencl_device_type()
 	return CL_DEVICE_TYPE_ALL;
 }
 
-static bool opencl_kernel_use_debug()
+bool opencl_kernel_use_debug()
 {
 	return (getenv("CYCLES_OPENCL_DEBUG") != NULL);
 }
 
-static bool opencl_kernel_use_advanced_shading(const string& platform)
+bool opencl_kernel_use_advanced_shading(const string& platform)
 {
 	/* keep this in sync with kernel_types.h! */
 	if(platform == "NVIDIA CUDA")
 		return true;
 	else if(platform == "Apple")
-		return false;
+		return true;
 	else if(platform == "AMD Accelerated Parallel Processing")
-		return false;
+		return true;
 	else if(platform == "Intel(R) OpenCL")
 		return true;
+	/* Make sure officially unsupported OpenCL platforms
+	 * do not get set up to use advanced shading.
+	 */
+	return false;
+}
 
+bool opencl_kernel_use_split(const string& platform_name,
+                             const cl_device_type device_type)
+{
+	if(getenv("CYCLES_OPENCL_SPLIT_KERNEL_TEST") != NULL) {
+		return true;
+	}
+	/* TODO(sergey): Replace string lookups with a more enum-like API,
+	 * similar to the device/vendor checks in Blender's GPU code.
+	 */
+	if(platform_name == "AMD Accelerated Parallel Processing" &&
+	   device_type == CL_DEVICE_TYPE_GPU)
+	{
+		return true;
+	}
 	return false;
 }
 
-static string opencl_kernel_build_options(const string& platform, const string *debug_src = NULL)
+bool opencl_device_supported(const string& platform_name,
+                             const cl_device_id device_id)
 {
-	string build_options = " -cl-fast-relaxed-math ";
+	/* Only GPUs of the AMD and Apple platforms are accepted here. */
+	cl_device_type device_type = 0;
+	if(clGetDeviceInfo(device_id,
+	                   CL_DEVICE_TYPE,
+	                   sizeof(cl_device_type),
+	                   &device_type,
+	                   NULL) != CL_SUCCESS)
+	{
+		return false;  /* Unknown device type, do not claim support. */
+	}
+	if(device_type != CL_DEVICE_TYPE_GPU) {
+		return false;
+	}
+	return (platform_name == "AMD Accelerated Parallel Processing" ||
+	        platform_name == "Apple");
+}
 
-	if(platform == "NVIDIA CUDA")
-		build_options += "-D__KERNEL_OPENCL_NVIDIA__ -cl-nv-maxrregcount=32 -cl-nv-verbose ";
+/* Check that the platform reports OpenCL 1.1 or newer; on failure a
+ * description is stored in *error when it is not NULL. */
+bool opencl_platform_version_check(cl_platform_id platform,
+                                   string *error = NULL)
+{
+	const int req_major = 1, req_minor = 1;
+	int major = 0, minor = 0;
+	/* Empty string makes sscanf() fail cleanly when the query below fails. */
+	char version[256] = "";
+	clGetPlatformInfo(platform, CL_PLATFORM_VERSION,
+	                  sizeof(version), version, NULL);
+	if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
+		if(error != NULL) {
+			*error = string_printf("OpenCL: failed to parse platform version string (%s).", version);
+		}
+		return false;
+	}
+	if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
+		if(error != NULL) {
+			*error = string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor);
+		}
+		return false;
+	}
+	if(error != NULL) {
+		*error = "";
+	}
+	return true;
+}
 
-	else if(platform == "Apple")
-		build_options += "-D__KERNEL_OPENCL_APPLE__ ";
+/* Check that the device supports OpenCL C 1.1 or newer; on failure a
+ * description is stored in *error when it is not NULL. */
+bool opencl_device_version_check(cl_device_id device,
+                                 string *error = NULL)
+{
+	const int req_major = 1, req_minor = 1;
+	int major = 0, minor = 0;
+	/* Empty string makes sscanf() fail cleanly when the query below fails. */
+	char version[256] = "";
+	clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION,
+	                sizeof(version), version, NULL);
+	if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
+		if(error != NULL) {
+			*error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
+		}
+		return false;
+	}
+	if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
+		if(error != NULL) {
+			*error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor);
+		}
+		return false;
+	}
+	if(error != NULL) {
+		*error = "";
+	}
+	return true;
+}
 
-	else if(platform == "AMD Accelerated Parallel Processing")
-		build_options += "-D__KERNEL_OPENCL_AMD__ ";
+void opencl_get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices)
+{
+	const bool force_all_platforms =
+	        (getenv("CYCLES_OPENCL_TEST") != NULL) ||
+	        (getenv("CYCLES_OPENCL_SPLIT_KERNEL_TEST")) != NULL;
+	const cl_device_type device_type = opencl_device_type();
+	static bool first_time = true;
+#define FIRST_VLOG(severity) if(first_time) VLOG(severity)
+
+	usable_devices->clear();
+
+	if(device_type == 0) {
+		FIRST_VLOG(2) << "OpenCL devices are forced to be disabled.";
+		first_time = false;
+		return;
+	}
 
-	else if(platform == "Intel(R) OpenCL") {
-		build_options += "-D__KERNEL_OPENCL_INTEL_CPU__";
+	vector<cl_device_id> device_ids;
+	cl_uint num_devices = 0;
+	vector<cl_platform_id> platform_ids;
+	cl_uint num_platforms = 0;
 
-		/* options for gdb source level kernel debugging. this segfaults on linux currently */
-		if(opencl_kernel_use_debug() && debug_src)
-			build_options += "-g -s \"" + *debug_src + "\"";
+	/* Get devices. */
+	if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS ||
+	   num_platforms == 0)
+	{
+		FIRST_VLOG(2) << "No OpenCL platforms were found.";
+		first_time = false;
+		return;
 	}
-
-	if(opencl_kernel_use_debug())
-		build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
-	
-	return build_options;
+	platform_ids.resize(num_platforms);
+	if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS) {
+		FIRST_VLOG(2) << "Failed to fetch platform IDs from the driver..";
+		first_time = false;
+		return;
+	}
+	/* Devices are numbered consecutively across platforms. */
+	for(int platform = 0; platform < num_platforms; platform++) {
+		cl_platform_id platform_id = platform_ids[platform];
+		char pname[256];
+		if(clGetPlatformInfo(platform_id,
+		                     CL_PLATFORM_NAME,
+		                     sizeof(pname),
+		                     &pname,
+		                     NULL) != CL_SUCCESS)
+		{
+			FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
+			continue;
+		}
+		string platform_name = pname;
+		FIRST_VLOG(2) << "Enumerating devices for platform "
+		              << platform_name << ".";
+		if(!opencl_platform_version_check(platform_id)) {
+			FIRST_VLOG(2) << "Ignoring platform " << platform_name
+			              << " due to too old compiler version.";
+			continue;
+		}
+		num_devices = 0;
+		if(clGetDeviceIDs(platform_id,
+		                  device_type,
+		                  0,
+		                  NULL,
+		                  &num_devices) != CL_SUCCESS || num_devices == 0)
+		{
+			FIRST_VLOG(2) << "Ignoring platform " << platform_name
+			              << ", failed to fetch number of devices.";
+			continue;
+		}
+		device_ids.resize(num_devices);
+		if(clGetDeviceIDs(platform_id,
+		                  device_type,
+		                  num_devices,
+		                  &device_ids[0],
+		                  NULL) != CL_SUCCESS)
+		{
+			FIRST_VLOG(2) << "Ignoring platform " << platform_name
+			              << ", failed to fetch devices list.";
+			continue;
+		}
+		for(int num = 0; num < num_devices; num++) {
+			cl_device_id device_id = device_ids[num];
+			char device_name[1024] = "\0";
+			if(clGetDeviceInfo(device_id,
+			                   CL_DEVICE_NAME,
+			                   sizeof(device_name),
+			                   &device_name,
+			                   NULL) != CL_SUCCESS)
+			{
+				FIRST_VLOG(2) << "Failed to fetch device name, ignoring.";
+				continue;
+			}
+			if(!opencl_device_version_check(device_id)) {
+				FIRST_VLOG(2) << "Ignoring device " << device_name
+				              << " due to old compiler version.";
+				continue;
+			}
+			if(force_all_platforms ||
+			   opencl_device_supported(platform_name, device_id))
+			{
+				cl_device_type device_type;
+				if(clGetDeviceInfo(device_id,
+				                   CL_DEVICE_TYPE,
+				                   sizeof(cl_device_type),
+				                   &device_type,
+				                   NULL) != CL_SUCCESS)
+				{
+					FIRST_VLOG(2) << "Ignoring device " << device_name
+					              << ", failed to fetch device type.";
+					continue;
+				}
+				FIRST_VLOG(2) << "Adding new device " << device_name << ".";
+				usable_devices->push_back(OpenCLPlatformDevice(platform_id,
+				                                               platform_name,
+				                                               device_id,
+				                                               device_type,
+				                                               device_name));
+			}
+			else {
+				FIRST_VLOG(2) << "Ignoring device " << device_name
+				              << ", not officially supported yet.";
+			}
+		}
+	}
+	first_time = false;
 }
 
-/* thread safe cache for contexts and programs */
+}  /* namespace */
+
+/* Thread safe cache for contexts and programs.
+ *
+ * TODO(sergey): Make it more generic, so it can contain any type of program
+ * without hardcoding possible program types in the slot.
+ */
 class OpenCLCache
 {
 	struct Slot
 	{
 		thread_mutex *mutex;
 		cl_context context;
-		cl_program program;
-
-		Slot() : mutex(NULL), context(NULL), program(NULL) {}
-
-		Slot(const Slot &rhs)
-			: mutex(rhs.mutex)
-			, context(rhs.context)
-			, program(rhs.program)
+		/* cl_program for shader, bake, film_convert kernels (used in OpenCLDeviceBase) */
+		cl_program ocl_dev_base_program;
+		/* cl_program for megakernel (used in OpenCLDeviceMegaKernel) */
+		cl_program ocl_dev_megakernel_program;
+
+		Slot() : mutex(NULL),
+		         context(NULL),
+		         ocl_dev_base_program(NULL),
+		         ocl_dev_megakernel_program(NULL) {}
+
+		Slot(const Slot& rhs)
+		    : mutex(rhs.mutex),
+		      context(rhs.context),
+		      ocl_dev_base_program(rhs.ocl_dev_base_program),
+		      ocl_dev_megakernel_program(rhs.ocl_dev_megakernel_program)
 		{
 			/* copy can only happen in map insert, assert that */
 			assert(mutex == NULL);
@@ -167,12 +421,14 @@ class OpenCLCache
 	 * will be holding a lock for the cache. slot_locker should refer to a
 	 * default constructed thread_scoped_lock */
 	template<typename T>
-	static T get_something(cl_platform_id platform, cl_device_id device,
-		T Slot::*member, thread_scoped_lock &slot_locker)
+	static T get_something(cl_platform_id platform,
+	                       cl_device_id device,
+	                       T Slot::*member,
+	                       thread_scoped_lock& slot_locker)
 	{
 		assert(platform != NULL);
 
-		OpenCLCache &self = global_instance();
+		OpenCLCache& self = global_instance();
 
 		thread_scoped_lock cache_lock(self.cache_lock);
 
@@ -205,8 +461,11 @@ class OpenCLCache
 
 	/* store something in the cache. you MUST have tried to get the item before storing to it */
 	template<typename T>
-	static void store_something(cl_platform_id platform, cl_device_id device, T thing,
-		T Slot::*member, thread_scoped_lock &slot_locker)
+	static void store_something(cl_platform_id platform,
+	                            cl_device_id device,
+	                            T thing,
+	                            T Slot::*member,
+	                            thread_scoped_lock& slot_locker)
 	{
 		assert(platform != NULL);
 		assert(device != NULL);
@@ -231,11 +490,21 @@ class OpenCLCache
 	}
 
 public:
+
+	enum ProgramName {
+		OCL_DEV_BASE_PROGRAM,
+		OCL_DEV_MEGAKERNEL_PROGRAM,
+	};
+
 	/* see get_something comment */
-	static cl_context get_context(cl_platform_id platform, cl_device_id device,
-		thread_scoped_lock &slot_locker)
+	static cl_context get_context(cl_platform_id platform,
+	                              cl_device_id device,
+	                              thread_scoped_lock& slot_locker)
 	{
-		cl_context context = get_something<cl_context>(platform, device, &Slot::context, slot_locker);
+		cl_context context = get_something<cl_context>(platform,
+		                                               device,
+		                                               &Slot::context,
+		                                               slot_locker);
 
 		if(!context)
 			return NULL;
@@ -249,10 +518,31 @@ public:
 	}
 
 	/* see get_something comment */
-	static cl_program get_program(cl_platform_id platform, cl_device_id device,
-		thread_scoped_lock &slot_locker)
+	static cl_program get_program(cl_platform_id platform,
+	                              cl_device_id device,
+	                              ProgramName program_name,
+	                              thread_scoped_lock& slot_locker)
 	{
-		cl_program program = get_something<cl_program>(platform, device, &Slot::program, slot_locker);
+		cl_program program = NULL;
+
+		switch(program_name) {
+			case OCL_DEV_BASE_PROGRAM:
+				/* Get program related to OpenCLDeviceBase */
+				program = get_something<cl_program>(platform,
+				                                    device,
+				                                    &Slot::ocl_dev_base_program,
+				                                    slot_locker);
+				break;
+			case OCL_DEV_MEGAKERNEL_PROGRAM:
+				/* Get program related to megakernel */
+				program = get_something<cl_program>(platform,
+				                                    device,
+				                                    &Slot::ocl_dev_megakernel_program,
+				                                    slot_locker);
+				break;
+		default:
+			assert(!"Invalid program name");
+		}
 
 		if(!program)
 			return NULL;
@@ -266,10 +556,16 @@ public:
 	}
 
 	/* see store_something comment */
-	static void store_context(cl_platform_id platform, cl_device_id device, cl_context context,
-		thread_scoped_lock &slot_locker)
+	static void store_context(cl_platform_id platform,
+	                          cl_device_id device,
+	                          cl_context context,
+	                          thread_scoped_lock& slot_locker)
 	{
-		store_something<cl_context>(platform, device, context, &Slot::context, slot_locker);
+		store_something<cl_context>(platform,
+		                            device,
+		                            context,
+		                            &Slot::context,
+		                            slot_locker);
 
 		/* increment reference count in OpenCL.
 		 * The caller is going to release the object when done with it. */
@@ -279,28 +575,51 @@ public:
 	}
 
 	/* see store_something comment */
-	static void store_program(cl_platform_id platform, cl_device_id device, cl_program program,
-		thread_scoped_lock &slot_locker)
+	static void store_program(cl_platform_id platform,
+	                          cl_device_id device,
+	                          cl_program program,
+	                          ProgramName program_name,
+	                          thread_scoped_lock& slot_locker)
 	{
-		store_something<cl_program>(platform, device, program, &Slot::program, slot_locker);
+		switch (program_name) {
+			case OCL_DEV_BASE_PROGRAM:
+				store_something<cl_program>(platform,
+				                            device,
+				                            program,
+				                            &Slot::ocl_dev_base_program,
+				                            slot_locker);
+				break;
+			case OCL_DEV_MEGAKERNEL_PROGRAM:
+				store_something<cl_program>(platform,
+				                            device,
+				                            program,
+				                            &Slot::ocl_dev_megakernel_program,
+				                            slot_locker);
+				break;
+			default:
+				assert(!"Invalid program name\n");
+				return;
+		}
 
-		/* increment reference count in OpenCL.
-		 * The caller is going to release the object when done with it. */
+		/* Increment reference count in OpenCL.
+		 * The caller is going to release the object when done with it.
+		 */
 		cl_int ciErr = clRetainProgram(program);
 		assert(ciErr == CL_SUCCESS);
 		(void)ciErr;
 	}
 
-	/* discard all cached contexts and programs
-	 * the parameter is a temporary workaround. See OpenCLCache::~OpenCLCache */
+	/* Discard all cached contexts and programs.  */
 	static void flush()
 	{
 		OpenCLCache &self = global_instance();
 		thread_scoped_lock cache_lock(self.cache_lock);
 
 		foreach(CacheMap::value_type &item, self.cache) {
-			if(item.second.program != NULL)
-				clReleaseProgram(item.second.program);
+			if(item.second.ocl_dev_base_program != NULL)
+				clReleaseProgram(item.second.ocl_dev_base_program);
+			if(item.second.ocl_dev_megakernel_program != NULL)
+				clReleaseProgram(item.second.ocl_dev_megakernel_program);
 			if(item.second.context != NULL)
 				clReleaseContext(item.second.context);
 		}
@@ -309,7 +628,7 @@ public:
 	}
 };
 
-class OpenCLDevice : public Device
+class OpenCLDeviceBase : public Device
 {
 public:
 	DedicatedTaskPool task_pool;
@@ -318,7 +637,6 @@ public:
 	cl_platform_id cpPlatform;
 	cl_device_id cdDevice;
 	cl_program cpProgram;
-	cl_kernel ckPathTraceKernel;
 	cl_kernel ckFilmConvertByteKernel;
 	cl_kernel ckFilmConvertHalfFloatKernel;
 	cl_kernel ckShaderKernel;
@@ -380,7 +698,7 @@ public:
 		}
 	}
 
-	OpenCLDevice(DeviceInfo& info, Stats &stats, bool background_)
+	OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
 	: Device(info, stats, background_)
 	{
 		cpPlatform = NULL;
@@ -388,7 +706,6 @@ public:
 		cxContext = NULL;
 		cqCommandQueue = NULL;
 		cpProgram = NULL;
-		ckPathTraceKernel = NULL;
 		ckFilmConvertByteKernel = NULL;
 		ckFilmConvertHalfFloatKernel = NULL;
 		ckShaderKernel = NULL;
@@ -396,71 +713,20 @@ public:
 		null_mem = 0;
 		device_initialized = false;
 
-		/* setup platform */
-		cl_uint num_platforms;
-
-		ciErr = clGetPlatformIDs(0, NULL, &num_platforms);
-		if(opencl_error(ciErr))
-			return;
-
-		if(num_platforms == 0) {
-			opencl_error("OpenCL: no platforms found.");
-			return;
-		}
-
-		vector<cl_platform_id> platforms(num_platforms, NULL);
-
-		ciErr = clGetPlatformIDs(num_platforms, &platforms[0], NULL);
-		if(opencl_error(ciErr)) {
-			fprintf(stderr, "clGetPlatformIDs failed \n");
-			return;
-		}
-
-		int num_base = 0;
-		int total_devices = 0;
-
-		for (int platform = 0; platform < num_platforms; platform++) {
-			cl_uint num_devices;
-
-			if(opencl_error(clGetDeviceIDs(platforms[platform], opencl_device_type(), 0, NULL, &num_devices)))
-				return;
-
-			total_devices += num_devices;
-
-			if(info.num - num_base >= num_devices) {
-				/* num doesn't refer to a device in this platform */
-				num_base += num_devices;
-				continue;
-			}
-
-			/* device is in this platform */
-			cpPlatform = platforms[platform];
-
-			/* get devices */
-			vector<cl_device_id> device_ids(num_devices, NULL);
-
-			if(opencl_error(clGetDeviceIDs(cpPlatform, opencl_device_type(), num_devices, &device_ids[0], NULL))) {
-				fprintf(stderr, "clGetDeviceIDs failed \n");
-				return;
-			}
-
-			cdDevice = device_ids[info.num - num_base];
-
-			char name[256];
-			clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(name), &name, NULL);
-			platform_name = name;
-
-			break;
-		}
-
-		if(total_devices == 0) {
+		vector<OpenCLPlatformDevice> usable_devices;
+		opencl_get_usable_devices(&usable_devices);
+		if(usable_devices.size() == 0) {
 			opencl_error("OpenCL: no devices found.");
 			return;
 		}
-		else if(!cdDevice) {
-			opencl_error("OpenCL: specified device not found.");
-			return;
-		}
+		assert(info.num < usable_devices.size());
+		OpenCLPlatformDevice& platform_device = usable_devices[info.num];
+		cpPlatform = platform_device.platform_id;
+		cdDevice = platform_device.device_id;
+		platform_name = platform_device.platform_name;
+		VLOG(2) << "Creating new Cycles device for OpenCL platform "
+		        << platform_name << ", device "
+		        << platform_device.device_name << ".";
 
 		{
 			/* try to use cached context */
@@ -496,12 +762,12 @@ public:
 		if(opencl_error(ciErr))
 			return;
 
-		fprintf(stderr,"Device init succes\n");
+		fprintf(stderr, "Device init success\n");
 		device_initialized = true;
 	}
 
 	static void CL_CALLBACK context_notify_callback(const char *err_info,
-		const void *private_info, size_t cb, void *user_data)
+		const void * /*private_info*/, size_t /*cb*/, void *user_data)
 	{
 		char name[256];
 		clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
@@ -511,38 +777,23 @@ public:
 
 	bool opencl_version_check()
 	{
-		char version[256];
-
-		int major, minor, req_major = 1, req_minor = 1;
-
-		clGetPlatformInfo(cpPlatform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL);
-
-		if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
-			opencl_error(string_printf("OpenCL: failed to parse platform version string (%s).", version));
+		string error;
+		if(!opencl_platform_version_check(cpPlatform, &error)) {
+			opencl_error(error);
 			return false;
 		}
-
-		if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
-			opencl_error(string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor));
-			return false;
-		}
-
-		clGetDeviceInfo(cdDevice, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL);
-
-		if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
-			opencl_error(string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version));
+		if(!opencl_device_version_check(cdDevice, &error)) {
+			opencl_error(error);
 			return false;
 		}
-
-		if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
-			opencl_error(string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor));
-			return false;
-		}
-
 		return true;
 	}
 
-	bool load_binary(const string& kernel_path, const string& clbin, const string *debug_src = NULL)
+	bool load_binary(const string& /*kernel_path*/,
+	                 const string& clbin,
+	                 string custom_kernel_build_options,
+	                 cl_program *program,
+	                 const string *debug_src = NULL)
 	{
 		/* read binary into memory */
 		vector<uint8_t> binary;
@@ -557,7 +808,7 @@ public:
 		size_t size = binary.size();
 		const uint8_t *bytes = &binary[0];
 
-		cpProgram = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
+		*program = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
 			&size, &bytes, &status, &ciErr);
 
 		if(opencl_error(status) || opencl_error(ciErr)) {
@@ -565,16 +816,16 @@ public:
 			return false;
 		}
 
-		if(!build_kernel(kernel_path, debug_src))
+		if(!build_kernel(program, custom_kernel_build_options, debug_src))
 			return false;
 
 		return true;
 	}
 
-	bool save_binary(const string& clbin)
+	bool save_binary(cl_program *program, const string& clbin)
 	{
 		size_t size = 0;
-		clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
+		clGetProgramInfo(*program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
 
 		if(!size)
 			return false;
@@ -582,7 +833,7 @@ public:
 		vector<uint8_t> binary(size);
 		uint8_t *bytes = &binary[0];
 
-		clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
+		clGetProgramInfo(*program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
 
 		if(!path_write_binary(clbin, binary)) {
 			opencl_error(string_printf("OpenCL failed to write cached binary %s.", clbin.c_str()));
@@ -592,24 +843,30 @@ public:
 		return true;
 	}
 
-	bool build_kernel(const string& kernel_path, const string *debug_src = NULL)
+	bool build_kernel(cl_program *kernel_program,
+	                  string custom_kernel_build_options,
+	                  const string *debug_src = NULL)
 	{
-		string build_options = opencl_kernel_build_options(platform_name, debug_src);
-	
-		ciErr = clBuildProgram(cpProgram, 0, NULL, build_options.c_str(), NULL, NULL);
+		string build_options;
+		build_options = kernel_build_options(debug_src) + custom_kernel_build_options;
+
+		ciErr = clBuildProgram(*kernel_program, 0, NULL, build_options.c_str(), NULL, NULL);
 
 		/* show warnings even if build is successful */
 		size_t ret_val_size = 0;
 
-		clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+		clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
 
 		if(ret_val_size > 1) {
-			vector<char> build_log(ret_val_size+1);
-			clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
+			vector<char> build_log(ret_val_size + 1);
+			clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
 
 			build_log[ret_val_size] = '\0';
-			fprintf(stderr, "OpenCL kernel build output:\n");
-			fprintf(stderr, "%s\n", &build_log[0]);
+			/* Skip meaningless empty output from the NVidia compiler. */
+			if(!(ret_val_size == 2 && build_log[0] == '\n')) {
+				fprintf(stderr, "OpenCL kernel build output:\n");
+				fprintf(stderr, "%s\n", &build_log[0]);
+			}
 		}
 
 		if(ciErr != CL_SUCCESS) {
@@ -620,12 +877,15 @@ public:
 		return true;
 	}
 
-	bool compile_kernel(const string& kernel_path, const string& kernel_md5, const string *debug_src = NULL)
+	bool compile_kernel(const string& kernel_path,
+	                    string source,
+	                    string custom_kernel_build_options,
+	                    cl_program *kernel_program,
+	                    const string *debug_src = NULL)
 	{
 		/* we compile kernels consisting of many files. unfortunately opencl
 		 * kernel caches do not seem to recognize changes in included files.
 		 * so we force recompile on changes by adding the md5 hash of all files */
-		string source = "#include \"kernel.cl\" // " + kernel_md5 + "\n";
 		source = path_source_replace_includes(source, kernel_path);
 
 		if(debug_src)
@@ -634,15 +894,19 @@ public:
 		size_t source_len = source.size();
 		const char *source_str = source.c_str();
 
-		cpProgram = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
+		*kernel_program = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
 
 		if(opencl_error(ciErr))
 			return false;
 
 		double starttime = time_dt();
 		printf("Compiling OpenCL kernel ...\n");
+		/* TODO(sergey): Report which kernel is being compiled
+		 * as well (megakernel, or which of the split kernels, etc.).
+		 */
+		printf("Build flags: %s\n", custom_kernel_build_options.c_str());
 
-		if(!build_kernel(kernel_path, debug_src))
+		if(!build_kernel(kernel_program, custom_kernel_build_options, debug_src))
 			return false;
 
 		printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
@@ -650,7 +914,7 @@ public:
 		return true;
 	}
 
-	string device_md5_hash()
+	string device_md5_hash(string kernel_custom_build_options = "")
 	{
 		MD5Hash md5;
 		char version[256], driver[256], name[256], vendor[256];
@@ -665,90 +929,120 @@ public:
 		md5.append((uint8_t*)name, strlen(name));
 		md5.append((uint8_t*)driver, strlen(driver));
 
-		string options = opencl_kernel_build_options(platform_name);
+		string options = kernel_build_options();
+		options += kernel_custom_build_options;
 		md5.append((uint8_t*)options.c_str(), options.size());
 
 		return md5.get_hex();
 	}
 
-	bool load_kernels(bool experimental)
+	bool load_kernels(const DeviceRequestedFeatures& requested_features)
 	{
-		/* verify if device was initialized */
+		/* Verify if device was initialized. */
 		if(!device_initialized) {
 			fprintf(stderr, "OpenCL: failed to initialize device.\n");
 			return false;
 		}
 
-		/* try to use cached kernel */
+		/* Try to use cached kernel. */
 		thread_scoped_lock cache_locker;
-		cpProgram = OpenCLCache::get_program(cpPlatform, cdDevice, cache_locker);
+		cpProgram = load_cached_kernel(requested_features,
+		                               OpenCLCache::OCL_DEV_BASE_PROGRAM,
+		                               cache_locker);
 
 		if(!cpProgram) {
-			/* verify we have right opencl version */
+			VLOG(2) << "No cached OpenCL kernel.";
+
+			/* Verify we have the right OpenCL version. */
 			if(!opencl_version_check())
 				return false;
 
-			/* md5 hash to detect changes */
+			string build_flags = build_options_for_base_program(requested_features);
+
+			/* Calculate md5 hashes to detect changes. */
 			string kernel_path = path_get("kernel");
 			string kernel_md5 = path_files_md5_hash(kernel_path);
-			string device_md5 = device_md5_hash();
-
-			/* path to cached binary */
-			string clbin = string_printf("cycles_kernel_%s_%s.clbin", device_md5.c_str(), kernel_md5.c_str());
+			string device_md5 = device_md5_hash(build_flags);
+
+			/* Path to cached binary.
+			 *
+			 * TODO(sergey): Seems we could de-duplicate all these string_printf()
+			 * calls with some utility function which gives the file name for the
+			 * given hashes.
+			 */
+			string clbin = string_printf("cycles_kernel_%s_%s.clbin",
+			                             device_md5.c_str(),
+			                             kernel_md5.c_str());
 			clbin = path_user_get(path_join("cache", clbin));
 
 			/* path to preprocessed source for debugging */
 			string clsrc, *debug_src = NULL;
 
 			if(opencl_kernel_use_debug()) {
-				clsrc = string_printf("cycles_kernel_%s_%s.cl", device_md5.c_str(), kernel_md5.c_str());
+				clsrc = string_printf("cycles_kernel_%s_%s.cl",
+				                      device_md5.c_str(),
+				                      kernel_md5.c_str());
 				clsrc = path_user_get(path_join("cache", clsrc));
 				debug_src = &clsrc;
 			}
 
-			/* if exists already, try use it */
-			if(path_exists(clbin) && load_binary(kernel_path, clbin, debug_src)) {
-				/* kernel loaded from binary */
+			/* If binary kernel exists already, try to use it. */
+			if(path_exists(clbin) && load_binary(kernel_path,
+			                                     clbin,
+			                                     build_flags,
+			                                     &cpProgram)) {
+				/* Kernel loaded from binary, nothing to do. */
+				VLOG(2) << "Loaded kernel from " << clbin << ".";
 			}
 			else {
-				/* if does not exist or loading binary failed, compile kernel */
-				if(!compile_kernel(kernel_path, kernel_md5, debug_src))
+				VLOG(2) << "Kernel file " << clbin << " either doesn't exist or failed to be loaded by driver.";
+				string init_kernel_source = "#include \"kernels/opencl/kernel.cl\" // " + kernel_md5 + "\n";
+
+				/* If it does not exist or loading the binary failed, compile the kernel. */
+				if(!compile_kernel(kernel_path,
+				                   init_kernel_source,
+				                   build_flags,
+				                   &cpProgram,
+				                   debug_src))
+				{
 					return false;
+				}
 
-				/* save binary for reuse */
-				if(!save_binary(clbin))
+				/* Save binary for reuse. */
+				if(!save_binary(&cpProgram, clbin)) {
 					return false;
+				}
 			}
 
-			/* cache the program */
-			OpenCLCache::store_program(cpPlatform, cdDevice, cpProgram, cache_locker);
+			/* Cache the program. */
+			store_cached_kernel(cpPlatform,
+			                    cdDevice,
+			                    cpProgram,
+			                    OpenCLCache::OCL_DEV_BASE_PROGRAM,
+			                    cache_locker);
+		}
+		else {
+			VLOG(2) << "Found cached OpenCL kernel.";
 		}
 
-		/* find kernels */
-		ckPathTraceKernel = clCreateKernel(cpProgram, "kernel_ocl_path_trace", &ciErr);
-		if(opencl_error(ciErr))
-			return false;
-
-		ckFilmConvertByteKernel = clCreateKernel(cpProgram, "kernel_ocl_convert_to_byte", &ciErr);
-		if(opencl_error(ciErr))
-			return false;
-
-		ckFilmConvertHalfFloatKernel = clCreateKernel(cpProgram, "kernel_ocl_convert_to_half_float", &ciErr);
-		if(opencl_error(ciErr))
-			return false;
-
-		ckShaderKernel = clCreateKernel(cpProgram, "kernel_ocl_shader", &ciErr);
-		if(opencl_error(ciErr))
-			return false;
+		/* Find kernels. */
+#define FIND_KERNEL(kernel_var, kernel_name) \
+		do { \
+			kernel_var = clCreateKernel(cpProgram, "kernel_ocl_" kernel_name, &ciErr); \
+			if(opencl_error(ciErr)) \
+				return false; \
+		} while(0)
 
-		ckBakeKernel = clCreateKernel(cpProgram, "kernel_ocl_bake", &ciErr);
-		if(opencl_error(ciErr))
-			return false;
+		FIND_KERNEL(ckFilmConvertByteKernel, "convert_to_byte");
+		FIND_KERNEL(ckFilmConvertHalfFloatKernel, "convert_to_half_float");
+		FIND_KERNEL(ckShaderKernel, "shader");
+		FIND_KERNEL(ckBakeKernel, "bake");
 
+#undef FIND_KERNEL
 		return true;
 	}
 
-	~OpenCLDevice()
+	~OpenCLDeviceBase()
 	{
 		task_pool.stop();
 
@@ -761,12 +1055,14 @@ public:
 			delete mt->second;
 		}
 
-		if(ckPathTraceKernel)
-			clReleaseKernel(ckPathTraceKernel);  
 		if(ckFilmConvertByteKernel)
-			clReleaseKernel(ckFilmConvertByteKernel);  
+			clReleaseKernel(ckFilmConvertByteKernel);
 		if(ckFilmConvertHalfFloatKernel)
-			clReleaseKernel(ckFilmConvertHalfFloatKernel);  
+			clReleaseKernel(ckFilmConvertHalfFloatKernel);
+		if(ckShaderKernel)
+			clReleaseKernel(ckShaderKernel);
+		if(ckBakeKernel)
+			clReleaseKernel(ckBakeKernel);
 		if(cpProgram)
 			clReleaseProgram(cpProgram);
 		if(cqCommandQueue)
@@ -789,9 +1085,22 @@ public:
 		else
 			mem_flag = CL_MEM_READ_WRITE;
 
-		mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr);
-
-		opencl_assert_err(ciErr, "clCreateBuffer");
+		/* Zero-size allocation might be invoked by render, but not really
+		 * supported by OpenCL. Using NULL as device pointer also doesn't really
+		 * work for some reason, so for the time being we'll use a special case
+		 * with the null_mem buffer.
+		 */
+		if(size != 0) {
+			mem.device_pointer = (device_ptr)clCreateBuffer(cxContext,
+			                                                mem_flag,
+			                                                size,
+			                                                mem_ptr,
+			                                                &ciErr);
+			opencl_assert_err(ciErr, "clCreateBuffer");
+		}
+		else {
+			mem.device_pointer = null_mem;
+		}
 
 		stats.mem_alloc(size);
 		mem.device_size = size;
@@ -801,15 +1110,31 @@ public:
 	{
 		/* this is blocking */
 		size_t size = mem.memory_size();
-		opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, 0, size, (void*)mem.data_pointer, 0, NULL, NULL));
+		if(size != 0){
+			opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
+			                                   CL_MEM_PTR(mem.device_pointer),
+			                                   CL_TRUE,
+			                                   0,
+			                                   size,
+			                                   (void*)mem.data_pointer,
+			                                   0,
+			                                   NULL, NULL));
+		}
 	}
 
 	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
 	{
 		size_t offset = elem*y*w;
 		size_t size = elem*w*h;
-
-		opencl_assert(clEnqueueReadBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, offset, size, (uchar*)mem.data_pointer + offset, 0, NULL, NULL));
+		assert(size != 0);
+		opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
+		                                  CL_MEM_PTR(mem.device_pointer),
+		                                  CL_TRUE,
+		                                  offset,
+		                                  size,
+		                                  (uchar*)mem.data_pointer + offset,
+		                                  0,
+		                                  NULL, NULL));
 	}
 
 	void mem_zero(device_memory& mem)
@@ -823,7 +1148,9 @@ public:
 	void mem_free(device_memory& mem)
 	{
 		if(mem.device_pointer) {
-			opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
+			if(mem.device_pointer != null_mem) {
+				opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
+			}
 			mem.device_pointer = 0;
 
 			stats.mem_free(mem.device_size);
@@ -850,8 +1177,12 @@ public:
 		mem_copy_to(*i->second);
 	}
 
-	void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic)
+	void tex_alloc(const char *name,
+	               device_memory& mem,
+	               InterpolationType /*interpolation*/,
+	               ExtensionType /*extension*/)
 	{
+		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
 		mem_alloc(mem, MEM_READ_ONLY);
 		mem_copy_to(mem);
 		assert(mem_map.find(name) == mem_map.end());
@@ -904,42 +1235,6 @@ public:
 		opencl_assert(clFlush(cqCommandQueue));
 	}
 
-	void path_trace(RenderTile& rtile, int sample)
-	{
-		/* cast arguments to cl types */
-		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
-		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
-		cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
-		cl_int d_x = rtile.x;
-		cl_int d_y = rtile.y;
-		cl_int d_w = rtile.w;
-		cl_int d_h = rtile.h;
-		cl_int d_sample = sample;
-		cl_int d_offset = rtile.offset;
-		cl_int d_stride = rtile.stride;
-
-		/* sample arguments */
-		cl_uint narg = 0;
-
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_data), (void*)&d_data));
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_buffer), (void*)&d_buffer));
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_rng_state), (void*)&d_rng_state));
-
-#define KERNEL_TEX(type, ttype, name) \
-	set_kernel_arg_mem(ckPathTraceKernel, &narg, #name);
-#include "kernel_textures.h"
-
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_sample), (void*)&d_sample));
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_x), (void*)&d_x));
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_y), (void*)&d_y));
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_w), (void*)&d_w));
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_h), (void*)&d_h));
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_offset), (void*)&d_offset));
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_stride), (void*)&d_stride));
-
-		enqueue_kernel(ckPathTraceKernel, d_w, d_h);
-	}
-
 	void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
 	{
 		cl_mem ptr;
@@ -970,29 +1265,30 @@ public:
 		cl_int d_offset = task.offset;
 		cl_int d_stride = task.stride;
 
-		/* sample arguments */
-		cl_uint narg = 0;
-
 
 		cl_kernel ckFilmConvertKernel = (rgba_byte)? ckFilmConvertByteKernel: ckFilmConvertHalfFloatKernel;
 
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_data), (void*)&d_data));
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_rgba), (void*)&d_rgba));
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_buffer), (void*)&d_buffer));
+		cl_uint start_arg_index =
+			kernel_set_args(ckFilmConvertKernel,
+			                0,
+			                d_data,
+			                d_rgba,
+			                d_buffer);
 
 #define KERNEL_TEX(type, ttype, name) \
-	set_kernel_arg_mem(ckFilmConvertKernel, &narg, #name);
+	set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
 #include "kernel_textures.h"
-
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_sample_scale), (void*)&d_sample_scale));
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_x), (void*)&d_x));
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_y), (void*)&d_y));
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_w), (void*)&d_w));
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_h), (void*)&d_h));
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_offset), (void*)&d_offset));
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_stride), (void*)&d_stride));
-
-
+#undef KERNEL_TEX
+
+		start_arg_index += kernel_set_args(ckFilmConvertKernel,
+		                                   start_arg_index,
+		                                   d_sample_scale,
+		                                   d_x,
+		                                   d_y,
+		                                   d_w,
+		                                   d_h,
+		                                   d_offset,
+		                                   d_stride);
 
 		enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
 	}
@@ -1008,9 +1304,6 @@ public:
 		cl_int d_shader_w = task.shader_w;
 		cl_int d_offset = task.offset;
 
-		/* sample arguments */
-		cl_uint narg = 0;
-
 		cl_kernel kernel;
 
 		if(task.shader_eval_type >= SHADER_EVAL_BAKE)
@@ -1025,19 +1318,25 @@ public:
 
 			cl_int d_sample = sample;
 
-			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data));
-			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input));
-			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), (void*)&d_output));
+			cl_uint start_arg_index =
+				kernel_set_args(kernel,
+				                0,
+				                d_data,
+				                d_input,
+				                d_output);
 
 #define KERNEL_TEX(type, ttype, name) \
-		set_kernel_arg_mem(kernel, &narg, #name);
+		set_kernel_arg_mem(kernel, &start_arg_index, #name);
 #include "kernel_textures.h"
+#undef KERNEL_TEX
 
-			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
-			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
-			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
-			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_offset), (void*)&d_offset));
-			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_sample), (void*)&d_sample));
+			start_arg_index += kernel_set_args(kernel,
+			                                   start_arg_index,
+			                                   d_shader_eval_type,
+			                                   d_shader_x,
+			                                   d_shader_w,
+			                                   d_offset,
+			                                   d_sample);
 
 			enqueue_kernel(kernel, task.shader_w, 1);
 
@@ -1045,6 +1344,420 @@ public:
 		}
 	}
 
+	class OpenCLDeviceTask : public DeviceTask {
+	public:
+		OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
+		: DeviceTask(task)
+		{
+			run = function_bind(&OpenCLDeviceBase::thread_run,
+			                    device,
+			                    this);
+		}
+	};
+
+	int get_split_task_count(DeviceTask& /*task*/)
+	{
+		return 1;
+	}
+
+	void task_add(DeviceTask& task)
+	{
+		task_pool.push(new OpenCLDeviceTask(this, task));
+	}
+
+	void task_wait()
+	{
+		task_pool.wait();
+	}
+
+	void task_cancel()
+	{
+		task_pool.cancel();
+	}
+
+	virtual void thread_run(DeviceTask * /*task*/) = 0;
+
+protected:
+	string kernel_build_options(const string *debug_src = NULL)
+	{
+		/* Platform-dependent compiler options; callers append their own flags. */
+		string build_options = " -cl-fast-relaxed-math ";
+		if(platform_name == "NVIDIA CUDA") {
+			build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
+			                 "-cl-nv-maxrregcount=32 "
+			                 "-cl-nv-verbose ";
+			/* Zero-init: a failed query must not leak garbage into the define. */
+			cl_uint compute_capability_major = 0, compute_capability_minor = 0;
+			clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
+			                sizeof(cl_uint), &compute_capability_major, NULL);
+			clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
+			                sizeof(cl_uint), &compute_capability_minor, NULL);
+
+			build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
+			                               compute_capability_major * 100 +
+			                               compute_capability_minor * 10);
+		}
+
+		else if(platform_name == "Apple")
+			build_options += "-D__KERNEL_OPENCL_APPLE__ ";
+
+		else if(platform_name == "AMD Accelerated Parallel Processing")
+			build_options += "-D__KERNEL_OPENCL_AMD__ ";
+
+		else if(platform_name == "Intel(R) OpenCL") {
+			build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
+
+			/* Options for gdb source-level kernel debugging (debug_src is the
+			 * path handed to -s). This currently segfaults on Linux.
+			 */
+			if(opencl_kernel_use_debug() && debug_src)
+				build_options += "-g -s \"" + *debug_src + "\" ";
+		}
+
+		if(opencl_kernel_use_debug())
+			build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
+
+#ifdef WITH_CYCLES_DEBUG
+		build_options += "-D__KERNEL_DEBUG__ ";
+#endif
+
+		return build_options;
+	}
+
+	class ArgumentWrapper {
+	public:
+		ArgumentWrapper() : size(0), pointer(NULL) {}
+		template <typename T>
+		ArgumentWrapper(T& argument) : size(sizeof(argument)),
+		                               pointer(&argument) { }
+		size_t size;
+		void *pointer;
+	};
+
+	/* TODO(sergey): In the future we can use variadic templates, once
+	 * C++0x is allowed. That should allow cleaning this up a bit.
+	 */
+	int kernel_set_args(cl_kernel kernel,
+	                    int start_argument_index,
+	                    const ArgumentWrapper& arg1 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg2 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg3 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg4 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg5 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg6 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg7 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg8 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg9 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg10 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg11 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg12 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg13 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg14 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg15 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg16 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg17 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg18 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg19 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg20 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg21 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg22 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg23 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg24 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg25 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg26 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg27 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg28 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg29 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg30 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg31 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg32 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg33 = ArgumentWrapper())
+	{
+		int current_arg_index = 0;
+#define FAKE_VARARG_HANDLE_ARG(arg) \
+		do { \
+			if(arg.pointer != NULL) { \
+				opencl_assert(clSetKernelArg( \
+					kernel, \
+					start_argument_index + current_arg_index, \
+					arg.size, arg.pointer)); \
+				++current_arg_index; \
+			} \
+			else { \
+				return current_arg_index; \
+			} \
+		} while(false)
+		FAKE_VARARG_HANDLE_ARG(arg1);
+		FAKE_VARARG_HANDLE_ARG(arg2);
+		FAKE_VARARG_HANDLE_ARG(arg3);
+		FAKE_VARARG_HANDLE_ARG(arg4);
+		FAKE_VARARG_HANDLE_ARG(arg5);
+		FAKE_VARARG_HANDLE_ARG(arg6);
+		FAKE_VARARG_HANDLE_ARG(arg7);
+		FAKE_VARARG_HANDLE_ARG(arg8);
+		FAKE_VARARG_HANDLE_ARG(arg9);
+		FAKE_VARARG_HANDLE_ARG(arg10);
+		FAKE_VARARG_HANDLE_ARG(arg11);
+		FAKE_VARARG_HANDLE_ARG(arg12);
+		FAKE_VARARG_HANDLE_ARG(arg13);
+		FAKE_VARARG_HANDLE_ARG(arg14);
+		FAKE_VARARG_HANDLE_ARG(arg15);
+		FAKE_VARARG_HANDLE_ARG(arg16);
+		FAKE_VARARG_HANDLE_ARG(arg17);
+		FAKE_VARARG_HANDLE_ARG(arg18);
+		FAKE_VARARG_HANDLE_ARG(arg19);
+		FAKE_VARARG_HANDLE_ARG(arg20);
+		FAKE_VARARG_HANDLE_ARG(arg21);
+		FAKE_VARARG_HANDLE_ARG(arg22);
+		FAKE_VARARG_HANDLE_ARG(arg23);
+		FAKE_VARARG_HANDLE_ARG(arg24);
+		FAKE_VARARG_HANDLE_ARG(arg25);
+		FAKE_VARARG_HANDLE_ARG(arg26);
+		FAKE_VARARG_HANDLE_ARG(arg27);
+		FAKE_VARARG_HANDLE_ARG(arg28);
+		FAKE_VARARG_HANDLE_ARG(arg29);
+		FAKE_VARARG_HANDLE_ARG(arg30);
+		FAKE_VARARG_HANDLE_ARG(arg31);
+		FAKE_VARARG_HANDLE_ARG(arg32);
+		FAKE_VARARG_HANDLE_ARG(arg33);
+#undef FAKE_VARARG_HANDLE_ARG
+		return current_arg_index;
+	}
+
+	inline void release_kernel_safe(cl_kernel kernel)
+	{
+		/* A handle that was never created is left alone. */
+		if(kernel != NULL)
+			clReleaseKernel(kernel);
+	}
+
+	inline void release_mem_object_safe(cl_mem mem)
+	{
+		/* A buffer that was never allocated is left alone. */
+		if(mem)
+			clReleaseMemObject(mem);
+	}
+
+	inline void release_program_safe(cl_program program)
+	{
+		/* A program that was never loaded is left alone. */
+		if(program != NULL)
+			clReleaseProgram(program);
+	}
+
+	/* Translate the requested feature set into OpenCL preprocessor
+	 * definitions; every option is emitted with a leading space.
+	 */
+	string build_options_from_requested_features(
+	        const DeviceRequestedFeatures& requested_features)
+	{
+		string options;
+		if(requested_features.experimental)
+			options += " -D__KERNEL_EXPERIMENTAL__";
+		/* The numeric limits are always part of the options. */
+		options += string_printf(" -D__NODES_MAX_GROUP__=%d",
+		                         requested_features.max_nodes_group);
+		options += string_printf(" -D__NODES_FEATURES__=%d",
+		                         requested_features.nodes_features);
+		options += string_printf(" -D__MAX_CLOSURE__=%d",
+		                         requested_features.max_closure);
+		/* Each feature that is not requested adds its __NO_*__ define. */
+		if(!requested_features.use_hair)
+			options += " -D__NO_HAIR__";
+		if(!requested_features.use_object_motion)
+			options += " -D__NO_OBJECT_MOTION__";
+		if(!requested_features.use_camera_motion)
+			options += " -D__NO_CAMERA_MOTION__";
+		if(!requested_features.use_baking)
+			options += " -D__NO_BAKING__";
+		return options;
+	}
+
+	/* ** Those guys are for working around some compiler-specific bugs ** */
+
+	/* Default lookup: fetch the program from OpenCLCache for this device's
+	 * platform/device pair; the requested features are not consulted. */
+	virtual cl_program load_cached_kernel(
+	        const DeviceRequestedFeatures& /*requested_features*/,
+	        OpenCLCache::ProgramName program_name,
+	        thread_scoped_lock& cache_locker)
+	{
+		return OpenCLCache::get_program(
+		        cpPlatform, cdDevice, program_name, cache_locker);
+	}
+
+	/* Default store: hand the program over to OpenCLCache under the given
+	 * platform/device/name, passing every argument through unchanged.
+	 */
+	virtual void store_cached_kernel(cl_platform_id platform,
+	                                 cl_device_id device,
+	                                 cl_program program,
+	                                 OpenCLCache::ProgramName program_name,
+	                                 thread_scoped_lock& cache_locker)
+	{
+		OpenCLCache::store_program(
+		        platform, device, program, program_name, cache_locker);
+	}
+
+	virtual string build_options_for_base_program(
+	        const DeviceRequestedFeatures& /*requested_features*/)
+	{
+		/* TODO(sergey): By default we compile all features, meaning
+		 * mega kernel is not getting feature-based optimizations.
+		 *
+		 * Ideally we need to always compile the kernel with as few features
+		 * enabled as possible to keep performance at its max.
+		 */
+		return "";
+	}
+};
+
+class OpenCLDeviceMegaKernel : public OpenCLDeviceBase
+{
+public:
+	cl_kernel ckPathTraceKernel;
+	cl_program path_trace_program;
+
+	OpenCLDeviceMegaKernel(DeviceInfo& info, Stats &stats, bool background_)
+	: OpenCLDeviceBase(info, stats, background_),
+	  /* Nothing is loaded yet; load_kernels() fills these in. */
+	  ckPathTraceKernel(NULL),
+	  path_trace_program(NULL)
+	{}
+
+	bool load_kernels(const DeviceRequestedFeatures& requested_features)
+	{
+		/* Load the shader, bake and film convert kernels of the base class.
+		 * This also verifies that OpenCL was actually initialized.
+		 */
+		if(!OpenCLDeviceBase::load_kernels(requested_features)) {
+			return false;
+		}
+
+		/* Try to use the program cached for this platform/device. */
+		thread_scoped_lock cache_locker;
+		path_trace_program = OpenCLCache::get_program(cpPlatform,
+		                                              cdDevice,
+		                                              OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
+		                                              cache_locker);
+
+		if(!path_trace_program) {
+			/* Verify we have the right OpenCL version. */
+			if(!opencl_version_check())
+				return false;
+
+			/* Calculate md5 hashes to detect kernel or device changes. */
+			string kernel_path = path_get("kernel");
+			string kernel_md5 = path_files_md5_hash(kernel_path);
+			string custom_kernel_build_options = "-D__COMPILE_ONLY_MEGAKERNEL__ ";
+			string device_md5 = device_md5_hash(custom_kernel_build_options);
+
+			/* Path to cached binary; both hashes are part of the file name. */
+			string clbin = string_printf("cycles_kernel_%s_%s.clbin",
+			                             device_md5.c_str(),
+			                             kernel_md5.c_str());
+			clbin = path_user_get(path_join("cache", clbin));
+
+			/* Path to preprocessed source, only set when debugging. */
+			string clsrc, *debug_src = NULL;
+			if(opencl_kernel_use_debug()) {
+				clsrc = string_printf("cycles_kernel_%s_%s.cl",
+				                      device_md5.c_str(),
+				                      kernel_md5.c_str());
+				clsrc = path_user_get(path_join("cache", clsrc));
+				debug_src = &clsrc;
+			}
+
+			/* If a binary already exists, try to use it. */
+			if(path_exists(clbin) && load_binary(kernel_path,
+			                                     clbin,
+			                                     custom_kernel_build_options,
+			                                     &path_trace_program,
+			                                     debug_src)) {
+				/* Kernel loaded from binary, nothing to do. */
+			}
+			else {
+				string init_kernel_source = "#include \"kernels/opencl/kernel.cl\" // " +
+				                            kernel_md5 + "\n";
+				/* No binary, or loading it failed: compile the kernel. */
+				if(!compile_kernel(kernel_path,
+				                   init_kernel_source,
+				                   custom_kernel_build_options,
+				                   &path_trace_program,
+				                   debug_src))
+				{
+					return false;
+				}
+				/* Save binary for reuse; failing to save fails the load. */
+				if(!save_binary(&path_trace_program, clbin)) {
+					return false;
+				}
+			}
+			/* Cache the program so later loads can skip the steps above. */
+			OpenCLCache::store_program(cpPlatform,
+			                           cdDevice,
+			                           path_trace_program,
+			                           OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
+			                           cache_locker);
+		}
+
+		/* Find the path trace entry point in the program. */
+		ckPathTraceKernel = clCreateKernel(path_trace_program,
+		                                   "kernel_ocl_path_trace",
+		                                   &ciErr);
+		if(opencl_error(ciErr))
+			return false;
+		return true;
+	}
+
+	~OpenCLDeviceMegaKernel()
+	{
+		task_pool.stop();
+		release_kernel_safe(ckPathTraceKernel);  /* No-op while still NULL. */
+		release_program_safe(path_trace_program);  /* No-op while still NULL. */
+	}
+
+	void path_trace(RenderTile& rtile, int sample)
+	{
+		/* Enqueue the path trace kernel for one sample; cast arguments first. */
+		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
+		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
+		cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
+		cl_int d_x = rtile.x;
+		cl_int d_y = rtile.y;
+		cl_int d_w = rtile.w;
+		cl_int d_h = rtile.h;
+		cl_int d_offset = rtile.offset;
+		cl_int d_stride = rtile.stride;
+
+		/* Sample arguments. */
+		cl_int d_sample = sample;
+
+		cl_uint start_arg_index =
+			kernel_set_args(ckPathTraceKernel,
+			                0,
+			                d_data,
+			                d_buffer,
+			                d_rng_state);
+		/* One memory argument per KERNEL_TEX entry of kernel_textures.h. */
+#define KERNEL_TEX(type, ttype, name) \
+		set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+
+		start_arg_index += kernel_set_args(ckPathTraceKernel,
+		                                   start_arg_index,
+		                                   d_sample,
+		                                   d_x,
+		                                   d_y,
+		                                   d_w,
+		                                   d_h,
+		                                   d_offset,
+		                                   d_stride);
+
+		enqueue_kernel(ckPathTraceKernel, d_w, d_h);
+	}
+
 	void thread_run(DeviceTask *task)
 	{
 		if(task->type == DeviceTask::FILM_CONVERT) {
@@ -1055,8 +1768,7 @@ public:
 		}
 		else if(task->type == DeviceTask::PATH_TRACE) {
 			RenderTile tile;
-			
-			/* keep rendering tiles until done */
+			/* Keep rendering tiles until done. */
 			while(task->acquire_tile(this, tile)) {
 				int start_sample = tile.start_sample;
 				int end_sample = tile.start_sample + tile.num_samples;
@@ -1074,61 +1786,1847 @@ public:
 					task->update_progress(&tile);
 				}
 
+				/* Complete kernel execution before releasing the tile. */
+				/* This helps in multi-device renders:
+				 * without it, the device that reaches the critical-section
+				 * function release_tile waits there (stalling other devices
+				 * from entering release_tile) for all its kernels to complete.
+				 * If device1 (a slow-render device) reaches release_tile first,
+				 * it would stall device2 (a fast-render device) from proceeding
+				 * to render its next tile.
+				 */
+				clFinish(cqCommandQueue);
+
 				task->release_tile(tile);
 			}
 		}
 	}
+};
 
-	class OpenCLDeviceTask : public DeviceTask {
-	public:
-		OpenCLDeviceTask(OpenCLDevice *device, DeviceTask& task)
-		: DeviceTask(task)
+/* TODO(sergey): This is to keep tile split on OpenCL level working
+ * for now, since without this view-port render does not work as it
+ * should.
+ *
+ * Ideally it'll be done on the higher level, but we need to get ready
+ * for merge rather soon, so let's keep split logic private here in
+ * the file.
+ */
+class SplitRenderTile : public RenderTile {
+public:
+	/* Empty tile with every split-specific offset and stride at zero. */
+	SplitRenderTile()
+	: RenderTile(),
+	  buffer_offset_x(0), buffer_offset_y(0),
+	  rng_state_offset_x(0), rng_state_offset_y(0),
+	  buffer_rng_state_stride(0)
+	{}
+
+	/* Build from an existing tile: the RenderTile fields are copied one
+	 * by one from `tile`, while the split-specific offsets and stride
+	 * start at zero.
+	 */
+	explicit SplitRenderTile(RenderTile& tile)
+	: RenderTile(),
+	  buffer_offset_x(0), buffer_offset_y(0),
+	  rng_state_offset_x(0), rng_state_offset_y(0),
+	  buffer_rng_state_stride(0)
+	{
+		buffers = tile.buffers;
+		buffer = tile.buffer;
+		rng_state = tile.rng_state;
+		offset = tile.offset;
+		stride = tile.stride;
+		start_sample = tile.start_sample;
+		num_samples = tile.num_samples;
+		sample = tile.sample;
+		resolution = tile.resolution;
+		h = tile.h;
+		w = tile.w;
+		y = tile.y;
+		x = tile.x;
+	}
+
+	/* The split kernel is constrained by device global memory, so it
+	 * cannot render a big tile in one go ("big" being relative to the
+	 * available device global memory). Such a tile is split further and
+	 * path_trace is called on each piece; the variables below assist in
+	 * achieving that.
+	 */
+	int buffer_offset_x;
+	int buffer_offset_y;
+	int rng_state_offset_x;
+	int rng_state_offset_y;
+	int buffer_rng_state_stride;
+};
+
+/* OpenCLDeviceSplitKernel's declaration/definition. */
+class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
+{
+public:
+	/* Kernel declaration. */
+	cl_kernel ckPathTraceKernel_data_init;
+	cl_kernel ckPathTraceKernel_scene_intersect;
+	cl_kernel ckPathTraceKernel_lamp_emission;
+	cl_kernel ckPathTraceKernel_queue_enqueue;
+	cl_kernel ckPathTraceKernel_background_buffer_update;
+	cl_kernel ckPathTraceKernel_shader_eval;
+	cl_kernel ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao;
+	cl_kernel ckPathTraceKernel_direct_lighting;
+	cl_kernel ckPathTraceKernel_shadow_blocked;
+	cl_kernel ckPathTraceKernel_next_iteration_setup;
+	cl_kernel ckPathTraceKernel_sum_all_radiance;
+
+	/* cl_program declaration. */
+	cl_program data_init_program;
+	cl_program scene_intersect_program;
+	cl_program lamp_emission_program;
+	cl_program queue_enqueue_program;
+	cl_program background_buffer_update_program;
+	cl_program shader_eval_program;
+	cl_program holdout_emission_blurring_pathtermination_ao_program;
+	cl_program direct_lighting_program;
+	cl_program shadow_blocked_program;
+	cl_program next_iteration_setup_program;
+	cl_program sum_all_radiance_program;
+
+	/* Global memory variables [porting]; This memory is used for
+	 * co-operation between different kernels; Data written by one
+	 * kernel will be available to another kernel via this global
+	 * memory.
+	 */
+	cl_mem rng_coop;
+	cl_mem throughput_coop;
+	cl_mem L_transparent_coop;
+	cl_mem PathRadiance_coop;
+	cl_mem Ray_coop;
+	cl_mem PathState_coop;
+	cl_mem Intersection_coop;
+	cl_mem kgbuffer;  /* KernelGlobals buffer. */
+
+	/* Global buffers for ShaderData. */
+	cl_mem sd;             /* ShaderData used in the main path-iteration loop. */
+	cl_mem sd_DL_shadow;   /* ShaderData used in Direct Lighting and
+	                        * shadow_blocked kernel.
+	                        */
+
+	/* Global buffers of each member of ShaderData. */
+	cl_mem P_sd;
+	cl_mem P_sd_DL_shadow;
+	cl_mem N_sd;
+	cl_mem N_sd_DL_shadow;
+	cl_mem Ng_sd;
+	cl_mem Ng_sd_DL_shadow;
+	cl_mem I_sd;
+	cl_mem I_sd_DL_shadow;
+	cl_mem shader_sd;
+	cl_mem shader_sd_DL_shadow;
+	cl_mem flag_sd;
+	cl_mem flag_sd_DL_shadow;
+	cl_mem prim_sd;
+	cl_mem prim_sd_DL_shadow;
+	cl_mem type_sd;
+	cl_mem type_sd_DL_shadow;
+	cl_mem u_sd;
+	cl_mem u_sd_DL_shadow;
+	cl_mem v_sd;
+	cl_mem v_sd_DL_shadow;
+	cl_mem object_sd;
+	cl_mem object_sd_DL_shadow;
+	cl_mem time_sd;
+	cl_mem time_sd_DL_shadow;
+	cl_mem ray_length_sd;
+	cl_mem ray_length_sd_DL_shadow;
+	cl_mem ray_depth_sd;
+	cl_mem ray_depth_sd_DL_shadow;
+	cl_mem transparent_depth_sd;
+	cl_mem transparent_depth_sd_DL_shadow;
+
+	/* Ray differentials. */
+	cl_mem dP_sd, dI_sd;
+	cl_mem dP_sd_DL_shadow, dI_sd_DL_shadow;
+	cl_mem du_sd, dv_sd;
+	cl_mem du_sd_DL_shadow, dv_sd_DL_shadow;
+
+	/* Dp/Du */
+	cl_mem dPdu_sd, dPdv_sd;
+	cl_mem dPdu_sd_DL_shadow, dPdv_sd_DL_shadow;
+
+	/* Object motion. */
+	cl_mem ob_tfm_sd, ob_itfm_sd;
+	cl_mem ob_tfm_sd_DL_shadow, ob_itfm_sd_DL_shadow;
+
+	cl_mem closure_sd;
+	cl_mem closure_sd_DL_shadow;
+	cl_mem num_closure_sd;
+	cl_mem num_closure_sd_DL_shadow;
+	cl_mem randb_closure_sd;
+	cl_mem randb_closure_sd_DL_shadow;
+	cl_mem ray_P_sd;
+	cl_mem ray_P_sd_DL_shadow;
+	cl_mem ray_dP_sd;
+	cl_mem ray_dP_sd_DL_shadow;
+
+	/* Global memory required for shadow blocked and accum_radiance. */
+	cl_mem BSDFEval_coop;
+	cl_mem ISLamp_coop;
+	cl_mem LightRay_coop;
+	cl_mem AOAlpha_coop;
+	cl_mem AOBSDF_coop;
+	cl_mem AOLightRay_coop;
+	cl_mem Intersection_coop_AO;
+	cl_mem Intersection_coop_DL;
+
+#ifdef WITH_CYCLES_DEBUG
+	/* DebugData memory */
+	cl_mem debugdata_coop;
+#endif
+
+	/* Global state array that tracks ray state. */
+	cl_mem ray_state;
+
+	/* Per sample buffers. */
+	cl_mem per_sample_output_buffers;
+
+	/* Denotes which sample each ray is being processed for. */
+	cl_mem work_array;
+
+	/* Queue */
+	cl_mem Queue_data;  /* Array of size queuesize * num_queues * sizeof(int). */
+	cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
+	                     * Tracks the size of each queue.
+	                     */
+
+	/* Flag to make sceneintersect and lampemission kernel use queues. */
+	cl_mem use_queues_flag;
+
+	/* Amount of memory in output buffer associated with one pixel/thread. */
+	size_t per_thread_output_buffer_size;
+
+	/* Total allocatable available device memory. */
+	size_t total_allocatable_memory;
+
+	/* host version of ray_state; Used in checking host path-iteration
+	 * termination.
+	 */
+	char *hostRayStateArray;
+
+	/* Number of path-iterations to be done in one shot. */
+	unsigned int PathIteration_times;
+
+#ifdef __WORK_STEALING__
+	/* Work pool with respect to each work group. */
+	cl_mem work_pool_wgs;
+
+	/* Denotes the maximum work groups possible w.r.t. current tile size. */
+	unsigned int max_work_groups;
+#endif
+
+	/* clos_max value for which the kernels have been loaded currently. */
+	int current_max_closure;
+
+	/* Marked True in constructor and marked false at the end of path_trace(). */
+	bool first_tile;
+
+	/* Clear every handle, then query the device's allocation limit. */
+	OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
+	: OpenCLDeviceBase(info, stats, background_)
+	{
+		background = background_;
+		/* Initialize kernels. */
+		ckPathTraceKernel_data_init = NULL;
+		ckPathTraceKernel_scene_intersect = NULL;
+		ckPathTraceKernel_lamp_emission = NULL;
+		ckPathTraceKernel_background_buffer_update = NULL;
+		ckPathTraceKernel_shader_eval = NULL;
+		ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao = NULL;
+		ckPathTraceKernel_direct_lighting = NULL;
+		ckPathTraceKernel_shadow_blocked = NULL;
+		ckPathTraceKernel_next_iteration_setup = NULL;
+		ckPathTraceKernel_sum_all_radiance = NULL;
+		ckPathTraceKernel_queue_enqueue = NULL;
+
+		/* Initialize program. */
+		data_init_program = NULL;
+		scene_intersect_program = NULL;
+		lamp_emission_program = NULL;
+		queue_enqueue_program = NULL;
+		background_buffer_update_program = NULL;
+		shader_eval_program = NULL;
+		holdout_emission_blurring_pathtermination_ao_program = NULL;
+		direct_lighting_program = NULL;
+		shadow_blocked_program = NULL;
+		next_iteration_setup_program = NULL;
+		sum_all_radiance_program = NULL;
+
+		/* Initialize cl_mem variables. */
+		kgbuffer = NULL;
+		sd = NULL;
+		sd_DL_shadow = NULL;
+
+		P_sd = NULL;
+		P_sd_DL_shadow = NULL;
+		N_sd = NULL;
+		N_sd_DL_shadow = NULL;
+		Ng_sd = NULL;
+		Ng_sd_DL_shadow = NULL;
+		I_sd = NULL;
+		I_sd_DL_shadow = NULL;
+		shader_sd = NULL;
+		shader_sd_DL_shadow = NULL;
+		flag_sd = NULL;
+		flag_sd_DL_shadow = NULL;
+		prim_sd = NULL;
+		prim_sd_DL_shadow = NULL;
+		type_sd = NULL;
+		type_sd_DL_shadow = NULL;
+		u_sd = NULL;
+		u_sd_DL_shadow = NULL;
+		v_sd = NULL;
+		v_sd_DL_shadow = NULL;
+		object_sd = NULL;
+		object_sd_DL_shadow = NULL;
+		time_sd = NULL;
+		time_sd_DL_shadow = NULL;
+		ray_length_sd = NULL;
+		ray_length_sd_DL_shadow = NULL;
+		ray_depth_sd = NULL;
+		ray_depth_sd_DL_shadow = NULL;
+		transparent_depth_sd = NULL;
+		transparent_depth_sd_DL_shadow = NULL;
+
+		/* Ray differentials. */
+		dP_sd = NULL;
+		dI_sd = NULL;
+		dP_sd_DL_shadow = NULL;
+		dI_sd_DL_shadow = NULL;
+		du_sd = NULL;
+		dv_sd = NULL;
+		du_sd_DL_shadow = NULL;
+		dv_sd_DL_shadow = NULL;
+
+		/* Dp/Du */
+		dPdu_sd = NULL;
+		dPdv_sd = NULL;
+		dPdu_sd_DL_shadow = NULL;
+		dPdv_sd_DL_shadow = NULL;
+
+		/* Object motion. */
+		ob_tfm_sd = NULL;
+		ob_itfm_sd = NULL;
+		ob_tfm_sd_DL_shadow = NULL;
+		ob_itfm_sd_DL_shadow = NULL;
+
+		closure_sd = NULL;
+		closure_sd_DL_shadow = NULL;
+		num_closure_sd = NULL;
+		num_closure_sd_DL_shadow = NULL;
+		randb_closure_sd = NULL;
+		randb_closure_sd_DL_shadow = NULL;
+		ray_P_sd = NULL;
+		ray_P_sd_DL_shadow = NULL;
+		ray_dP_sd = NULL;
+		ray_dP_sd_DL_shadow = NULL;
+
+		rng_coop = NULL;
+		throughput_coop = NULL;
+		L_transparent_coop = NULL;
+		PathRadiance_coop = NULL;
+		Ray_coop = NULL;
+		PathState_coop = NULL;
+		Intersection_coop = NULL;
+		ray_state = NULL;
+
+		AOAlpha_coop = NULL;
+		AOBSDF_coop = NULL;
+		AOLightRay_coop = NULL;
+		BSDFEval_coop = NULL;
+		ISLamp_coop = NULL;
+		LightRay_coop = NULL;
+		Intersection_coop_AO = NULL;
+		Intersection_coop_DL = NULL;
+
+#ifdef WITH_CYCLES_DEBUG
+		debugdata_coop = NULL;
+#endif
+
+		work_array = NULL;
+
+		/* Queue. */
+		Queue_data = NULL;
+		Queue_index = NULL;
+		use_queues_flag = NULL;
+
+		per_sample_output_buffers = NULL;
+
+		per_thread_output_buffer_size = 0;
+		hostRayStateArray = NULL;
+		PathIteration_times = PATH_ITER_INC_FACTOR;
+#ifdef __WORK_STEALING__
+		work_pool_wgs = NULL;
+		max_work_groups = 0;
+#endif
+		current_max_closure = -1;
+		first_tile = true;
+
+		/* Get device's maximum memory that can be allocated. The value is a
+		 * cl_ulong, so query it as such: size_t is too small on 32-bit hosts. */
+		cl_ulong max_alloc_size = 0;
+		ciErr = clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+		                        sizeof(max_alloc_size), &max_alloc_size, NULL);
+		assert(ciErr == CL_SUCCESS);
+		/* Clamp rather than truncate if the limit does not fit in size_t. */
+		total_allocatable_memory = (max_alloc_size > (cl_ulong)(size_t)-1)
+		                           ? (size_t)-1 : (size_t)max_alloc_size;
+		if(platform_name == "AMD Accelerated Parallel Processing") {
+			/* Tweak-able: AMD does not seem to give maximum performance when
+			 * all of CL_DEVICE_MAX_MEM_ALLOC_SIZE is considered. */
+			total_allocatable_memory /= 2;
+		}
+	}
+
+	/* TODO(sergey): Seems really close to load_kernel(),
+	 * could it be de-duplicated?
+	 */
+	/* Obtain *program for one split kernel: reuse the binary cached under
+	 * the user "cache" directory when it loads, otherwise compile from
+	 * source and save the result. Returns false when the OpenCL version
+	 * check, the compilation or saving the binary fails.
+	 */
+	bool load_split_kernel(string kernel_path,
+	                       string kernel_init_source,
+	                       string clbin,
+	                       string custom_kernel_build_options,
+	                       cl_program *program,
+	                       const string *debug_src = NULL)
+	{
+		if(!opencl_version_check()) {
+			return false;
+		}
+
+		clbin = path_user_get(path_join("cache", clbin));
+
+		/* Fast path: a previously saved binary that still loads. */
+		const bool loaded_from_binary =
+			path_exists(clbin) &&
+			load_binary(kernel_path,
+			            clbin,
+			            custom_kernel_build_options,
+			            program,
+			            debug_src);
+		if(loaded_from_binary) {
+			return true;
+		}
+		/* Slow path: build from source, then keep the binary for reuse. */
+		if(!compile_kernel(kernel_path, kernel_init_source,
+		                   custom_kernel_build_options, program, debug_src))
+		{
+			return false;
+		}
+		return save_binary(program, clbin);
+	}
+
+	/* Split kernel utility functions. */
+	size_t get_tex_size(const char *tex_name)
+	{
+		/* Names missing from mem_map report a size of zero. */
+		MemMap::iterator i = mem_map.find(tex_name);
+		if(i == mem_map.end())
+			return 0;
+		/* Ask the OpenCL runtime for the size of the memory object. */
+		size_t ret_size = 0;
+		ciErr = clGetMemObjectInfo(CL_MEM_PTR(i->second),
+		                           CL_MEM_SIZE,
+		                           sizeof(ret_size),
+		                           &ret_size,
+		                           NULL);
+		assert(ciErr == CL_SUCCESS);
+		return ret_size;
+	}
+
+	size_t get_shader_closure_size(int max_closure)
+	{
+		return max_closure * sizeof(ShaderClosure);  /* Bytes for the array. */
+	}
+
+	size_t get_shader_data_size(size_t shader_closure_size)
+	{
+		/* Replace the MAX_CLOSURE sized closure array in sizeof(ShaderData)
+		 * with shader_closure_size bytes; the remainder is kept as is. */
+		const size_t full_closure_size = sizeof(ShaderClosure) * MAX_CLOSURE;
+		return sizeof(ShaderData) - full_closure_size + shader_closure_size;
+	}
+
+	/* Returns size of KernelGlobals structure associated with OpenCL. */
+	size_t get_KernelGlobals_size()
+	{
+		/* Local dummy copy of the OpenCL KernelGlobals from kernel_globals.h,
+		 * declared only to fetch its size: a KernelData pointer followed by
+		 * one pointer per KERNEL_TEX entry of kernel_textures.h. */
+		typedef struct KernelGlobals {
+			ccl_constant KernelData *data;
+#define KERNEL_TEX(type, ttype, name) \
+	ccl_global type *name;
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+		} KernelGlobals;
+
+		return sizeof(KernelGlobals);
+	}
+
+	/* Returns size of the structure-of-arrays implementation of ShaderData. */
+	size_t get_shaderdata_soa_size()
+	{
+		size_t shader_soa_size = 0;
+		/* Every entry of kernel_shaderdata_vars.h adds one pointer. */
+#define SD_VAR(type, what) shader_soa_size += sizeof(void *);
+#define SD_CLOSURE_VAR(type, what, max_closure) shader_soa_size += sizeof(void *);
+		#include "kernel_shaderdata_vars.h"
+#undef SD_VAR
+#undef SD_CLOSURE_VAR
+
+		return shader_soa_size;
+	}
+
+	bool load_kernels(const DeviceRequestedFeatures& requested_features)
+	{
+		/* Get Shader, bake and film_convert kernels.
+		 * It'll also do verification of OpenCL actually initialized.
+		 */
+		if(!OpenCLDeviceBase::load_kernels(requested_features)) {
+			return false;
+		}
+
+		string kernel_path = path_get("kernel");
+		string kernel_md5 = path_files_md5_hash(kernel_path);
+		string device_md5;
+		string kernel_init_source;
+		string clbin;
+		string clsrc, *debug_src = NULL;
+
+		string build_options = "-D__SPLIT_KERNEL__";
+#ifdef __WORK_STEALING__
+		build_options += " -D__WORK_STEALING__";
+#endif
+		build_options += build_options_from_requested_features(requested_features);
+
+		/* Set compute device build option. */
+		cl_device_type device_type;
+		ciErr = clGetDeviceInfo(cdDevice,
+		                        CL_DEVICE_TYPE,
+		                        sizeof(cl_device_type),
+		                        &device_type,
+		                        NULL);
+		assert(ciErr == CL_SUCCESS);
+		if(device_type == CL_DEVICE_TYPE_GPU) {
+			build_options += " -D__COMPUTE_DEVICE_GPU__";
+		}
+		/* LOAD_KERNEL(name): load or compile <name>_program via load_split_kernel(). */
+#define GLUE(a, b) a ## b
+#define LOAD_KERNEL(name) \
+	do { \
+		kernel_init_source = "#include \"kernels/opencl/kernel_" #name ".cl\" // " + \
+		                     kernel_md5 + "\n"; \
+		device_md5 = device_md5_hash(build_options); \
+		clbin = string_printf("cycles_kernel_%s_%s_" #name ".clbin", \
+		                      device_md5.c_str(), kernel_md5.c_str()); \
+		if(opencl_kernel_use_debug()) { \
+			clsrc = string_printf("cycles_kernel_%s_%s_" #name ".cl", \
+			                      device_md5.c_str(), kernel_md5.c_str()); \
+			clsrc = path_user_get(path_join("cache", clsrc)); \
+			debug_src = &clsrc; \
+		} \
+		if(!load_split_kernel(kernel_path, kernel_init_source, clbin, \
+		                      build_options, \
+		                      &GLUE(name, _program), \
+		                      debug_src)) \
+		{ \
+			fprintf(stderr, "Failed to compile %s\n", #name); \
+			return false; \
+		} \
+	} while(false)
+
+		LOAD_KERNEL(data_init);
+		LOAD_KERNEL(scene_intersect);
+		LOAD_KERNEL(lamp_emission);
+		LOAD_KERNEL(queue_enqueue);
+		LOAD_KERNEL(background_buffer_update);
+		LOAD_KERNEL(shader_eval);
+		LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
+		LOAD_KERNEL(direct_lighting);
+		LOAD_KERNEL(shadow_blocked);
+		LOAD_KERNEL(next_iteration_setup);
+		LOAD_KERNEL(sum_all_radiance);
+
+#undef LOAD_KERNEL
+		/* FIND_KERNEL(name): create ckPathTraceKernel_<name> from <name>_program. */
+#define FIND_KERNEL(name) \
+	do { \
+		GLUE(ckPathTraceKernel_, name) = \
+			clCreateKernel(GLUE(name, _program), \
+			               "kernel_ocl_path_trace_"  #name, &ciErr); \
+		if(opencl_error(ciErr)) { \
+			fprintf(stderr,"Missing kernel kernel_ocl_path_trace_%s\n", #name); \
+			return false; \
+		} \
+	} while(false)
+
+		FIND_KERNEL(data_init);
+		FIND_KERNEL(scene_intersect);
+		FIND_KERNEL(lamp_emission);
+		FIND_KERNEL(queue_enqueue);
+		FIND_KERNEL(background_buffer_update);
+		FIND_KERNEL(shader_eval);
+		FIND_KERNEL(holdout_emission_blurring_pathtermination_ao);
+		FIND_KERNEL(direct_lighting);
+		FIND_KERNEL(shadow_blocked);
+		FIND_KERNEL(next_iteration_setup);
+		FIND_KERNEL(sum_all_radiance);
+#undef FIND_KERNEL
+#undef GLUE
+
+		/* Remember the closure limit these kernels were built for. */
+		current_max_closure = requested_features.max_closure;
+
+		return true;
+	}
+
+	~OpenCLDeviceSplitKernel()
+	{
+		task_pool.stop();
+
+		/* Release kernels; their programs are released further below. */
+		release_kernel_safe(ckPathTraceKernel_data_init);
+		release_kernel_safe(ckPathTraceKernel_scene_intersect);
+		release_kernel_safe(ckPathTraceKernel_lamp_emission);
+		release_kernel_safe(ckPathTraceKernel_queue_enqueue);
+		release_kernel_safe(ckPathTraceKernel_background_buffer_update);
+		release_kernel_safe(ckPathTraceKernel_shader_eval);
+		release_kernel_safe(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao);
+		release_kernel_safe(ckPathTraceKernel_direct_lighting);
+		release_kernel_safe(ckPathTraceKernel_shadow_blocked);
+		release_kernel_safe(ckPathTraceKernel_next_iteration_setup);
+		release_kernel_safe(ckPathTraceKernel_sum_all_radiance);
+
+		/* Release global memory; handles that are still NULL are skipped. */
+		release_mem_object_safe(P_sd);
+		release_mem_object_safe(P_sd_DL_shadow);
+		release_mem_object_safe(N_sd);
+		release_mem_object_safe(N_sd_DL_shadow);
+		release_mem_object_safe(Ng_sd);
+		release_mem_object_safe(Ng_sd_DL_shadow);
+		release_mem_object_safe(I_sd);
+		release_mem_object_safe(I_sd_DL_shadow);
+		release_mem_object_safe(shader_sd);
+		release_mem_object_safe(shader_sd_DL_shadow);
+		release_mem_object_safe(flag_sd);
+		release_mem_object_safe(flag_sd_DL_shadow);
+		release_mem_object_safe(prim_sd);
+		release_mem_object_safe(prim_sd_DL_shadow);
+		release_mem_object_safe(type_sd);
+		release_mem_object_safe(type_sd_DL_shadow);
+		release_mem_object_safe(u_sd);
+		release_mem_object_safe(u_sd_DL_shadow);
+		release_mem_object_safe(v_sd);
+		release_mem_object_safe(v_sd_DL_shadow);
+		release_mem_object_safe(object_sd);
+		release_mem_object_safe(object_sd_DL_shadow);
+		release_mem_object_safe(time_sd);
+		release_mem_object_safe(time_sd_DL_shadow);
+		release_mem_object_safe(ray_length_sd);
+		release_mem_object_safe(ray_length_sd_DL_shadow);
+		release_mem_object_safe(ray_depth_sd);
+		release_mem_object_safe(ray_depth_sd_DL_shadow);
+		release_mem_object_safe(transparent_depth_sd);
+		release_mem_object_safe(transparent_depth_sd_DL_shadow);
+
+		/* Ray differentials. */
+		release_mem_object_safe(dP_sd);
+		release_mem_object_safe(dP_sd_DL_shadow);
+		release_mem_object_safe(dI_sd);
+		release_mem_object_safe(dI_sd_DL_shadow);
+		release_mem_object_safe(du_sd);
+		release_mem_object_safe(du_sd_DL_shadow);
+		release_mem_object_safe(dv_sd);
+		release_mem_object_safe(dv_sd_DL_shadow);
+
+		/* Dp/Du */
+		release_mem_object_safe(dPdu_sd);
+		release_mem_object_safe(dPdu_sd_DL_shadow);
+		release_mem_object_safe(dPdv_sd);
+		release_mem_object_safe(dPdv_sd_DL_shadow);
+
+		/* Object motion. */
+		release_mem_object_safe(ob_tfm_sd);
+		release_mem_object_safe(ob_itfm_sd);
+
+		release_mem_object_safe(ob_tfm_sd_DL_shadow);
+		release_mem_object_safe(ob_itfm_sd_DL_shadow);
+
+		release_mem_object_safe(closure_sd);
+		release_mem_object_safe(closure_sd_DL_shadow);
+		release_mem_object_safe(num_closure_sd);
+		release_mem_object_safe(num_closure_sd_DL_shadow);
+		release_mem_object_safe(randb_closure_sd);
+		release_mem_object_safe(randb_closure_sd_DL_shadow);
+		release_mem_object_safe(ray_P_sd);
+		release_mem_object_safe(ray_P_sd_DL_shadow);
+		release_mem_object_safe(ray_dP_sd);
+		release_mem_object_safe(ray_dP_sd_DL_shadow);
+		release_mem_object_safe(rng_coop);
+		release_mem_object_safe(throughput_coop);
+		release_mem_object_safe(L_transparent_coop);
+		release_mem_object_safe(PathRadiance_coop);
+		release_mem_object_safe(Ray_coop);
+		release_mem_object_safe(PathState_coop);
+		release_mem_object_safe(Intersection_coop);
+		release_mem_object_safe(kgbuffer);
+		release_mem_object_safe(sd);
+		release_mem_object_safe(sd_DL_shadow);
+		release_mem_object_safe(ray_state);
+		release_mem_object_safe(AOAlpha_coop);
+		release_mem_object_safe(AOBSDF_coop);
+		release_mem_object_safe(AOLightRay_coop);
+		release_mem_object_safe(BSDFEval_coop);
+		release_mem_object_safe(ISLamp_coop);
+		release_mem_object_safe(LightRay_coop);
+		release_mem_object_safe(Intersection_coop_AO);
+		release_mem_object_safe(Intersection_coop_DL);
+#ifdef WITH_CYCLES_DEBUG
+		release_mem_object_safe(debugdata_coop);
+#endif
+		release_mem_object_safe(use_queues_flag);
+		release_mem_object_safe(Queue_data);
+		release_mem_object_safe(Queue_index);
+		release_mem_object_safe(work_array);
+#ifdef __WORK_STEALING__
+		release_mem_object_safe(work_pool_wgs);
+#endif
+		release_mem_object_safe(per_sample_output_buffers);
+
+		/* Release programs, after the kernels created from them. */
+		release_program_safe(data_init_program);
+		release_program_safe(scene_intersect_program);
+		release_program_safe(lamp_emission_program);
+		release_program_safe(queue_enqueue_program);
+		release_program_safe(background_buffer_update_program);
+		release_program_safe(shader_eval_program);
+		release_program_safe(holdout_emission_blurring_pathtermination_ao_program);
+		release_program_safe(direct_lighting_program);
+		release_program_safe(shadow_blocked_program);
+		release_program_safe(next_iteration_setup_program);
+		release_program_safe(sum_all_radiance_program);
+
+		if(hostRayStateArray != NULL) {  /* Host version of ray_state. */
+			free(hostRayStateArray);
+		}
+	}
+
+	void path_trace(SplitRenderTile& rtile, int2 max_render_feasible_tile_size)
+	{
+		/* Render a single split tile: set up the arguments of all split kernels,
+		 * iterate the path kernels until no ray is active, then sum up radiance. */
+		/* Cast arguments to cl types. */
+		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
+		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
+		cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
+		cl_int d_x = rtile.x;
+		cl_int d_y = rtile.y;
+		cl_int d_w = rtile.w;
+		cl_int d_h = rtile.h;
+		cl_int d_offset = rtile.offset;
+		cl_int d_stride = rtile.stride;
+
+		/* Make sure that set render feasible tile size is a multiple of local
+		 * work size dimensions.
+		 */
+		assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
+		assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);
+
+		size_t global_size[2];
+		size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
+		                        SPLIT_KERNEL_LOCAL_SIZE_Y};
+
+		/* Set the range of samples to be processed for every ray in
+		 * path-regeneration logic.
+		 */
+		cl_int start_sample = rtile.start_sample;
+		cl_int end_sample = rtile.start_sample + rtile.num_samples;
+		cl_int num_samples = rtile.num_samples;
+
+#ifdef __WORK_STEALING__
+		global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
+		global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
+		unsigned int num_parallel_samples = 1;
+#else
+		global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
+		unsigned int num_threads = max_render_feasible_tile_size.x *
+		                           max_render_feasible_tile_size.y;
+		unsigned int num_tile_columns_possible = num_threads / global_size[1];
+		/* Estimate the number of samples that can be processed
+		 * in parallel.
+		 */
+		unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
+		                                        rtile.num_samples);
+		/* Wavefront size in AMD is 64.
+		 * TODO(sergey): What about other platforms?
+		 */
+		if(num_parallel_samples >= 64) {
+			/* TODO(sergey): Could use generic round-up here. */
+			num_parallel_samples = (num_parallel_samples / 64) * 64;
+		}
+		assert(num_parallel_samples != 0);
+
+		global_size[0] = d_w * num_parallel_samples;
+#endif  /* __WORK_STEALING__ */
+
+		assert(global_size[0] * global_size[1] <=
+		       max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
+
+		/* Allocate all required global memory once. */
+		if(first_tile) {
+			size_t num_global_elements = max_render_feasible_tile_size.x *
+			                             max_render_feasible_tile_size.y;
+			/* TODO(sergey): This will actually over-allocate if
+			 * particular kernel does not support multiclosure.
+			 */
+			size_t ShaderClosure_size = get_shader_closure_size(current_max_closure);
+
+#ifdef __WORK_STEALING__
+			/* Calculate max groups */
+			size_t max_global_size[2];
+			size_t tile_x = max_render_feasible_tile_size.x;
+			size_t tile_y = max_render_feasible_tile_size.y;
+			max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
+			max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
+			max_work_groups = (max_global_size[0] * max_global_size[1]) /
+			                  (local_size[0] * local_size[1]);
+			/* Allocate work_pool_wgs memory. */
+			work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
+#endif  /* __WORK_STEALING__ */
+
+			/* Allocate queue_index memory only once. */
+			Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
+			use_queues_flag = mem_alloc(sizeof(char));
+			kgbuffer = mem_alloc(get_KernelGlobals_size());
+
+			/* Create global buffers for ShaderData. */
+			sd = mem_alloc(get_shaderdata_soa_size());
+			sd_DL_shadow = mem_alloc(get_shaderdata_soa_size());
+			P_sd = mem_alloc(num_global_elements * sizeof(float3));
+			P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+			N_sd = mem_alloc(num_global_elements * sizeof(float3));
+			N_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+			Ng_sd = mem_alloc(num_global_elements * sizeof(float3));
+			Ng_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+			I_sd = mem_alloc(num_global_elements * sizeof(float3));
+			I_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+			shader_sd = mem_alloc(num_global_elements * sizeof(int));
+			shader_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+			flag_sd = mem_alloc(num_global_elements * sizeof(int));
+			flag_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+			prim_sd = mem_alloc(num_global_elements * sizeof(int));
+			prim_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+			type_sd = mem_alloc(num_global_elements * sizeof(int));
+			type_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+			u_sd = mem_alloc(num_global_elements * sizeof(float));
+			u_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+			v_sd = mem_alloc(num_global_elements * sizeof(float));
+			v_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+			object_sd = mem_alloc(num_global_elements * sizeof(int));
+			object_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+			time_sd = mem_alloc(num_global_elements * sizeof(float));
+			time_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+			ray_length_sd = mem_alloc(num_global_elements * sizeof(float));
+			ray_length_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+			ray_depth_sd = mem_alloc(num_global_elements * sizeof(int));
+			ray_depth_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+			transparent_depth_sd = mem_alloc(num_global_elements * sizeof(int));
+			transparent_depth_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+
+			/* Ray differentials. */
+			dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
+			dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
+			dI_sd = mem_alloc(num_global_elements * sizeof(differential3));
+			dI_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
+			du_sd = mem_alloc(num_global_elements * sizeof(differential));
+			du_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
+			dv_sd = mem_alloc(num_global_elements * sizeof(differential));
+			dv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
+
+			/* Dp/Du */
+			dPdu_sd = mem_alloc(num_global_elements * sizeof(float3));
+			dPdu_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+			dPdv_sd = mem_alloc(num_global_elements * sizeof(float3));
+			dPdv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+
+			/* Object motion. */
+			ob_tfm_sd = mem_alloc(num_global_elements * sizeof(Transform));
+			ob_tfm_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(Transform));
+			ob_itfm_sd = mem_alloc(num_global_elements * sizeof(Transform));
+			ob_itfm_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(Transform));
+
+			closure_sd = mem_alloc(num_global_elements * ShaderClosure_size);
+			closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * ShaderClosure_size);
+			num_closure_sd = mem_alloc(num_global_elements * sizeof(int));
+			num_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+			randb_closure_sd = mem_alloc(num_global_elements * sizeof(float));
+			randb_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+			ray_P_sd = mem_alloc(num_global_elements * sizeof(float3));
+			ray_P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+			ray_dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
+			ray_dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
+
+			/* Creation of global memory buffers which are shared among
+			 * the kernels.
+			 */
+			rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
+			throughput_coop = mem_alloc(num_global_elements * sizeof(float3));
+			L_transparent_coop = mem_alloc(num_global_elements * sizeof(float));
+			PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
+			Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
+			PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
+			Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
+			AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3));
+			AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3));
+			AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
+			BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
+			ISLamp_coop = mem_alloc(num_global_elements * sizeof(int));
+			LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
+			Intersection_coop_AO = mem_alloc(num_global_elements * sizeof(Intersection));
+			Intersection_coop_DL = mem_alloc(num_global_elements * sizeof(Intersection));
+
+#ifdef WITH_CYCLES_DEBUG
+			debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
+#endif
+
+			ray_state = mem_alloc(num_global_elements * sizeof(char));
+
+			hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
+			assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
+
+			Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int)));
+			work_array = mem_alloc(num_global_elements * sizeof(unsigned int));
+			per_sample_output_buffers = mem_alloc(num_global_elements *
+			                                      per_thread_output_buffer_size);
+		}
+
+		cl_int dQueue_size = global_size[0] * global_size[1];
+		cl_int total_num_rays = global_size[0] * global_size[1];
+
+		cl_uint start_arg_index =
+			kernel_set_args(ckPathTraceKernel_data_init,
+			                0,
+			                kgbuffer,
+			                sd,
+			                sd_DL_shadow,
+			                P_sd,
+			                P_sd_DL_shadow,
+			                N_sd,
+			                N_sd_DL_shadow,
+			                Ng_sd,
+			                Ng_sd_DL_shadow,
+			                I_sd,
+			                I_sd_DL_shadow,
+			                shader_sd,
+			                shader_sd_DL_shadow,
+			                flag_sd,
+			                flag_sd_DL_shadow,
+			                prim_sd,
+			                prim_sd_DL_shadow,
+			                type_sd,
+			                type_sd_DL_shadow,
+			                u_sd,
+			                u_sd_DL_shadow,
+			                v_sd,
+			                v_sd_DL_shadow,
+			                object_sd,
+			                object_sd_DL_shadow,
+			                time_sd,
+			                time_sd_DL_shadow,
+			                ray_length_sd,
+			                ray_length_sd_DL_shadow,
+			                ray_depth_sd,
+			                ray_depth_sd_DL_shadow,
+			                transparent_depth_sd,
+			                transparent_depth_sd_DL_shadow);
+
+		/* Ray differentials. */
+		start_arg_index +=
+			kernel_set_args(ckPathTraceKernel_data_init,
+			                start_arg_index,
+			                dP_sd,
+			                dP_sd_DL_shadow,
+			                dI_sd,
+			                dI_sd_DL_shadow,
+			                du_sd,
+			                du_sd_DL_shadow,
+			                dv_sd,
+			                dv_sd_DL_shadow);
+
+		/* Dp/Du */
+		start_arg_index +=
+			kernel_set_args(ckPathTraceKernel_data_init,
+			                start_arg_index,
+			                dPdu_sd,
+			                dPdu_sd_DL_shadow,
+			                dPdv_sd,
+			                dPdv_sd_DL_shadow);
+
+		/* Object motion. */
+		start_arg_index +=
+			kernel_set_args(ckPathTraceKernel_data_init,
+			                start_arg_index,
+			                ob_tfm_sd,
+			                ob_tfm_sd_DL_shadow,
+			                ob_itfm_sd,
+			                ob_itfm_sd_DL_shadow);
+
+		start_arg_index +=
+			kernel_set_args(ckPathTraceKernel_data_init,
+			                start_arg_index,
+			                closure_sd,
+			                closure_sd_DL_shadow,
+			                num_closure_sd,
+			                num_closure_sd_DL_shadow,
+			                randb_closure_sd,
+			                randb_closure_sd_DL_shadow,
+			                ray_P_sd,
+			                ray_P_sd_DL_shadow,
+			                ray_dP_sd,
+			                ray_dP_sd_DL_shadow,
+			                d_data,
+			                per_sample_output_buffers,
+			                d_rng_state,
+			                rng_coop,
+			                throughput_coop,
+			                L_transparent_coop,
+			                PathRadiance_coop,
+			                Ray_coop,
+			                PathState_coop,
+			                ray_state);
+
+/* TODO(sergey): Avoid map lookup here. */
+#define KERNEL_TEX(type, ttype, name) \
+	set_kernel_arg_mem(ckPathTraceKernel_data_init, &start_arg_index, #name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+
+		start_arg_index +=
+			kernel_set_args(ckPathTraceKernel_data_init,
+			                start_arg_index,
+			                start_sample,
+			                d_x,
+			                d_y,
+			                d_w,
+			                d_h,
+			                d_offset,
+			                d_stride,
+			                rtile.rng_state_offset_x,
+			                rtile.rng_state_offset_y,
+			                rtile.buffer_rng_state_stride,
+			                Queue_data,
+			                Queue_index,
+			                dQueue_size,
+			                use_queues_flag,
+			                work_array,
+#ifdef __WORK_STEALING__
+			                work_pool_wgs,
+			                num_samples,
+#endif
+#ifdef WITH_CYCLES_DEBUG
+			                debugdata_coop,
+#endif
+			                num_parallel_samples);
+
+		kernel_set_args(ckPathTraceKernel_scene_intersect,
+		                0,
+		                kgbuffer,
+		                d_data,
+		                rng_coop,
+		                Ray_coop,
+		                PathState_coop,
+		                Intersection_coop,
+		                ray_state,
+		                d_w,
+		                d_h,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size,
+		                use_queues_flag,
+#ifdef WITH_CYCLES_DEBUG
+		                debugdata_coop,
+#endif
+		                num_parallel_samples);
+
+		kernel_set_args(ckPathTraceKernel_lamp_emission,
+		                0,
+		                kgbuffer,
+		                d_data,
+		                sd,
+		                throughput_coop,
+		                PathRadiance_coop,
+		                Ray_coop,
+		                PathState_coop,
+		                Intersection_coop,
+		                ray_state,
+		                d_w,
+		                d_h,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size,
+		                use_queues_flag,
+		                num_parallel_samples);
+
+		kernel_set_args(ckPathTraceKernel_queue_enqueue,
+		                0,
+		                Queue_data,
+		                Queue_index,
+		                ray_state,
+		                dQueue_size);
+
+		kernel_set_args(ckPathTraceKernel_background_buffer_update,
+		                 0,
+		                 kgbuffer,
+		                 d_data,
+		                 sd,
+		                 per_sample_output_buffers,
+		                 d_rng_state,
+		                 rng_coop,
+		                 throughput_coop,
+		                 PathRadiance_coop,
+		                 Ray_coop,
+		                 PathState_coop,
+		                 L_transparent_coop,
+		                 ray_state,
+		                 d_w,
+		                 d_h,
+		                 d_x,
+		                 d_y,
+		                 d_stride,
+		                 rtile.rng_state_offset_x,
+		                 rtile.rng_state_offset_y,
+		                 rtile.buffer_rng_state_stride,
+		                 work_array,
+		                 Queue_data,
+		                 Queue_index,
+		                 dQueue_size,
+		                 end_sample,
+		                 start_sample,
+#ifdef __WORK_STEALING__
+		                 work_pool_wgs,
+		                 num_samples,
+#endif
+#ifdef WITH_CYCLES_DEBUG
+		                 debugdata_coop,
+#endif
+		                 num_parallel_samples);
+
+		kernel_set_args(ckPathTraceKernel_shader_eval,
+		                0,
+		                kgbuffer,
+		                d_data,
+		                sd,
+		                rng_coop,
+		                Ray_coop,
+		                PathState_coop,
+		                Intersection_coop,
+		                ray_state,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size);
+
+		kernel_set_args(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao,
+		                0,
+		                kgbuffer,
+		                d_data,
+		                sd,
+		                per_sample_output_buffers,
+		                rng_coop,
+		                throughput_coop,
+		                L_transparent_coop,
+		                PathRadiance_coop,
+		                PathState_coop,
+		                Intersection_coop,
+		                AOAlpha_coop,
+		                AOBSDF_coop,
+		                AOLightRay_coop,
+		                d_w,
+		                d_h,
+		                d_x,
+		                d_y,
+		                d_stride,
+		                ray_state,
+		                work_array,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size,
+#ifdef __WORK_STEALING__
+		                start_sample,
+#endif
+		                num_parallel_samples);
+
+		kernel_set_args(ckPathTraceKernel_direct_lighting,
+		                0,
+		                kgbuffer,
+		                d_data,
+		                sd,
+		                sd_DL_shadow,
+		                rng_coop,
+		                PathState_coop,
+		                ISLamp_coop,
+		                LightRay_coop,
+		                BSDFEval_coop,
+		                ray_state,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size);
+
+		kernel_set_args(ckPathTraceKernel_shadow_blocked,
+		                0,
+		                kgbuffer,
+		                d_data,
+		                sd_DL_shadow,
+		                PathState_coop,
+		                LightRay_coop,
+		                AOLightRay_coop,
+		                Intersection_coop_AO,
+		                Intersection_coop_DL,
+		                ray_state,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size,
+		                total_num_rays);
+
+		kernel_set_args(ckPathTraceKernel_next_iteration_setup,
+		                0,
+		                kgbuffer,
+		                d_data,
+		                sd,
+		                rng_coop,
+		                throughput_coop,
+		                PathRadiance_coop,
+		                Ray_coop,
+		                PathState_coop,
+		                LightRay_coop,
+		                ISLamp_coop,
+		                BSDFEval_coop,
+		                AOLightRay_coop,
+		                AOBSDF_coop,
+		                AOAlpha_coop,
+		                ray_state,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size,
+		                use_queues_flag);
+
+		kernel_set_args(ckPathTraceKernel_sum_all_radiance,
+		                0,
+		                d_data,
+		                d_buffer,
+		                per_sample_output_buffers,
+		                num_parallel_samples,
+		                d_w,
+		                d_h,
+		                d_stride,
+		                rtile.buffer_offset_x,
+		                rtile.buffer_offset_y,
+		                rtile.buffer_rng_state_stride,
+		                start_sample);
+
+		/* Macro for Enqueuing split kernels. */
+#define GLUE(a, b) a ## b
+#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \
+		{ \
+			ciErr = clEnqueueNDRangeKernel(cqCommandQueue, \
+			                               GLUE(ckPathTraceKernel_, \
+			                                    kernelName), \
+			                               2, \
+			                               NULL, \
+			                               globalSize, \
+			                               localSize, \
+			                               0, \
+			                               NULL, \
+			                               NULL); \
+			opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); \
+			if(ciErr != CL_SUCCESS) { \
+				string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", \
+				                               clewErrorString(ciErr)); \
+				opencl_error(message); \
+				return; \
+			} \
+		} (void) 0
+
+		/* Enqueue ckPathTraceKernel_data_init kernel. */
+		ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size);
+		bool activeRaysAvailable = true;
+
+		/* Record the number of times host intervention has been made. */
+		unsigned int numHostIntervention = 0;
+		unsigned int numNextPathIterTimes = PathIteration_times;
+		while(activeRaysAvailable) {
+			/* Twice the global work size of other kernels for
+			 * ckPathTraceKernel_shadow_blocked. */
+			size_t global_size_shadow_blocked[2] = {global_size[0] * 2, global_size[1]};
+
+			/* Do path-iteration in host [Enqueue Path-iteration kernels]. */
+			for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) {
+				ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size);
+				ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
+			}
+
+			/* Read ray-state into Host memory to decide if we should exit
+			 * path-iteration in host.
+			 */
+			ciErr = clEnqueueReadBuffer(cqCommandQueue, ray_state, CL_TRUE, 0,
+			                            global_size[0] * global_size[1] * sizeof(char),
+			                            hostRayStateArray, 0, NULL, NULL);
+			opencl_assert_err(ciErr, "clEnqueueReadBuffer");
+			if(ciErr != CL_SUCCESS) {
+				string message = string_printf("OpenCL error: %s in clEnqueueReadBuffer()",
+				                               clewErrorString(ciErr));
+				opencl_error(message);
+				return;  /* Ray state is stale; looping on it might never end. */
+			}
+
+			activeRaysAvailable = false;
+
+			for(size_t rayStateIter = 0;
+			    rayStateIter < global_size[0] * global_size[1];
+			    ++rayStateIter)
+			{
+				if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) {
+					/* Not all rays are RAY_INACTIVE. */
+					activeRaysAvailable = true;
+					break;
+				}
+			}
+
+			if(activeRaysAvailable) {
+				numHostIntervention++;
+				PathIteration_times = PATH_ITER_INC_FACTOR;
+				/* Host intervention done before all rays become RAY_INACTIVE;
+				 * so do more initial iterations for the next tile.
+				 */
+				numNextPathIterTimes += PATH_ITER_INC_FACTOR;
+			}
+		}
+
+		/* Execute SumALLRadiance kernel to accumulate radiance calculated in
+		 * per_sample_output_buffers into RenderTile's output buffer.
+		 */
+		size_t sum_all_radiance_local_size[2] = {16, 16};
+		size_t sum_all_radiance_global_size[2];
+		sum_all_radiance_global_size[0] =
+			(((d_w - 1) / sum_all_radiance_local_size[0]) + 1) *
+			sum_all_radiance_local_size[0];
+		sum_all_radiance_global_size[1] =
+			(((d_h - 1) / sum_all_radiance_local_size[1]) + 1) *
+			sum_all_radiance_local_size[1];
+		ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
+		                     sum_all_radiance_global_size,
+		                     sum_all_radiance_local_size);
+
+#undef ENQUEUE_SPLIT_KERNEL
+#undef GLUE
+
+		if(numHostIntervention == 0) {
+			/* The kernels ran more often than required; use fewer iterations
+			 * for the next tile. Compare before subtracting: numNextPathIterTimes
+			 * is unsigned, so a difference can never test as negative. */
+			PathIteration_times = (numNextPathIterTimes <= PATH_ITER_INC_FACTOR) ?
+			PATH_ITER_INC_FACTOR : numNextPathIterTimes - PATH_ITER_INC_FACTOR;
+		}
+		else {
+			/* Number of path-iterations done for this tile is set as
+			 * Initial path-iteration times for the next tile
+			 */
+			PathIteration_times = numNextPathIterTimes;
+		}
+
+		first_tile = false;
+	}
+
+	/* Returns the number of bytes the split kernel always allocates,
+	 * whatever tile size the user has set and whatever the scene
+	 * properties are: KernelGlobals, the queue index array, the
+	 * use_queues flag and the two SOA ShaderData blocks (sd and
+	 * sd_DL_shadow).
+	 */
+	size_t get_invariable_mem_allocated()
+	{
+		const size_t kernel_globals_bytes = get_KernelGlobals_size();
+		const size_t shaderdata_soa_bytes = get_shaderdata_soa_size();
+
+		/* Fixed-size bookkeeping buffers. */
+		const size_t queue_index_bytes = NUM_QUEUES * sizeof(unsigned int);
+		const size_t use_queues_flag_bytes = sizeof(char);
+
+		/* The SOA ShaderData block is counted twice: once for sd and
+		 * once for sd_DL_shadow.
+		 */
+		return kernel_globals_bytes
+			+ queue_index_bytes
+			+ use_queues_flag_bytes
+			+ shaderdata_soa_bytes + shaderdata_soa_bytes;
+	}
+
+	/* Calculate the memory that depends on the user set tile size: the
+	 * work pools (with work stealing), output buffers and RNG states.
+	 */
+	size_t get_tile_specific_mem_allocated(const int2 tile_size)
+	{
+		size_t tile_specific_mem_allocated = 0;
+
+		/* Tile dimensions as size_t, so the products below don't wrap at 32 bits. */
+		size_t user_set_tile_w = tile_size.x;
+		size_t user_set_tile_h = tile_size.y;
+
+#ifdef __WORK_STEALING__
+		/* Calculate memory to be allocated for work_pools in
+		 * case of work_stealing.
+		 */
+		size_t max_global_size[2];
+		size_t max_num_work_pools = 0;
+		max_global_size[0] =
+			(((user_set_tile_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_X;
+		max_global_size[1] =
+			(((user_set_tile_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_Y;
+		max_num_work_pools =
+			(max_global_size[0] * max_global_size[1]) /
+			(SPLIT_KERNEL_LOCAL_SIZE_X * SPLIT_KERNEL_LOCAL_SIZE_Y);
+		tile_specific_mem_allocated += max_num_work_pools * sizeof(unsigned int);
+#endif
+
+		tile_specific_mem_allocated +=
+			user_set_tile_w * user_set_tile_h * per_thread_output_buffer_size;
+		tile_specific_mem_allocated +=
+			user_set_tile_w * user_set_tile_h * sizeof(RNG);
+
+		return tile_specific_mem_allocated;
+	}
+
+	/* Calculates the memory taken by the textures plus the size of the
+	 * KernelData (d_data) buffer, as reported by OpenCL.
+	 */
+	size_t get_scene_specific_mem_allocated(cl_mem d_data)
+	{
+		size_t scene_specific_mem_allocated = 0;
+		/* Calculate texture memories. */
+#define KERNEL_TEX(type, ttype, name) \
+	scene_specific_mem_allocated += get_tex_size(#name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+		size_t d_data_size = 0;  /* Stays 0 if the query below fails. */
+		ciErr = clGetMemObjectInfo(d_data,
+		                           CL_MEM_SIZE,
+		                           sizeof(d_data_size),
+		                           &d_data_size,
+		                           NULL);
+		assert(ciErr == CL_SUCCESS && "Can't get d_data mem object info");
+		scene_specific_mem_allocated += d_data_size;
+		return scene_specific_mem_allocated;
+	}
+
+	/* Memory required for one split kernel thread; mirrors path_trace()'s per-element buffers. */
+	size_t get_per_thread_memory()
+	{
+		size_t shader_closure_size = get_shader_closure_size(current_max_closure);
+		/* TODO(sergey): This will actually over-allocate if
+		 * particular kernel does not support multiclosure.
+		 */
+		size_t shaderdata_volume = get_shader_data_size(shader_closure_size);
+		return sizeof(RNG)
+			+ sizeof(float3)          /* Throughput size */
+			+ sizeof(float)           /* L transparent size */
+			+ sizeof(char)            /* Ray state size */
+			+ sizeof(unsigned int)    /* Work element size */
+			+ sizeof(int)             /* ISLamp_size */
+			+ sizeof(PathRadiance) + sizeof(Ray) + sizeof(PathState)
+			+ sizeof(Intersection)    /* Overall isect */
+			+ sizeof(Intersection)    /* Intersection_coop_AO */
+			+ sizeof(Intersection)    /* Intersection coop DL */
+			+ shaderdata_volume       /* Overall ShaderData */
+			+ (shaderdata_volume * 2) /* ShaderData : DL and shadow */
+			+ sizeof(Ray) + sizeof(BsdfEval)
+			+ sizeof(float3)          /* AOAlpha size */
+			+ sizeof(float3)          /* AOBSDF size */
+			+ sizeof(Ray)
+#ifdef WITH_CYCLES_DEBUG
+			+ sizeof(DebugData)       /* debugdata_coop */
+#endif
+			+ (sizeof(int) * NUM_QUEUES + sizeof(int)) /* Queue_data, incl. its extra int */
+			+ per_thread_output_buffer_size;
+	}
+
+	/* Estimates how many threads fit into the allocatable device memory:
+	 * whatever remains after the fixed allocations is divided by the
+	 * memory a single thread requires.
+	 */
+	size_t get_feasible_global_work_size(int2 tile_size, cl_mem d_data)
+	{
+		/* Memory that is taken no matter how many threads are launched. */
+		const size_t invariable_bytes = get_invariable_mem_allocated();
+		const size_t tile_bytes = get_tile_specific_mem_allocated(tile_size);
+		const size_t scene_bytes = get_scene_specific_mem_allocated(d_data);
+		/* What is left is shared by the threads of the global work size.
+		 * NOTE(review): this looks like unsigned arithmetic, so it would wrap
+		 * around if the fixed allocations exceed the total -- confirm.
+		 */
+		size_t memory_for_threads = total_allocatable_memory
+			- invariable_bytes
+			- tile_bytes
+			- scene_bytes
+			- DATA_ALLOCATION_MEM_FACTOR;
+		const size_t bytes_per_thread = get_per_thread_memory();
+		return memory_for_threads / bytes_per_thread;
+	}
+
+	/* Returns true when a d_w x d_h tile, rounded up to multiples of the
+	 * split kernel local size, needs more threads than the area of
+	 * max_render_feasible_tile_size; such a tile should be split up.
+	 */
+	bool need_to_split_tile(unsigned int d_w,
+	                        unsigned int d_h,
+	                        int2 max_render_feasible_tile_size)
+	{
+		size_t global_size_estimate[2];
+		/* TODO(sergey): Such round-ups are in quite a few places, need to
+		 * replace them with a utility macro.
+		 */
+		global_size_estimate[0] =
+			(((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_X;
+		global_size_estimate[1] =
+			(((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_Y;
+		if((global_size_estimate[0] * global_size_estimate[1]) >
+		   (max_render_feasible_tile_size.x * max_render_feasible_tile_size.y))
 		{
-			run = function_bind(&OpenCLDevice::thread_run, device, this);
+			return true;  /* Rounded-up tile exceeds the feasible thread count. */
 		}
-	};
+		else {
+			return false;
+		}
+	}
 
-	int get_split_task_count(DeviceTask& task)
+	/* Turns the feasible global work size into an (approximately maximal)
+	 * square tile whose sides are multiples of the split kernel local
+	 * size, i.e. a tile the split kernel should be able to render.
+	 */
+	int2 get_max_render_feasible_tile_size(size_t feasible_global_work_size)
 	{
-		return 1;
+		/* Integer side length of a square tile covering roughly
+		 * feasible_global_work_size threads.
+		 */
+		const int side = (int)sqrt(feasible_global_work_size);
+
+		/* Prefer rounding the sides up to multiples of the local size, but
+		 * only if the rounded-up tile still fits the feasible work size.
+		 */
+		int2 rounded_up;
+		rounded_up.x = (((side - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+		               SPLIT_KERNEL_LOCAL_SIZE_X;
+		rounded_up.y = (((side - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+		               SPLIT_KERNEL_LOCAL_SIZE_Y;
+		const bool rounded_up_fits =
+			(rounded_up.x * rounded_up.y <= feasible_global_work_size);
+		if(rounded_up_fits) {
+			return rounded_up;
+		}
+
+		/* Otherwise round the sides down to multiples of the local size. */
+		int2 rounded_down;
+		rounded_down.x = (side / SPLIT_KERNEL_LOCAL_SIZE_X) *
+		                 SPLIT_KERNEL_LOCAL_SIZE_X;
+		rounded_down.y = (side / SPLIT_KERNEL_LOCAL_SIZE_Y) *
+		                 SPLIT_KERNEL_LOCAL_SIZE_Y;
+		return rounded_down;
 	}
 
-	void task_add(DeviceTask& task)
+	/* Shrink the tile, halving one side at a time, until it needs no more
+	 * threads than max_render_feasible_tile_size allows, or until neither
+	 * side can shrink below the local size any more. */
+	int2 get_split_tile_size(RenderTile rtile,
+	                         int2 max_render_feasible_tile_size)
 	{
-		task_pool.push(new OpenCLDeviceTask(this, task));
+		int2 split_tile_size;
+		int num_global_threads = max_render_feasible_tile_size.x *
+		                         max_render_feasible_tile_size.y;
+		int d_w = rtile.w, d_h = rtile.h;
+		/* Ceil round off d_w and d_h */
+		d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_X;
+		d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_Y;
+		while(d_w * d_h > num_global_threads) {
+			const bool w_at_min = (d_w <= SPLIT_KERNEL_LOCAL_SIZE_X);
+			const bool h_at_min = (d_h <= SPLIT_KERNEL_LOCAL_SIZE_Y);
+			if(w_at_min && h_at_min) {
+				break;  /* Can't shrink any further; don't loop forever. */
+			}
+			/* Halve the longer dimension, or the only one that can still shrink. */
+			if(!w_at_min && (d_w >= d_h || h_at_min)) {
+				d_w = ((((d_w / 2) - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * SPLIT_KERNEL_LOCAL_SIZE_X;
+			}
+			else {
+				d_h = ((((d_h / 2) - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * SPLIT_KERNEL_LOCAL_SIZE_Y;
+			}
+		}
+		split_tile_size.x = d_w;
+		split_tile_size.y = d_h;
+		return split_tile_size;
 	}
 
-	void task_wait()
+	/* Splits existing tile into multiple tiles of tile size split_tile_size. */
+	vector<SplitRenderTile> split_tiles(RenderTile rtile, int2 split_tile_size)
 	{
-		task_pool.wait();
+		vector<SplitRenderTile> to_path_trace_rtile;
+		int d_w = rtile.w;
+		int d_h = rtile.h;
+		int num_tiles_x = (((d_w - 1) / split_tile_size.x) + 1);
+		int num_tiles_y = (((d_h - 1) / split_tile_size.y) + 1);
+		/* Buffer and rng_state offset calc. */
+		size_t offset_index = rtile.offset + (rtile.x + rtile.y * rtile.stride);
+		size_t offset_x = offset_index % rtile.stride;
+		size_t offset_y = offset_index / rtile.stride;
+		/* Resize to_path_trace_rtile. */
+		to_path_trace_rtile.resize(num_tiles_x * num_tiles_y);
+		for(int tile_iter_y = 0; tile_iter_y < num_tiles_y; tile_iter_y++) {
+			for(int tile_iter_x = 0; tile_iter_x < num_tiles_x; tile_iter_x++) {
+				int rtile_index = tile_iter_y * num_tiles_x + tile_iter_x;
+				to_path_trace_rtile[rtile_index].rng_state_offset_x = offset_x + tile_iter_x * split_tile_size.x;
+				to_path_trace_rtile[rtile_index].rng_state_offset_y = offset_y + tile_iter_y * split_tile_size.y;
+				to_path_trace_rtile[rtile_index].buffer_offset_x = offset_x + tile_iter_x * split_tile_size.x;
+				to_path_trace_rtile[rtile_index].buffer_offset_y = offset_y + tile_iter_y * split_tile_size.y;
+				to_path_trace_rtile[rtile_index].start_sample = rtile.start_sample;
+				to_path_trace_rtile[rtile_index].num_samples = rtile.num_samples;
+				to_path_trace_rtile[rtile_index].sample = rtile.sample;
+				to_path_trace_rtile[rtile_index].resolution = rtile.resolution;
+				to_path_trace_rtile[rtile_index].offset = rtile.offset;
+				to_path_trace_rtile[rtile_index].buffers = rtile.buffers;
+				to_path_trace_rtile[rtile_index].buffer = rtile.buffer;
+				to_path_trace_rtile[rtile_index].rng_state = rtile.rng_state;
+				to_path_trace_rtile[rtile_index].x = rtile.x + (tile_iter_x * split_tile_size.x);
+				to_path_trace_rtile[rtile_index].y = rtile.y + (tile_iter_y * split_tile_size.y);
+				to_path_trace_rtile[rtile_index].buffer_rng_state_stride = rtile.stride;
+				/* Fill width and height of the new render tile. */
+				to_path_trace_rtile[rtile_index].w = (tile_iter_x == (num_tiles_x - 1)) ?
+					(d_w - (tile_iter_x * split_tile_size.x)) /* Border tile */
+					: split_tile_size.x;
+				to_path_trace_rtile[rtile_index].h = (tile_iter_y == (num_tiles_y - 1)) ?
+					(d_h - (tile_iter_y * split_tile_size.y)) /* Border tile */
+					: split_tile_size.y;
+				to_path_trace_rtile[rtile_index].stride = to_path_trace_rtile[rtile_index].w;
+			}
+		}
+		return to_path_trace_rtile;
 	}
 
-	void task_cancel()
+	void thread_run(DeviceTask *task)
 	{
-		task_pool.cancel();
+		if(task->type == DeviceTask::FILM_CONVERT) {
+			film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
+		}
+		else if(task->type == DeviceTask::SHADER) {
+			shader(*task);
+		}
+		else if(task->type == DeviceTask::PATH_TRACE) {
+			RenderTile tile;
+			bool initialize_data_and_check_render_feasibility = false;
+			bool need_to_split_tiles_further = false;
+			int2 max_render_feasible_tile_size;
+			size_t feasible_global_work_size;
+			const int2 tile_size = task->requested_tile_size;
+			/* Keep rendering tiles until done. */
+			while(task->acquire_tile(this, tile)) {
+				if(!initialize_data_and_check_render_feasibility) {
+					/* Initialize data. */
+					/* Calculate per_thread_output_buffer_size. */
+					size_t output_buffer_size = 0;
+					ciErr = clGetMemObjectInfo((cl_mem)tile.buffer,
+					                           CL_MEM_SIZE,
+					                           sizeof(output_buffer_size),
+					                           &output_buffer_size,
+					                           NULL);
+					assert(ciErr == CL_SUCCESS && "Can't get tile.buffer mem object info");
+					/* This value is different when running on AMD and NV. */
+					if(background) {
+						/* In offline render the number of buffer elements
+						 * associated with tile.buffer is the current tile size.
+						 */
+						per_thread_output_buffer_size =
+							output_buffer_size / (tile.w * tile.h);
+					}
+					else {
+						/* In interactive rendering, unlike offline render, the number of buffer elements
+						 * associated with tile.buffer is the entire viewport size.
+						 */
+						per_thread_output_buffer_size =
+							output_buffer_size / (tile.buffers->params.width *
+							                      tile.buffers->params.height);
+					}
+					/* Check render feasibility. */
+					feasible_global_work_size = get_feasible_global_work_size(
+						tile_size,
+						CL_MEM_PTR(const_mem_map["__data"]->device_pointer));
+					max_render_feasible_tile_size =
+						get_max_render_feasible_tile_size(
+							feasible_global_work_size);
+					need_to_split_tiles_further =
+						need_to_split_tile(tile_size.x,
+						                   tile_size.y,
+						                   max_render_feasible_tile_size);
+					initialize_data_and_check_render_feasibility = true;
+				}
+				if(need_to_split_tiles_further) {
+					int2 split_tile_size =
+						get_split_tile_size(tile,
+						                    max_render_feasible_tile_size);
+					vector<SplitRenderTile> to_path_trace_render_tiles =
+						split_tiles(tile, split_tile_size);
+					/* Print message to console */
+					if(background && (to_path_trace_render_tiles.size() > 1)) {
+						fprintf(stderr, "Message : Tiles need to be split "
+						        "further inside path trace (due to insufficient "
+						        "device-global-memory for split kernel to "
+						        "function) \n"
+						        "The current tile of dimensions %dx%d is split "
+						        "into tiles of dimension %dx%d for render \n",
+						        tile.w, tile.h,
+						        split_tile_size.x,
+						        split_tile_size.y);
+					}
+					/* Process all split tiles. */
+					for(int tile_iter = 0;
+					    tile_iter < to_path_trace_render_tiles.size();
+					    ++tile_iter)
+					{
+						path_trace(to_path_trace_render_tiles[tile_iter],
+						           max_render_feasible_tile_size);
+					}
+				}
+				else {
+					/* No splitting required; process the entire tile at once. */
+					/* Render feasible tile size is user-set-tile-size itself. */
+					max_render_feasible_tile_size.x =
+						(((tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+						SPLIT_KERNEL_LOCAL_SIZE_X;
+					max_render_feasible_tile_size.y =
+						(((tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+						SPLIT_KERNEL_LOCAL_SIZE_Y;
+					/* buffer_rng_state_stride is stride itself. */
+					SplitRenderTile split_tile(tile);
+					split_tile.buffer_rng_state_stride = tile.stride;
+					path_trace(split_tile, max_render_feasible_tile_size);
+				}
+				tile.sample = tile.start_sample + tile.num_samples;
+
+				/* Complete kernel execution before release tile. */
+				/* This helps in multi-device render;
+				 * The device that reaches the critical-section function
+				 * release_tile waits (stalling other devices from entering
+				 * release_tile) for all kernels to complete. If device1 (a
+				 * slow-render device) reaches release_tile first then it would
+				 * stall device2 (a fast-render device) from proceeding to render
+				 * next tile.
+				 */
+				clFinish(cqCommandQueue);
+
+				task->release_tile(tile);
+			}
+		}
+	}
+
+protected:
+	cl_mem mem_alloc(size_t bufsize, cl_mem_flags mem_flag = CL_MEM_READ_WRITE)
+	{
+		cl_mem ptr;
+		assert(bufsize != 0);
+		ptr = clCreateBuffer(cxContext, mem_flag, bufsize, NULL, &ciErr);
+		opencl_assert_err(ciErr, "clCreateBuffer");
+		return ptr;
+	}
+
+	/* ** Those guys are for working around some compiler-specific bugs ** */
+
+	cl_program load_cached_kernel(
+	        const DeviceRequestedFeatures& /*requested_features*/,
+	        OpenCLCache::ProgramName /*program_name*/,
+	        thread_scoped_lock /*cache_locker*/)
+	{
+		VLOG(2) << "Skip loading kernel from cache, "
+		        << "not supported by split kernel.";
+		return NULL;
+	}
+
+	void store_cached_kernel(cl_platform_id /*platform*/,
+	                         cl_device_id /*device*/,
+	                         cl_program /*program*/,
+	                         OpenCLCache::ProgramName /*program_name*/,
+	                         thread_scoped_lock& /*slot_locker*/)
+	{
+		VLOG(2) << "Skip storing kernel in cache, "
+		        << "not supported by split kernel.";
+	}
+
+	string build_options_for_base_program(
+	        const DeviceRequestedFeatures& requested_features)
+	{
+		return build_options_from_requested_features(requested_features);
 	}
 };
 
 Device *device_opencl_create(DeviceInfo& info, Stats &stats, bool background)
 {
-	return new OpenCLDevice(info, stats, background);
+	vector<OpenCLPlatformDevice> usable_devices;
+	opencl_get_usable_devices(&usable_devices);
+	assert(info.num < usable_devices.size());
+	const OpenCLPlatformDevice& platform_device = usable_devices[info.num];
+	const string& platform_name = platform_device.platform_name;
+	const cl_device_type device_type = platform_device.device_type;
+	if(opencl_kernel_use_split(platform_name, device_type)) {
+		VLOG(1) << "Using split kernel.";
+		return new OpenCLDeviceSplitKernel(info, stats, background);
+	} else {
+		VLOG(1) << "Using mega kernel.";
+		return new OpenCLDeviceMegaKernel(info, stats, background);
+	}
 }
 
-bool device_opencl_init(void) {
+bool device_opencl_init(void)
+{
 	static bool initialized = false;
 	static bool result = false;
 
-	if (initialized)
+	if(initialized)
 		return result;
 
 	initialized = true;
 
-	// OpenCL disabled for now, only works with this environment variable set
-	if(!getenv("CYCLES_OPENCL_TEST")) {
-		result = false;
+	if(opencl_device_type() != 0) {
+		int clew_result = clewInit();
+		if(clew_result == CLEW_SUCCESS) {
+			VLOG(1) << "CLEW initialization succeeded.";
+			result = true;
+		}
+		else {
+			VLOG(1) << "CLEW initialization failed: "
+			        << ((clew_result == CLEW_ERROR_ATEXIT_FAILED)
+			            ? "Error setting up atexit() handler"
+			            : "Error opening the library");
+		}
 	}
 	else {
-		result = clewInit() == CLEW_SUCCESS;
+		VLOG(1) << "Skip initializing CLEW, platform is force disabled.";
+		result = false;
 	}
 
 	return result;
@@ -1136,62 +3634,108 @@ bool device_opencl_init(void) {
 
 void device_opencl_info(vector<DeviceInfo>& devices)
 {
-	vector<cl_device_id> device_ids;
-	cl_uint num_devices = 0;
-	vector<cl_platform_id> platform_ids;
+	vector<OpenCLPlatformDevice> usable_devices;
+	opencl_get_usable_devices(&usable_devices);
+	/* Devices are numbered consecutively across platforms. */
+	int num_devices = 0;
+	foreach(OpenCLPlatformDevice& platform_device, usable_devices) {
+		const string& platform_name = platform_device.platform_name;
+		const cl_device_type device_type = platform_device.device_type;
+		const string& device_name = platform_device.device_name;
+		DeviceInfo info;
+		info.type = DEVICE_OPENCL;
+		info.description = string_remove_trademark(string(device_name));
+		info.num = num_devices;
+		info.id = string_printf("OPENCL_%d", info.num);
+		/* We don't know if it's used for display, but assume it is. */
+		info.display_device = true;
+		info.advanced_shading = opencl_kernel_use_advanced_shading(platform_name);
+		info.pack_images = true;
+		info.use_split_kernel = opencl_kernel_use_split(platform_name,
+		                                                device_type);
+		devices.push_back(info);
+		num_devices++;
+	}
+}
+
+string device_opencl_capabilities(void)
+{
+	if(opencl_device_type() == 0) {
+		return "All OpenCL devices are forced to be OFF";
+	}
+	string result = "";
+	string error_msg = "";  /* Only used by opencl_assert(), but in the future
+	                         * it could also be nicely reported to the console.
+	                         */
 	cl_uint num_platforms = 0;
+	opencl_assert(clGetPlatformIDs(0, NULL, &num_platforms));
+	if(num_platforms == 0) {
+		return "No OpenCL platforms found\n";
+	}
+	result += string_printf("Number of platforms: %u\n", num_platforms);
 
-	/* get devices */
-	if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS || num_platforms == 0)
-		return;
-	
+	vector<cl_platform_id> platform_ids;
 	platform_ids.resize(num_platforms);
+	opencl_assert(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL));
+
+#define APPEND_STRING_INFO(func, id, name, what) \
+	do { \
+		char data[1024] = "\0"; \
+		opencl_assert(func(id, what, sizeof(data), &data, NULL)); \
+		result += string_printf("%s: %s\n", name, data); \
+	} while(false)
+#define APPEND_PLATFORM_STRING_INFO(id, name, what) \
+	APPEND_STRING_INFO(clGetPlatformInfo, id, "\tPlatform " name, what)
+#define APPEND_DEVICE_STRING_INFO(id, name, what) \
+	APPEND_STRING_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what)
 
-	if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS)
-		return;
-
-	/* devices are numbered consecutively across platforms */
-	int num_base = 0;
-
-	for (int platform = 0; platform < num_platforms; platform++, num_base += num_devices) {
-		num_devices = 0;
-		if(clGetDeviceIDs(platform_ids[platform], opencl_device_type(), 0, NULL, &num_devices) != CL_SUCCESS || num_devices == 0)
-			continue;
-
-		device_ids.resize(num_devices);
-
-		if(clGetDeviceIDs(platform_ids[platform], opencl_device_type(), num_devices, &device_ids[0], NULL) != CL_SUCCESS)
-			continue;
-
-		char pname[256];
-		clGetPlatformInfo(platform_ids[platform], CL_PLATFORM_NAME, sizeof(pname), &pname, NULL);
-		string platform_name = pname;
-
-		/* add devices */
-		for(int num = 0; num < num_devices; num++) {
-			cl_device_id device_id = device_ids[num];
-			char name[1024] = "\0";
+	vector<cl_device_id> device_ids;
+	for (cl_uint platform = 0; platform < num_platforms; ++platform) {
+		cl_platform_id platform_id = platform_ids[platform];
 
-			if(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(name), &name, NULL) != CL_SUCCESS)
-				continue;
+		result += string_printf("Platform #%u\n", platform);
 
-			DeviceInfo info;
+		APPEND_PLATFORM_STRING_INFO(platform_id, "Name", CL_PLATFORM_NAME);
+		APPEND_PLATFORM_STRING_INFO(platform_id, "Vendor", CL_PLATFORM_VENDOR);
+		APPEND_PLATFORM_STRING_INFO(platform_id, "Version", CL_PLATFORM_VERSION);
+		APPEND_PLATFORM_STRING_INFO(platform_id, "Profile", CL_PLATFORM_PROFILE);
+		APPEND_PLATFORM_STRING_INFO(platform_id, "Extensions", CL_PLATFORM_EXTENSIONS);
 
-			info.type = DEVICE_OPENCL;
-			info.description = string(name);
-			info.num = num_base + num;
-			info.id = string_printf("OPENCL_%d", info.num);
-			/* we don't know if it's used for display, but assume it is */
-			info.display_device = true;
-			info.advanced_shading = opencl_kernel_use_advanced_shading(platform_name);
-			info.pack_images = true;
+		cl_uint num_devices = 0;
+		opencl_assert(clGetDeviceIDs(platform_ids[platform],
+		                             CL_DEVICE_TYPE_ALL,
+		                             0,
+		                             NULL,
+		                             &num_devices));
+		result += string_printf("\tNumber of devices: %u\n", num_devices);
 
-			devices.push_back(info);
+		device_ids.resize(num_devices);
+		opencl_assert(clGetDeviceIDs(platform_ids[platform],
+		                             CL_DEVICE_TYPE_ALL,
+		                             num_devices,
+		                             &device_ids[0],
+		                             NULL));
+		for (cl_uint device = 0; device < num_devices; ++device) {
+			cl_device_id device_id = device_ids[device];
+
+			result += string_printf("\t\tDevice: #%u\n", device);
+
+			APPEND_DEVICE_STRING_INFO(device_id, "Name", CL_DEVICE_NAME);
+			APPEND_DEVICE_STRING_INFO(device_id, "Vendor", CL_DEVICE_VENDOR);
+			APPEND_DEVICE_STRING_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION);
+			APPEND_DEVICE_STRING_INFO(device_id, "Profile", CL_DEVICE_PROFILE);
+			APPEND_DEVICE_STRING_INFO(device_id, "Version", CL_DEVICE_VERSION);
+			APPEND_DEVICE_STRING_INFO(device_id, "Extensions", CL_DEVICE_EXTENSIONS);
 		}
 	}
+
+#undef APPEND_STRING_INFO
+#undef APPEND_PLATFORM_STRING_INFO
+#undef APPEND_DEVICE_STRING_INFO
+
+	return result;
 }
 
 CCL_NAMESPACE_END
 
 #endif /* WITH_OPENCL */
-
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index dc124f8cf37..d527540f300 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include <stdlib.h>
@@ -111,7 +111,7 @@ void DeviceTask::update_progress(RenderTile *rtile)
 	if(update_tile_sample) {
 		double current_time = time_dt();
 
-		if (current_time - last_update_time >= 1.0) {
+		if(current_time - last_update_time >= 1.0) {
 			update_tile_sample(*rtile);
 
 			last_update_time = current_time;
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 50216adefe2..834ea60988a 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __DEVICE_TASK_H__
@@ -57,14 +57,15 @@ public:
 
 	void update_progress(RenderTile *rtile);
 
-	boost::function<bool(Device *device, RenderTile&)> acquire_tile;
-	boost::function<void(void)> update_progress_sample;
-	boost::function<void(RenderTile&)> update_tile_sample;
-	boost::function<void(RenderTile&)> release_tile;
-	boost::function<bool(void)> get_cancel;
+	function<bool(Device *device, RenderTile&)> acquire_tile;
+	function<void(void)> update_progress_sample;
+	function<void(RenderTile&)> update_tile_sample;
+	function<void(RenderTile&)> release_tile;
+	function<bool(void)> get_cancel;
 
 	bool need_finish_queue;
 	bool integrator_branched;
+	int2 requested_tile_size;
 protected:
 	double last_update_time;
 };
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 8857f86890c..b44e91751ad 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -1,3 +1,4 @@
+remove_extra_strict_flags()
 
 set(INC
 	.
@@ -11,9 +12,20 @@ set(INC_SYS
 )
 
 set(SRC
-	kernel.cpp
-	kernel.cl
-	kernel.cu
+	kernels/cpu/kernel.cpp
+	kernels/opencl/kernel.cl
+	kernels/opencl/kernel_data_init.cl
+	kernels/opencl/kernel_queue_enqueue.cl
+	kernels/opencl/kernel_scene_intersect.cl
+	kernels/opencl/kernel_lamp_emission.cl
+	kernels/opencl/kernel_background_buffer_update.cl
+	kernels/opencl/kernel_shader_eval.cl
+	kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+	kernels/opencl/kernel_direct_lighting.cl
+	kernels/opencl/kernel_shadow_blocked.cl
+	kernels/opencl/kernel_next_iteration_setup.cl
+	kernels/opencl/kernel_sum_all_radiance.cl
+	kernels/cuda/kernel.cu
 )
 
 set(SRC_HEADERS
@@ -24,6 +36,7 @@ set(SRC_HEADERS
 	kernel_compat_cpu.h
 	kernel_compat_cuda.h
 	kernel_compat_opencl.h
+	kernel_debug.h
 	kernel_differential.h
 	kernel_emission.h
 	kernel_film.h
@@ -34,17 +47,22 @@ set(SRC_HEADERS
 	kernel_montecarlo.h
 	kernel_passes.h
 	kernel_path.h
+	kernel_path_branched.h
+	kernel_path_common.h
 	kernel_path_state.h
 	kernel_path_surface.h
 	kernel_path_volume.h
 	kernel_projection.h
+	kernel_queues.h
 	kernel_random.h
 	kernel_shader.h
+	kernel_shaderdata_vars.h
 	kernel_shadow.h
 	kernel_subsurface.h
 	kernel_textures.h
 	kernel_types.h
 	kernel_volume.h
+	kernel_work_stealing.h
 )
 
 set(SRC_CLOSURE_HEADERS
@@ -61,12 +79,12 @@ set(SRC_CLOSURE_HEADERS
 	closure/bsdf_transparent.h
 	closure/bsdf_util.h
 	closure/bsdf_ashikhmin_shirley.h
-	closure/bsdf_westin.h
 	closure/bsdf_hair.h
 	closure/bssrdf.h
 	closure/emissive.h
 	closure/volume.h
 )
+
 set(SRC_SVM_HEADERS
 	svm/svm.h
 	svm/svm_attribute.h
@@ -91,6 +109,7 @@ set(SRC_SVM_HEADERS
 	svm/svm_magic.h
 	svm/svm_mapping.h
 	svm/svm_math.h
+	svm/svm_math_util.h
 	svm/svm_mix.h
 	svm/svm_musgrave.h
 	svm/svm_noise.h
@@ -106,6 +125,7 @@ set(SRC_SVM_HEADERS
 	svm/svm_value.h
 	svm/svm_vector_transform.h
 	svm/svm_voronoi.h
+	svm/svm_voxel.h
 	svm/svm_wave.h
 )
 
@@ -116,22 +136,48 @@ set(SRC_GEOM_HEADERS
 	geom/geom_bvh_shadow.h
 	geom/geom_bvh_subsurface.h
 	geom/geom_bvh_traversal.h
+	geom/geom_bvh_volume.h
+	geom/geom_bvh_volume_all.h
 	geom/geom_curve.h
 	geom/geom_motion_curve.h
 	geom/geom_motion_triangle.h
 	geom/geom_object.h
 	geom/geom_primitive.h
+	geom/geom_qbvh.h
+	geom/geom_qbvh_shadow.h
+	geom/geom_qbvh_subsurface.h
+	geom/geom_qbvh_traversal.h
+	geom/geom_qbvh_volume.h
+	geom/geom_qbvh_volume_all.h
 	geom/geom_triangle.h
+	geom/geom_triangle_intersect.h
 	geom/geom_volume.h
 )
 
 set(SRC_UTIL_HEADERS
+	../util/util_atomic.h
 	../util/util_color.h
 	../util/util_half.h
 	../util/util_math.h
+	../util/util_math_fast.h
 	../util/util_transform.h
 	../util/util_types.h
 )
+
+set(SRC_SPLIT_HEADERS
+	split/kernel_background_buffer_update.h
+	split/kernel_data_init.h
+	split/kernel_direct_lighting.h
+	split/kernel_holdout_emission_blurring_pathtermination_ao.h
+	split/kernel_lamp_emission.h
+	split/kernel_next_iteration_setup.h
+	split/kernel_scene_intersect.h
+	split/kernel_shader_eval.h
+	split/kernel_shadow_blocked.h
+	split/kernel_split_common.h
+	split/kernel_sum_all_radiance.h
+)
+
 # CUDA module
 
 if(WITH_CYCLES_CUDA_BINARIES)
@@ -143,7 +189,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	endif()
 
 	# CUDA version
-	execute_process (COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT)
+	execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT)
 	string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR ${NVCC_OUT})
 	string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR ${NVCC_OUT})
 	set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")
@@ -157,18 +203,24 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	endif()
 
 	# build for each arch
-	set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
+	set(cuda_sources kernels/cuda/kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
 	set(cuda_cubins)
 
 	macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
 		if(${experimental})
-			set(cuda_extra_flags "-D__KERNEL_CUDA_EXPERIMENTAL__")
+			set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__")
 			set(cuda_cubin kernel_experimental_${arch}.cubin)
 		else()
 			set(cuda_extra_flags "")
 			set(cuda_cubin kernel_${arch}.cubin)
 		endif()
 
+		if(WITH_CYCLES_DEBUG)
+			set(cuda_debug_flags "-D__KERNEL_DEBUG__")
+		else()
+			set(cuda_debug_flags "")
+		endif()
+
 		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}")
 		set(cuda_math_flags "--use_fast_math")
 
@@ -177,13 +229,14 @@ if(WITH_CYCLES_CUDA_BINARIES)
 			COMMAND ${CUDA_NVCC_EXECUTABLE}
 					-arch=${arch}
 					-m${CUDA_BITS}
-					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu
+					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu
 					-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
 					--ptxas-options="-v"
 					${cuda_arch_flags}
 					${cuda_version_flags}
 					${cuda_math_flags}
 					${cuda_extra_flags}
+					${cuda_debug_flags}
 					-I${CMAKE_CURRENT_SOURCE_DIR}/../util
 					-I${CMAKE_CURRENT_SOURCE_DIR}/svm
 					-DCCL_NAMESPACE_BEGIN=
@@ -196,6 +249,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
 		list(APPEND cuda_cubins ${cuda_cubin})
 
 		unset(cuda_extra_flags)
+		unset(cuda_debug_flags)
 	endmacro()
 
 	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
@@ -223,20 +277,29 @@ include_directories(SYSTEM ${INC_SYS})
 
 if(CXX_HAS_SSE)
 	list(APPEND SRC
-		kernel_sse2.cpp
-		kernel_sse3.cpp
-		kernel_sse41.cpp
-		kernel_avx.cpp
-		kernel_avx2.cpp
+		kernels/cpu/kernel_sse2.cpp
+		kernels/cpu/kernel_sse3.cpp
+		kernels/cpu/kernel_sse41.cpp
 	)
 
-	set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
-	set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
-	set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
-	set_source_files_properties(kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
-	set_source_files_properties(kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
 endif()
 
+if(CXX_HAS_AVX)
+	list(APPEND SRC
+		kernels/cpu/kernel_avx.cpp
+	)
+	set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+endif()
+
+if(CXX_HAS_AVX2)
+	list(APPEND SRC
+		kernels/cpu/kernel_avx2.cpp
+	)
+	set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+endif()
 
 add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS})
 
@@ -254,11 +317,23 @@ endif()
 #add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED})
 #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
 
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cl" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cu" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_background_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/split)
 
diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript
index 5a9e57c5342..e8d51013924 100644
--- a/intern/cycles/kernel/SConscript
+++ b/intern/cycles/kernel/SConscript
@@ -57,8 +57,9 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     build_dir = os.path.join(root_build_dir, 'intern/cycles/kernel')
 
     # source directories and files
+    kernel_file_rel = os.path.join("kernels", "cuda", "kernel.cu")
     source_dir = Dir('.').srcnode().path
-    kernel_file = os.path.join(source_dir, "kernel.cu")
+    kernel_file = os.path.join(source_dir, kernel_file_rel)
     util_dir = os.path.join(source_dir, "../util")
     svm_dir = os.path.join(source_dir, "../svm")
     geom_dir = os.path.join(source_dir, "../geom")
@@ -79,12 +80,15 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     nvcc_flags += " -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC"
     nvcc_flags += " -I \"%s\" -I \"%s\" -I \"%s\" -I \"%s\"" % (util_dir, svm_dir, geom_dir, closure_dir)
 
+    if env['WITH_BF_CYCLES_DEBUG']:
+        nvcc_flags += " -D__KERNEL_DEBUG__"
+
     # dependencies
-    dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h')
+    dependencies = [kernel_file_rel] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h')
     last_cubin_file = None
 
     configs = (("kernel_%s.cubin", ''),
-               ("kernel_experimental_%s.cubin", ' -D__KERNEL_CUDA_EXPERIMENTAL__'))
+               ("kernel_experimental_%s.cubin", ' -D__KERNEL_EXPERIMENTAL__'))
 
     # add command for each cuda architecture
     for arch in cuda_archs:
@@ -102,7 +106,7 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
             else:
                 command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, current_flags, kernel_file, cubin_file)
 
-            kernel.Command(cubin_file, 'kernel.cu', command)
+            kernel.Command(cubin_file, kernel_file_rel, command)
             kernel.Depends(cubin_file, dependencies)
 
             kernel_binaries.append(cubin_file)
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 9961071c2ac..558aa0dc6a9 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "../closure/bsdf_ashikhmin_velvet.h"
@@ -24,7 +24,6 @@
 #include "../closure/bsdf_refraction.h"
 #include "../closure/bsdf_transparent.h"
 #include "../closure/bsdf_ashikhmin_shirley.h"
-#include "../closure/bsdf_westin.h"
 #include "../closure/bsdf_toon.h"
 #include "../closure/bsdf_hair.h"
 #ifdef __SUBSURFACE__
@@ -48,87 +47,79 @@ ccl_device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const Shader
 	switch(sc->type) {
 		case CLOSURE_BSDF_DIFFUSE_ID:
 		case CLOSURE_BSDF_BSSRDF_ID:
-			label = bsdf_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #ifdef __SVM__
 		case CLOSURE_BSDF_OREN_NAYAR_ID:
-			label = bsdf_oren_nayar_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		/*case CLOSURE_BSDF_PHONG_RAMP_ID:
-			label = bsdf_phong_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-			label = bsdf_diffuse_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;*/
 		case CLOSURE_BSDF_TRANSLUCENT_ID:
-			label = bsdf_translucent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_REFLECTION_ID:
-			label = bsdf_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_REFRACTION_ID:
-			label = bsdf_refraction_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_refraction_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_TRANSPARENT_ID:
-			label = bsdf_transparent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_transparent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_GGX_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-			label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_microfacet_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-			label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_microfacet_beckmann_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-			label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_ashikhmin_shirley_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-			label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_ashikhmin_velvet_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-			label = bsdf_diffuse_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_diffuse_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_GLOSSY_TOON_ID:
-			label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
-				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
-			break;
-		case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID:
-			label = bsdf_westin_backscatter_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
-				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
-			break;
-		case CLOSURE_BSDF_WESTIN_SHEEN_ID:
-			label = bsdf_westin_sheen_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_glossy_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-			label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_hair_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-			label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_hair_transmission_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 #ifdef __VOLUME__
 		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-			label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+			label = volume_henyey_greenstein_sample(sc, ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 		default:
@@ -148,73 +139,67 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
 		return OSLShader::bsdf_eval(sd, sc, omega_in, *pdf);
 #endif
 
-	if(dot(sd->Ng, omega_in) >= 0.0f) {
+	if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) {
 		switch(sc->type) {
 			case CLOSURE_BSDF_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_ID:
-				eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #ifdef __SVM__
 			case CLOSURE_BSDF_OREN_NAYAR_ID:
-				eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			/*case CLOSURE_BSDF_PHONG_RAMP_ID:
-				eval = bsdf_phong_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-				eval = bsdf_diffuse_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;*/
 			case CLOSURE_BSDF_TRANSLUCENT_ID:
-				eval = bsdf_translucent_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFLECTION_ID:
-				eval = bsdf_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFRACTION_ID:
-				eval = bsdf_refraction_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_refraction_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSPARENT_ID:
-				eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_transparent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-				eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-				eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_beckmann_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-				eval = bsdf_diffuse_toon_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
-				eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf);
-				break;
-			case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID:
-				eval = bsdf_westin_backscatter_eval_reflect(sc, sd->I, omega_in, pdf);
-				break;
-			case CLOSURE_BSDF_WESTIN_SHEEN_ID:
-				eval = bsdf_westin_sheen_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_glossy_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-				eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-				eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_transmission_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 #ifdef __VOLUME__
 			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+				eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 			default:
@@ -226,63 +211,57 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
 		switch(sc->type) {
 			case CLOSURE_BSDF_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_ID:
-				eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #ifdef __SVM__
 			case CLOSURE_BSDF_OREN_NAYAR_ID:
-				eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_oren_nayar_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSLUCENT_ID:
-				eval = bsdf_translucent_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_translucent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFLECTION_ID:
-				eval = bsdf_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFRACTION_ID:
-				eval = bsdf_refraction_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_refraction_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSPARENT_ID:
-				eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_transparent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-				eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-				eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_beckmann_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-				eval = bsdf_diffuse_toon_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
-				eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf);
-				break;
-			case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID:
-				eval = bsdf_westin_backscatter_eval_transmit(sc, sd->I, omega_in, pdf);
-				break;
-			case CLOSURE_BSDF_WESTIN_SHEEN_ID:
-				eval = bsdf_westin_sheen_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_glossy_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-				eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-				eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_transmission_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 #ifdef __VOLUME__
 			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+				eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 			default:
@@ -296,6 +275,8 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
 
 ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
 {
+/* TODO: do we want to blur volume closures? */
+
 #ifdef __OSL__
 	if(kg->osl && sc->prim) {
 		OSLShader::bsdf_blur(sc, roughness);
@@ -303,33 +284,8 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
 	}
 #endif
 
-	switch(sc->type) {
-		case CLOSURE_BSDF_DIFFUSE_ID:
-		case CLOSURE_BSDF_BSSRDF_ID:
-			bsdf_diffuse_blur(sc, roughness);
-			break;
 #ifdef __SVM__
-		case CLOSURE_BSDF_OREN_NAYAR_ID:
-			bsdf_oren_nayar_blur(sc, roughness);
-			break;
-		/*case CLOSURE_BSDF_PHONG_RAMP_ID:
-			bsdf_phong_ramp_blur(sc, roughness);
-			break;
-		case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-			bsdf_diffuse_ramp_blur(sc, roughness);
-			break;*/
-		case CLOSURE_BSDF_TRANSLUCENT_ID:
-			bsdf_translucent_blur(sc, roughness);
-			break;
-		case CLOSURE_BSDF_REFLECTION_ID:
-			bsdf_reflection_blur(sc, roughness);
-			break;
-		case CLOSURE_BSDF_REFRACTION_ID:
-			bsdf_refraction_blur(sc, roughness);
-			break;
-		case CLOSURE_BSDF_TRANSPARENT_ID:
-			bsdf_transparent_blur(sc, roughness);
-			break;
+	switch(sc->type) {
 		case CLOSURE_BSDF_MICROFACET_GGX_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
@@ -344,30 +300,10 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
 			bsdf_ashikhmin_shirley_blur(sc, roughness);
 			break;
-		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-			bsdf_ashikhmin_velvet_blur(sc, roughness);
-			break;
-		case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-			bsdf_diffuse_toon_blur(sc, roughness);
-			break;
-		case CLOSURE_BSDF_GLOSSY_TOON_ID:
-			bsdf_glossy_toon_blur(sc, roughness);
-			break;
-		case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID:
-			bsdf_westin_backscatter_blur(sc, roughness);
-			break;
-		case CLOSURE_BSDF_WESTIN_SHEEN_ID:
-			bsdf_westin_sheen_blur(sc, roughness);
-			break;
-		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-			bsdf_hair_reflection_blur(sc, roughness);
-			break;
-#endif
-		/* todo: do we want to blur volume closures? */
 		default:
 			break;
 	}
+#endif
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
index ad7864cb8ea..8d7d533d6f8 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011-2013 Blender Foundation
+ * Copyright 2011-2014 Blender Foundation
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __BSDF_ASHIKHMIN_SHIRLEY_H__
@@ -33,24 +33,20 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device int bsdf_ashikhmin_shirley_setup(ShaderClosure *sc)
 {
-	/* store roughness. could already convert to exponent to save some cycles
-	 * in eval, but this is more consistent with other bsdfs and shader_blur. */
 	sc->data0 = clamp(sc->data0, 1e-4f, 1.0f);
 	sc->data1 = sc->data0;
 
 	sc->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID;
-	return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device int bsdf_ashikhmin_shirley_aniso_setup(ShaderClosure *sc)
 {
-	/* store roughness. could already convert to exponent to save some cycles
-	 * in eval, but this is more consistent with other bsdfs and shader_blur. */
 	sc->data0 = clamp(sc->data0, 1e-4f, 1.0f);
 	sc->data1 = clamp(sc->data1, 1e-4f, 1.0f);
 
 	sc->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID;
-	return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device void bsdf_ashikhmin_shirley_blur(ShaderClosure *sc, float roughness)
@@ -73,7 +69,10 @@ ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, c
 
 	float out = 0.0f;
 
-	if (NdotI > 0.0f && NdotO > 0.0f) {
+	if(fmaxf(sc->data0, sc->data1) <= 1e-4f)
+		return make_float3(0.0f, 0.0f, 0.0f);
+
+	if(NdotI > 0.0f && NdotO > 0.0f) {
 		NdotI = fmaxf(NdotI, 1e-6f);
 		NdotO = fmaxf(NdotO, 1e-6f);
 		float3 H = normalize(omega_in + I);
@@ -86,7 +85,8 @@ ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, c
 		float n_x = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data0);
 		float n_y = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data1);
 
-		if (n_x == n_y) {  /* => isotropic case */
+		if(n_x == n_y) {
+			/* isotropic */
 			float e = n_x;
 			float lobe = powf(HdotN, e);
 			float norm = (n_x + 1.0f) / (8.0f * M_PI_F);
@@ -94,7 +94,8 @@ ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, c
 			out = NdotO * norm * lobe * pump;
 			*pdf = norm * lobe / HdotI; /* this is p_h / 4(H.I)  (conversion from 'wh measure' to 'wi measure', eq. 8 in paper) */
 		}
-		else {             /* => ANisotropic case */
+		else {
+			/* anisotropic */
 			float3 X, Y;
 			make_orthonormals_tangent(N, sc->T, &X, &Y);
 
@@ -130,7 +131,7 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 	float3 N = sc->N;
 
 	float NdotI = dot(N, I);
-	if (NdotI > 0.0f) {
+	if(NdotI > 0.0f) {
 
 		float n_x = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data0);
 		float n_y = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data1);
@@ -146,21 +147,23 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 		/* sample spherical coords for h in tangent space */
 		float phi;
 		float cos_theta;
-		if (n_x == n_y) {  /* => simple isotropic sampling */
+		if(n_x == n_y) {
+			/* isotropic sampling */
 			phi = M_2PI_F * randu;
 			cos_theta = powf(randv, 1.0f / (n_x + 1.0f));
 		}
-		else {             /* => more complex anisotropic sampling */
-			if (randu < 0.25f) {      /* first quadrant */
+		else {
+			/* anisotropic sampling */
+			if(randu < 0.25f) {      /* first quadrant */
 				float remapped_randu = 4.0f * randu;
 				bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta);
 			}
-			else if (randu < 0.5f) {  /* second quadrant */
+			else if(randu < 0.5f) {  /* second quadrant */
 				float remapped_randu = 4.0f * (.5f - randu);
 				bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta);
 				phi = M_PI_F - phi;
 			}
-			else if (randu < 0.75f) { /* third quadrant */
+			else if(randu < 0.75f) { /* third quadrant */
 				float remapped_randu = 4.0f * (randu - 0.5f);
 				bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta);
 				phi = M_PI_F + phi;
@@ -185,14 +188,20 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 		/* half vector to world space */
 		float3 H = h.x*X + h.y*Y + h.z*N;
 		float HdotI = dot(H, I);
-		if (HdotI < 0.0f) H = -H;
+		if(HdotI < 0.0f) H = -H;
 
 		/* reflect I on H to get omega_in */
 		*omega_in = -I + (2.0f * HdotI) * H;
 
-		/* leave the rest to eval_reflect */
-		/* (could maybe optimize a few things by manual inlining, but I doubt it would make much difference) */
-		*eval = bsdf_ashikhmin_shirley_eval_reflect(sc, I, *omega_in, pdf);
+		if(fmaxf(sc->data0, sc->data1) <= 1e-4f) {
+			/* Roughness is at its lower clamp (1e-4): skip eval_reflect and use a large finite pdf/eval for MIS. */
+			*pdf = 1e6f;
+			*eval = make_float3(1e6f, 1e6f, 1e6f);
+		}
+		else {
+			/* leave the rest to eval_reflect */
+			*eval = bsdf_ashikhmin_shirley_eval_reflect(sc, I, *omega_in, pdf);
+		}
 
 #ifdef __RAY_DIFFERENTIALS__
 		/* just do the reflection thing for now */
@@ -201,7 +210,7 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 #endif
 	}
 
-	return LABEL_REFLECT | LABEL_GLOSSY;
+	return LABEL_REFLECT|LABEL_GLOSSY;
 }
 
 
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index 3631f90bf8c..f1a26650078 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -45,10 +45,6 @@ ccl_device int bsdf_ashikhmin_velvet_setup(ShaderClosure *sc)
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device void bsdf_ashikhmin_velvet_blur(ShaderClosure *sc, float roughness)
-{
-}
-
 ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	float m_invsigma2 = sc->data0;
@@ -63,7 +59,7 @@ ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc, co
 		float cosHO = fabsf(dot(I, H));
 
 		if(!(fabsf(cosNH) < 1.0f-1e-5f && cosHO > 1e-5f))
-			return make_float3(0, 0, 0);
+			return make_float3(0.0f, 0.0f, 0.0f);
 
 		float cosNHdivHO = cosNH / cosHO;
 		cosNHdivHO = fmaxf(cosNHdivHO, 1e-5f);
@@ -84,7 +80,7 @@ ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc, co
 		return make_float3(out, out, out);
 	}
 
-	return make_float3(0, 0, 0);
+	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
 ccl_device float3 bsdf_ashikhmin_velvet_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
@@ -118,7 +114,7 @@ ccl_device int bsdf_ashikhmin_velvet_sample(const ShaderClosure *sc, float3 Ng,
 
 			float sinNH2 = 1 - cosNH * cosNH;
 			float sinNH4 = sinNH2 * sinNH2;
-			float cotangent2 =  (cosNH * cosNH) / sinNH2;
+			float cotangent2 = (cosNH * cosNH) / sinNH2;
 
 			float D = expf(-cotangent2 * m_invsigma2) * m_invsigma2 * M_1_PI_F / sinNH4;
 			float G = min(1.0f, min(fac1, fac2)); // TODO: derive G from D analytically
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index 949fe869549..4b29bb096d1 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -43,10 +43,6 @@ ccl_device int bsdf_diffuse_setup(ShaderClosure *sc)
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device void bsdf_diffuse_blur(ShaderClosure *sc, float roughness)
-{
-}
-
 ccl_device float3 bsdf_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	float3 N = sc->N;
@@ -90,10 +86,6 @@ ccl_device int bsdf_translucent_setup(ShaderClosure *sc)
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device void bsdf_translucent_blur(ShaderClosure *sc, float roughness)
-{
-}
-
 ccl_device float3 bsdf_translucent_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
@@ -108,11 +100,6 @@ ccl_device float3 bsdf_translucent_eval_transmit(const ShaderClosure *sc, const
 	return make_float3 (cos_pi, cos_pi, cos_pi);
 }
 
-ccl_device float bsdf_translucent_albedo(const ShaderClosure *sc, const float3 I)
-{
-	return 1.0f;
-}
-
 ccl_device int bsdf_translucent_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 	float3 N = sc->N;
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index b856774375f..e0287e7655a 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -41,9 +41,9 @@ ccl_device float3 bsdf_diffuse_ramp_get_color(const ShaderClosure *sc, const flo
 	
 	float npos = pos * (float)(MAXCOLORS - 1);
 	int ipos = float_to_int(npos);
-	if (ipos < 0)
+	if(ipos < 0)
 		return colors[0];
-	if (ipos >= (MAXCOLORS - 1))
+	if(ipos >= (MAXCOLORS - 1))
 		return colors[MAXCOLORS - 1];
 	float offset = npos - (float)ipos;
 	return colors[ipos] * (1.0f - offset) + colors[ipos+1] * offset;
@@ -52,7 +52,9 @@ ccl_device float3 bsdf_diffuse_ramp_get_color(const ShaderClosure *sc, const flo
 ccl_device int bsdf_diffuse_ramp_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_DIFFUSE_RAMP_ID;
-	return SD_BSDF | SD_BSDF_HAS_EVAL;
+	sc->data0 = 0.0f;
+	sc->data1 = 0.0f;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device void bsdf_diffuse_ramp_blur(ShaderClosure *sc, float roughness)
diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h
index e0b5454592b..1e81617a7d3 100644
--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -36,20 +36,12 @@
 CCL_NAMESPACE_BEGIN
 
 
-ccl_device void bsdf_hair_reflection_blur(ShaderClosure *sc, float roughness)
-{
-}
-
-ccl_device void bsdf_hair_transmission_blur(ShaderClosure *sc, float roughness)
-{
-}
-
 ccl_device int bsdf_hair_reflection_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_HAIR_REFLECTION_ID;
 	sc->data0 = clamp(sc->data0, 0.001f, 1.0f);
 	sc->data1 = clamp(sc->data1, 0.001f, 1.0f);
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device int bsdf_hair_transmission_setup(ShaderClosure *sc)
@@ -57,31 +49,25 @@ ccl_device int bsdf_hair_transmission_setup(ShaderClosure *sc)
 	sc->type = CLOSURE_BSDF_HAIR_TRANSMISSION_ID;
 	sc->data0 = clamp(sc->data0, 0.001f, 1.0f);
 	sc->data1 = clamp(sc->data1, 0.001f, 1.0f);
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-#ifdef __HAIR__
 	float offset = sc->data2;
 	float3 Tg = sc->T;
-#else
-	float offset = 0.0f;
-	float3 Tg = make_float3(1.0f, 0.0f, 0.0f);
-#endif
 	float roughness1 = sc->data0;
 	float roughness2 = sc->data1;
 
 	float Iz = dot(Tg, I);
 	float3 locy = normalize(I - Tg * Iz);
-	//float3 locx = cross(locy, Tg);
 
-	float theta_r = M_PI_2_F - safe_acosf(Iz);
+	float theta_r = M_PI_2_F - fast_acosf(Iz);
 
 	float omega_in_z = dot(Tg, omega_in);
 	float3 omega_in_y = normalize(omega_in - Tg * omega_in_z);
 
-	float theta_i = M_PI_2_F - safe_acosf(omega_in_z);
+	float theta_i = M_PI_2_F - fast_acosf(omega_in_z);
 	float cosphi_i = dot(omega_in_y, locy);
 
 	if(M_PI_2_F - fabsf(theta_i) < 0.001f || cosphi_i < 0.0f) {
@@ -89,17 +75,19 @@ ccl_device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc, con
 		return make_float3(*pdf, *pdf, *pdf);
 	}
 
-	float phi_i = safe_acosf(cosphi_i) / roughness2;
+	float roughness1_inv = 1.0f / roughness1;
+	float roughness2_inv = 1.0f / roughness2;
+	float phi_i = fast_acosf(cosphi_i) * roughness2_inv;
 	phi_i = fabsf(phi_i) < M_PI_F ? phi_i : M_PI_F;
-	float costheta_i = cosf(theta_i);
+	float costheta_i = fast_cosf(theta_i);
 
-	float a_R = atan2f(((M_PI_2_F + theta_r) * 0.5f - offset) / roughness1, 1.0f);
-	float b_R = atan2f(((-M_PI_2_F + theta_r) * 0.5f - offset) / roughness1, 1.0f);
+	float a_R = fast_atan2f(((M_PI_2_F + theta_r) * 0.5f - offset) * roughness1_inv, 1.0f);
+	float b_R = fast_atan2f(((-M_PI_2_F + theta_r) * 0.5f - offset) * roughness1_inv, 1.0f);
 
 	float theta_h = (theta_i + theta_r) * 0.5f;
 	float t = theta_h - offset;
 
-	float phi_pdf = cosf(phi_i * 0.5f) * 0.25f / roughness2;
+	float phi_pdf = fast_cosf(phi_i * 0.5f) * 0.25f * roughness2_inv;
 	float theta_pdf = roughness1 / (2 * (t*t + roughness1*roughness1) * (a_R - b_R)* costheta_i);
 	*pdf = phi_pdf * theta_pdf;
 
@@ -119,37 +107,32 @@ ccl_device float3 bsdf_hair_reflection_eval_transmit(const ShaderClosure *sc, co
 
 ccl_device float3 bsdf_hair_transmission_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-#ifdef __HAIR__
 	float offset = sc->data2;
 	float3 Tg = sc->T;
-#else
-	float offset = 0.0f;
-	float3 Tg = make_float3(1.0f, 0.0f, 0.0f);
-#endif
 	float roughness1 = sc->data0;
 	float roughness2 = sc->data1;
 	float Iz = dot(Tg, I);
 	float3 locy = normalize(I - Tg * Iz);
-	//float3 locx = cross(locy, Tg);
 
-	float theta_r = M_PI_2_F - safe_acosf(Iz);
+	float theta_r = M_PI_2_F - fast_acosf(Iz);
 
 	float omega_in_z = dot(Tg, omega_in);
 	float3 omega_in_y = normalize(omega_in - Tg * omega_in_z);
 
-	float theta_i = M_PI_2_F - safe_acosf(omega_in_z);
-	float phi_i = safe_acosf(dot(omega_in_y, locy));
+	float theta_i = M_PI_2_F - fast_acosf(omega_in_z);
+	float phi_i = fast_acosf(dot(omega_in_y, locy));
 
 	if(M_PI_2_F - fabsf(theta_i) < 0.001f) {
 		*pdf = 0.0f;
 		return make_float3(*pdf, *pdf, *pdf);
 	}
 
-	float costheta_i = cosf(theta_i);
+	float costheta_i = fast_cosf(theta_i);
 
-	float a_TT = atan2f(((M_PI_2_F + theta_r)/2 - offset) / roughness1, 1.0f);
-	float b_TT = atan2f(((-M_PI_2_F + theta_r)/2 - offset) / roughness1, 1.0f);
-	float c_TT = 2 * atan2f(M_PI_2_F / roughness2, 1.0f);
+	float roughness1_inv = 1.0f / roughness1;
+	float a_TT = fast_atan2f(((M_PI_2_F + theta_r)/2 - offset) * roughness1_inv, 1.0f);
+	float b_TT = fast_atan2f(((-M_PI_2_F + theta_r)/2 - offset) * roughness1_inv, 1.0f);
+	float c_TT = 2 * fast_atan2f(M_PI_2_F / roughness2, 1.0f);
 
 	float theta_h = (theta_i + theta_r) / 2;
 	float t = theta_h - offset;
@@ -165,39 +148,38 @@ ccl_device float3 bsdf_hair_transmission_eval_transmit(const ShaderClosure *sc,
 
 ccl_device int bsdf_hair_reflection_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-#ifdef __HAIR__
 	float offset = sc->data2;
 	float3 Tg = sc->T;
-#else
-	float offset = 0.0f;
-	float3 Tg = make_float3(1.0f, 0.0f, 0.0f);
-#endif
 	float roughness1 = sc->data0;
 	float roughness2 = sc->data1;
 	float Iz = dot(Tg, I);
 	float3 locy = normalize(I - Tg * Iz);
 	float3 locx = cross(locy, Tg);
-	float theta_r = M_PI_2_F - safe_acosf(Iz);
+	float theta_r = M_PI_2_F - fast_acosf(Iz);
 
-	float a_R = atan2f(((M_PI_2_F + theta_r) * 0.5f - offset) / roughness1, 1.0f);
-	float b_R = atan2f(((-M_PI_2_F + theta_r) * 0.5f - offset) / roughness1, 1.0f);
+	float roughness1_inv = 1.0f / roughness1;
+	float a_R = fast_atan2f(((M_PI_2_F + theta_r) * 0.5f - offset) * roughness1_inv, 1.0f);
+	float b_R = fast_atan2f(((-M_PI_2_F + theta_r) * 0.5f - offset) * roughness1_inv, 1.0f);
 
 	float t = roughness1 * tanf(randu * (a_R - b_R) + b_R);
 
 	float theta_h = t + offset;
 	float theta_i = 2 * theta_h - theta_r;
-	float costheta_i = cosf(theta_i);
-	float sintheta_i = sinf(theta_i);
+
+	float costheta_i, sintheta_i;
+	fast_sincosf(theta_i, &sintheta_i, &costheta_i);
 
 	float phi = 2 * safe_asinf(1 - 2 * randv) * roughness2;
 
-	float phi_pdf = cosf(phi * 0.5f) * 0.25f / roughness2;
+	float phi_pdf = fast_cosf(phi * 0.5f) * 0.25f / roughness2;
 
 	float theta_pdf = roughness1 / (2 * (t*t + roughness1*roughness1) * (a_R - b_R)*costheta_i);
 
-	*omega_in =(cosf(phi) * costheta_i) * locy -
-			   (sinf(phi) * costheta_i) * locx +
-			   (            sintheta_i) * Tg;
+	float sinphi, cosphi;
+	fast_sincosf(phi, &sinphi, &cosphi);
+	*omega_in =(cosphi * costheta_i) * locy -
+	           (sinphi * costheta_i) * locx +
+	           (         sintheta_i) * Tg;
 
 	//differentials - TODO: find a better approximation for the reflective bounce
 #ifdef __RAY_DIFFERENTIALS__
@@ -211,48 +193,43 @@ ccl_device int bsdf_hair_reflection_sample(const ShaderClosure *sc, float3 Ng, f
 
 	*eval = make_float3(*pdf, *pdf, *pdf);
 
-	if(dot(locy, *omega_in) < 0.0f) {
-		return LABEL_REFLECT|LABEL_TRANSMIT|LABEL_GLOSSY;
-	}
-	
 	return LABEL_REFLECT|LABEL_GLOSSY;
 }
 
 ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-#ifdef __HAIR__
 	float offset = sc->data2;
 	float3 Tg = sc->T;
-#else
-	float offset = 0.0f;
-	float3 Tg = make_float3(1.0f, 0.0f, 0.0f);
-#endif
 	float roughness1 = sc->data0;
 	float roughness2 = sc->data1;
 	float Iz = dot(Tg, I);
 	float3 locy = normalize(I - Tg * Iz);
 	float3 locx = cross(locy, Tg);
-	float theta_r = M_PI_2_F - safe_acosf(Iz);
+	float theta_r = M_PI_2_F - fast_acosf(Iz);
 
-	float a_TT = atan2f(((M_PI_2_F + theta_r)/2 - offset) / roughness1, 1.0f);
-	float b_TT = atan2f(((-M_PI_2_F + theta_r)/2 - offset) / roughness1, 1.0f);
-	float c_TT = 2 * atan2f(M_PI_2_F / roughness2, 1.0f);
+	float roughness1_inv = 1.0f / roughness1;
+	float a_TT = fast_atan2f(((M_PI_2_F + theta_r)/2 - offset) * roughness1_inv, 1.0f);
+	float b_TT = fast_atan2f(((-M_PI_2_F + theta_r)/2 - offset) * roughness1_inv, 1.0f);
+	float c_TT = 2 * fast_atan2f(M_PI_2_F / roughness2, 1.0f);
 
 	float t = roughness1 * tanf(randu * (a_TT - b_TT) + b_TT);
 
 	float theta_h = t + offset;
 	float theta_i = 2 * theta_h - theta_r;
-	float costheta_i = cosf(theta_i);
-	float sintheta_i = sinf(theta_i);
+
+	float costheta_i, sintheta_i;
+	fast_sincosf(theta_i, &sintheta_i, &costheta_i);
 
 	float p = roughness2 * tanf(c_TT * (randv - 0.5f));
 	float phi = p + M_PI_F;
 	float theta_pdf = roughness1 / (2 * (t*t + roughness1*roughness1) * (a_TT - b_TT) * costheta_i);
 	float phi_pdf = roughness2 / (c_TT * (p * p + roughness2 * roughness2));
 
-	*omega_in =(cosf(phi) * costheta_i) * locy -
-	           (sinf(phi) * costheta_i) * locx +
-	           (            sintheta_i) * Tg;
+	float sinphi, cosphi;
+	fast_sincosf(phi, &sinphi, &cosphi);
+	*omega_in =(cosphi * costheta_i) * locy -
+	           (sinphi * costheta_i) * locx +
+	           (         sintheta_i) * Tg;
 
 	//differentials - TODO: find a better approximation for the transmission bounce
 #ifdef __RAY_DIFFERENTIALS__
@@ -267,10 +244,9 @@ ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng,
 
 	*eval = make_float3(*pdf, *pdf, *pdf);
 
-	if(dot(locy, *omega_in) < 0.0f)
-		return LABEL_TRANSMIT|LABEL_GLOSSY;
-	
-	return LABEL_GLOSSY;
+	kernel_assert(dot(locy, *omega_in) < 0.0f);
+
+	return LABEL_TRANSMIT|LABEL_GLOSSY;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index a0c59e6cbc0..2a0e8f62e7c 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -35,145 +35,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Approximate erf and erfinv implementations
- *
- * Adapted from code (C) Copyright John Maddock 2006.
- * Use, modification and distribution are subject to the
- * Boost Software License, Version 1.0. (See accompanying file
- * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) */
-
-ccl_device float approx_erff_impl(float z)
-{
-	float result;
-
-	if(z < 0.5f) {
-		if(z < 1e-10f) {
-			if(z == 0) {
-				result = 0;
-			}
-			else {
-				float c = 0.0033791670f;
-				result = z * 1.125f + z * c;
-			}
-		}
-		else {
-			float Y = 1.044948577f;
-
-			float zz = z * z;
-			float num = (((-0.007727583f * zz) + -0.050999073f)*zz + -0.338165134f)*zz + 0.083430589f;
-			float denom = (((0.000370900f * zz) + 0.008585719f)*zz + 0.087522260f)*zz + 0.455004033f;
-			result = z * (Y + num / denom);
-		}
-	}
-	else if(z < 2.5f) {
-		if(z < 1.5f) {
-			float Y = 0.4059357643f;
-			float fz = z - 0.5f;
-
-			float num = (((0.088890036f * fz) + 0.191003695f)*fz + 0.178114665f)*fz + -0.098090592f;
-			float denom = (((0.123850974f * fz) + 0.578052804f)*fz + 1.426280048f)*fz + 1.847590709f;
-
-			result = Y + num / denom;
-			result *= expf(-z * z) / z;
-		}
-		else  {
-			float Y = 0.506728172f;
-			float fz = z - 1.5f;
-			float num = (((0.017567943f * fz) + 0.043948189f)*fz + 0.038654037f)*fz + -0.024350047f;
-			float denom = (((0.325732924f * fz) + 0.982403709f)*fz + 1.539914949f)*fz + 1;
-
-			result = Y + num / denom;
-			result *= expf(-z * z) / z;
-		}
-
-		result = 1 - result;
-	}
-	else {
-		result = 1;
-	}
-
-	return result;
-}
-
-ccl_device float approx_erff(float z)
-{
-	float s = 1.0f;
-
-	if(z < 0.0f) {
-		s = -1.0f;
-		z = -z;
-	}
-
-	return s * approx_erff_impl(z);
-}
-
-ccl_device float approx_erfinvf_impl(float p, float q)
-{
-	float result = 0;
-
-	if(p <= 0.5f) {
-		float Y = 0.089131474f;
-		float g = p * (p + 10);
-		float num = (((-0.012692614f * p) + 0.033480662f)*p + -0.008368748f)*p + -0.000508781f;
-		float denom = (((1.562215583f * p) + -1.565745582f)*p + -0.970005043f)*p + 1.0f;
-		float r = num / denom;
-		result = g * Y + g * r;
-	}
-	else if(q >= 0.25f) {
-		float Y = 2.249481201f;
-		float g = sqrtf(-2 * logf(q));
-		float xs = q - 0.25f;
-		float num = (((17.644729840f * xs) + 8.370503283f)*xs + 0.105264680f)*xs + -0.202433508f;
-		float denom = (((-28.660818049f * xs) + 3.971343795f)*xs + 6.242641248f)*xs + 1.0f;
-		float r = num / denom;
-		result = g / (Y + r);
-	}
-	else {
-		float x = sqrtf(-logf(q));
-
-		if(x < 3) {
-			float Y = 0.807220458f;
-			float xs = x - 1.125f;
-			float num = (((0.387079738f * xs) + 0.117030156f)*xs + -0.163794047f)*xs + -0.131102781f;
-			float denom = (((4.778465929f * xs) + 5.381683457f)*xs + 3.466254072f)*xs + 1.0f;
-			float R = num / denom;
-			result = Y * x + R * x;
-		}
-		else {
-			float Y = 0.939955711f;
-			float xs = x - 3;
-			float num = (((0.009508047f * xs) + 0.018557330f)*xs + -0.002224265f)*xs + -0.035035378f;
-			float denom = (((0.220091105f * xs) + 0.762059164f)*xs + 1.365334981f)*xs + 1.0f;
-			float R = num / denom;
-			result = Y * x + R * x;
-		}
-	}
-
-	return result;
-}
-
-ccl_device float approx_erfinvf(float z)
-{
-	float p, q, s;
-
-	if(z < 0) {
-	  p = -z;
-	  q = 1 - p;
-	  s = -1;
-	}
-	else {
-	  p = z;
-	  q = 1 - z;
-	  s = 1;
-	}
-
-	return s * approx_erfinvf_impl(p, q);
-}
-
-/* Beckmann and GGX microfacet importance sampling from:
- * 
- * Importance Sampling Microfacet-Based BSDFs using the Distribution of Visible Normals.
- * E. Heitz and E. d'Eon, EGSR 2014 */
+/* Beckmann and GGX microfacet importance sampling. */
 
 ccl_device_inline void microfacet_beckmann_sample_slopes(
 	KernelGlobals *kg,
@@ -194,64 +56,71 @@ ccl_device_inline void microfacet_beckmann_sample_slopes(
 	/* precomputations */
 	const float tan_theta_i = sin_theta_i/cos_theta_i;
 	const float inv_a = tan_theta_i;
-	const float a = 1.0f/inv_a;
-	const float erf_a = approx_erff(a);
-	const float exp_a2 = expf(-a*a);
+	const float cot_theta_i = 1.0f/tan_theta_i;
+	const float erf_a = fast_erff(cot_theta_i);
+	const float exp_a2 = expf(-cot_theta_i*cot_theta_i);
 	const float SQRT_PI_INV = 0.56418958354f;
 	const float Lambda = 0.5f*(erf_a - 1.0f) + (0.5f*SQRT_PI_INV)*(exp_a2*inv_a);
 	const float G1 = 1.0f/(1.0f + Lambda); /* masking */
 
 	*G1i = G1;
 
-#if 0
-	const float C = 1.0f - G1 * erf_a;
-
-	/* sample slope X */
-	if(randu < C) {
-		/* rescale randu */
-		randu = randu / C;
-		const float w_1 = 0.5f * SQRT_PI_INV * sin_theta_i * exp_a2;
-		const float w_2 = cos_theta_i * (0.5f - 0.5f*erf_a);
-		const float p = w_1 / (w_1 + w_2);
-
-		if(randu < p) {
-			randu = randu / p;
-			*slope_x = -sqrtf(-logf(randu*exp_a2));
-		}
-		else {
-			randu = (randu - p) / (1.0f - p);
-			*slope_x = approx_erfinvf(randu - 1.0f - randu*erf_a);
-		}
+#if defined(__KERNEL_GPU__)
+	/* Based on the paper by Wenzel Jakob:
+	 * An Improved Visible Normal Sampling Routine for the Beckmann Distribution
+	 *
+	 * http://www.mitsuba-renderer.org/~wenzel/files/visnormal.pdf
+	 *
+	 * Reformulation from OpenShadingLanguage which avoids using inverse
+	 * trigonometric functions.
+	 */
+
+	/* Sample slope X.
+	 *
+	 * Compute a coarse initial guess using the approximation:
+	 *   exp(-ierf(x)^2) ~= 1 - x * x
+	 *   solve y = 1 + b + K * (1 - b * b)
+	 */
+	float K = tan_theta_i * SQRT_PI_INV;
+	float y_approx = randu * (1.0f + erf_a + K * (1 - erf_a * erf_a));
+	float y_exact  = randu * (1.0f + erf_a + K * exp_a2);
+	float b = K > 0 ? (0.5f - sqrtf(K * (K - y_approx + 1.0f) + 0.25f)) / K : y_approx - 1.0f;
+
+	/* Perform Newton steps to refine toward the true root. */
+	float inv_erf = fast_ierff(b);
+	float value  = 1.0f + b + K * expf(-inv_erf * inv_erf) - y_exact;
+	/* Check if we are close enough already,
+	 * this also avoids NaNs as we get close to the root.
+	 */
+	if(fabsf(value) > 1e-6f) {
+		b -= value / (1.0f - inv_erf * tan_theta_i); /* newton step 1. */
+		inv_erf = fast_ierff(b);
+		value  = 1.0f + b + K * expf(-inv_erf * inv_erf) - y_exact;
+		b -= value / (1.0f - inv_erf * tan_theta_i); /* newton step 2. */
+		/* Compute the slope from the refined value. */
+		*slope_x = fast_ierff(b);
 	}
 	else {
-		/* rescale randu */
-		randu = (randu - C) / (1.0f - C);
-		*slope_x = approx_erfinvf((-1.0f + 2.0f*randu)*erf_a);
-
-		const float p = (-(*slope_x)*sin_theta_i + cos_theta_i) / (2.0f*cos_theta_i);
-
-		if(randv > p) {
-			*slope_x = -(*slope_x);
-			randv = (randv - p) / (1.0f - p);
-		}
-		else
-			randv = randv / p;
+		/* We are close enough already. */
+		*slope_x = inv_erf;
 	}
-
-	/* sample slope Y */
-	*slope_y = approx_erfinvf(2.0f*randv - 1.0f);
+	*slope_y = fast_ierff(2.0f*randv - 1.0f);
 #else
-	/* use precomputed table, because it better preserves stratification
-	 * of the random number pattern */
+	/* Use precomputed table on CPU, it gives better performance. */
 	int beckmann_table_offset = kernel_data.tables.beckmann_offset;
 
 	*slope_x = lookup_table_read_2D(kg, randu, cos_theta_i,
 		beckmann_table_offset, BECKMANN_TABLE_SIZE, BECKMANN_TABLE_SIZE);
-	*slope_y = approx_erfinvf(2.0f*randv - 1.0f);
+	*slope_y = fast_ierff(2.0f*randv - 1.0f);
 #endif
-
 }
 
+/* GGX microfacet importance sampling from:
+ *
+ * Importance Sampling Microfacet-Based BSDFs using the Distribution of Visible Normals.
+ * E. Heitz and E. d'Eon, EGSR 2014
+ */
+
 ccl_device_inline void microfacet_ggx_sample_slopes(
 	const float cos_theta_i, const float sin_theta_i,
 	float randu, float randv, float *slope_x, float *slope_y,
@@ -366,32 +235,32 @@ ccl_device_inline float3 microfacet_sample_stretched(
 
 ccl_device int bsdf_microfacet_ggx_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
+	sc->data0 = saturate(sc->data0); /* alpha_x */
 	sc->data1 = sc->data0; /* alpha_y */
 	
 	sc->type = CLOSURE_BSDF_MICROFACET_GGX_ID;
 
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device int bsdf_microfacet_ggx_aniso_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
-	sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* alpha_y */
+	sc->data0 = saturate(sc->data0); /* alpha_x */
+	sc->data1 = saturate(sc->data1); /* alpha_y */
 	
 	sc->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID;
 
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device int bsdf_microfacet_ggx_refraction_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
+	sc->data0 = saturate(sc->data0); /* alpha_x */
 	sc->data1 = sc->data0; /* alpha_y */
 
 	sc->type = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
 
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device void bsdf_microfacet_ggx_blur(ShaderClosure *sc, float roughness)
@@ -404,11 +273,11 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
 {
 	float alpha_x = sc->data0;
 	float alpha_y = sc->data1;
-	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
+	bool m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
 	float3 N = sc->N;
 
 	if(m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f)
-		return make_float3(0, 0, 0);
+		return make_float3(0.0f, 0.0f, 0.0f);
 
 	float cosNO = dot(N, I);
 	float cosNI = dot(N, omega_in);
@@ -487,7 +356,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
 		return make_float3(out, out, out);
 	}
 
-	return make_float3(0, 0, 0);
+	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
 ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
@@ -495,17 +364,17 @@ ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, con
 	float alpha_x = sc->data0;
 	float alpha_y = sc->data1;
 	float m_eta = sc->data2;
-	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
+	bool m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
 	float3 N = sc->N;
 
 	if(!m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f)
-		return make_float3(0, 0, 0);
+		return make_float3(0.0f, 0.0f, 0.0f);
 
 	float cosNO = dot(N, I);
 	float cosNI = dot(N, omega_in);
 
 	if(cosNO <= 0 || cosNI >= 0)
-		return make_float3(0, 0, 0); /* vectors on same side -- not possible */
+		return make_float3(0.0f, 0.0f, 0.0f); /* vectors on same side -- not possible */
 
 	/* compute half-vector of the refraction (eq. 16) */
 	float3 ht = -(m_eta * omega_in + I);
@@ -513,10 +382,6 @@ ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, con
 	float cosHO = dot(Ht, I);
 	float cosHI = dot(Ht, omega_in);
 
-	/* those situations makes chi+ terms in eq. 33, 34 be zero */
-	if(dot(Ht, N) <= 0.0f || cosHO * cosNO <= 0.0f || cosHI * cosNI <= 0.0f)
-		return make_float3(0.0f, 0.0f, 0.0f);
-
 	float D, G1o, G1i;
 
 	/* eq. 33: first we calculate D(m) with m=Ht: */
@@ -543,7 +408,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, con
 	 * pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2 */
 	float common = D * (m_eta * m_eta) / (cosNO * Ht2);
 	float out = G * fabsf(cosHI * cosHO) * common;
-	*pdf = G1o * cosHO * fabsf(cosHI) * common;
+	*pdf = G1o * fabsf(cosHO * cosHI) * common;
 
 	return make_float3(out, out, out);
 }
@@ -552,7 +417,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 {
 	float alpha_x = sc->data0;
 	float alpha_y = sc->data1;
-	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
+	bool m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
 	float3 N = sc->N;
 
 	float cosNO = dot(N, I);
@@ -657,16 +522,16 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 #ifdef __RAY_DIFFERENTIALS__
 			float3 dRdx, dRdy, dTdx, dTdy;
 #endif
-			float m_eta = sc->data2;
+			float m_eta = sc->data2, fresnel;
 			bool inside;
 
-			fresnel_dielectric(m_eta, m, I, &R, &T,
+			fresnel = fresnel_dielectric(m_eta, m, I, &R, &T,
 #ifdef __RAY_DIFFERENTIALS__
 				dIdx, dIdy, &dRdx, &dRdy, &dTdx, &dTdy,
 #endif
 				&inside);
 			
-			if(!inside) {
+			if(!inside && fresnel != 1.0f) {
 
 				*omega_in = T;
 #ifdef __RAY_DIFFERENTIALS__
@@ -719,29 +584,29 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 
 ccl_device int bsdf_microfacet_beckmann_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
+	sc->data0 = saturate(sc->data0); /* alpha_x */
 	sc->data1 = sc->data0; /* alpha_y */
 
 	sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ID;
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device int bsdf_microfacet_beckmann_aniso_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
-	sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* alpha_y */
+	sc->data0 = saturate(sc->data0); /* alpha_x */
+	sc->data1 = saturate(sc->data1); /* alpha_y */
 
 	sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID;
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device int bsdf_microfacet_beckmann_refraction_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
+	sc->data0 = saturate(sc->data0); /* alpha_x */
 	sc->data1 = sc->data0; /* alpha_y */
 
 	sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device void bsdf_microfacet_beckmann_blur(ShaderClosure *sc, float roughness)
@@ -754,11 +619,11 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_reflect(const ShaderClosure *sc,
 {
 	float alpha_x = sc->data0;
 	float alpha_y = sc->data1;
-	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
+	bool m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
 	float3 N = sc->N;
 
 	if(m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f)
-		return make_float3(0, 0, 0);
+		return make_float3(0.0f, 0.0f, 0.0f);
 
 	float cosNO = dot(N, I);
 	float cosNI = dot(N, omega_in);
@@ -840,7 +705,7 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_reflect(const ShaderClosure *sc,
 		return make_float3(out, out, out);
 	}
 
-	return make_float3(0, 0, 0);
+	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
 ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
@@ -848,17 +713,17 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc
 	float alpha_x = sc->data0;
 	float alpha_y = sc->data1;
 	float m_eta = sc->data2;
-	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
+	bool m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
 	float3 N = sc->N;
 
 	if(!m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f)
-		return make_float3(0, 0, 0);
+		return make_float3(0.0f, 0.0f, 0.0f);
 
 	float cosNO = dot(N, I);
 	float cosNI = dot(N, omega_in);
 
 	if(cosNO <= 0 || cosNI >= 0)
-		return make_float3(0, 0, 0);
+		return make_float3(0.0f, 0.0f, 0.0f);
 
 	/* compute half-vector of the refraction (eq. 16) */
 	float3 ht = -(m_eta * omega_in + I);
@@ -866,10 +731,6 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc
 	float cosHO = dot(Ht, I);
 	float cosHI = dot(Ht, omega_in);
 
-	/* those situations makes chi+ terms in eq. 25, 27 be zero */
-	if(dot(Ht, N) <= 0.0f || cosHO * cosNO <= 0.0f || cosHI * cosNI <= 0.0f)
-		return make_float3(0.0f, 0.0f, 0.0f);
-
 	/* eq. 25: first we calculate D(m) with m=Ht: */
 	float alpha2 = alpha_x * alpha_y;
 	float cosThetaM = min(dot(N, Ht), 1.0f);
@@ -895,7 +756,7 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc
 	 * pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2 */
 	float common = D * (m_eta * m_eta) / (cosNO * Ht2);
 	float out = G * fabsf(cosHI * cosHO) * common;
-	*pdf = G1o * cosHO * fabsf(cosHI) * common;
+	*pdf = G1o * fabsf(cosHO * cosHI) * common;
 
 	return make_float3(out, out, out);
 }
@@ -904,7 +765,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 {
 	float alpha_x = sc->data0;
 	float alpha_y = sc->data1;
-	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
+	bool m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
 	float3 N = sc->N;
 
 	float cosNO = dot(N, I);
@@ -1011,16 +872,16 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 #ifdef __RAY_DIFFERENTIALS__
 			float3 dRdx, dRdy, dTdx, dTdy;
 #endif
-			float m_eta = sc->data2;
+			float m_eta = sc->data2, fresnel;
 			bool inside;
 
-			fresnel_dielectric(m_eta, m, I, &R, &T,
+			fresnel = fresnel_dielectric(m_eta, m, I, &R, &T,
 #ifdef __RAY_DIFFERENTIALS__
 				dIdx, dIdy, &dRdx, &dRdy, &dTdx, &dTdy,
 #endif
 				&inside);
 
-			if(!inside) {
+			if(!inside && fresnel != 1.0f) {
 				*omega_in = T;
 
 #ifdef __RAY_DIFFERENTIALS__
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index 6f685d5eeea..61b7cb11b02 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __BSDF_OREN_NAYAR_H__
@@ -25,7 +25,7 @@ ccl_device float3 bsdf_oren_nayar_get_intensity(const ShaderClosure *sc, float3
 	float nv = max(dot(n, v), 0.0f);
 	float t = dot(l, v) - nl * nv;
 
-	if (t > 0.0f)
+	if(t > 0.0f)
 		t /= max(nl, nv) + FLT_MIN;
 	float is = nl * (sc->data0 + sc->data1 * t);
 	return make_float3(is, is, is);
@@ -37,23 +37,19 @@ ccl_device int bsdf_oren_nayar_setup(ShaderClosure *sc)
 
 	sc->type = CLOSURE_BSDF_OREN_NAYAR_ID;
 
-	sigma = clamp(sigma, 0.0f, 1.0f);
+	sigma = saturate(sigma);
 
 	float div = 1.0f / (M_PI_F + ((3.0f * M_PI_F - 4.0f) / 6.0f) * sigma);
 
 	sc->data0 = 1.0f * div;
 	sc->data1 = sigma * div;
 
-	return SD_BSDF | SD_BSDF_HAS_EVAL;
-}
-
-ccl_device void bsdf_oren_nayar_blur(ShaderClosure *sc, float roughness)
-{
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device float3 bsdf_oren_nayar_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	if (dot(sc->N, omega_in) > 0.0f) {
+	if(dot(sc->N, omega_in) > 0.0f) {
 		*pdf = 0.5f * M_1_PI_F;
 		return bsdf_oren_nayar_get_intensity(sc, sc->N, I, omega_in);
 	}
@@ -72,7 +68,7 @@ ccl_device int bsdf_oren_nayar_sample(const ShaderClosure *sc, float3 Ng, float3
 {
 	sample_uniform_hemisphere(sc->N, randu, randv, omega_in, pdf);
 
-	if (dot(Ng, *omega_in) > 0.0f) {
+	if(dot(Ng, *omega_in) > 0.0f) {
 		*eval = bsdf_oren_nayar_get_intensity(sc, sc->N, I, *omega_in);
 
 #ifdef __RAY_DIFFERENTIALS__
@@ -86,7 +82,7 @@ ccl_device int bsdf_oren_nayar_sample(const ShaderClosure *sc, float3 Ng, float3
 		*eval = make_float3(0.0f, 0.0f, 0.0f);
 	}
 
-	return LABEL_REFLECT | LABEL_DIFFUSE;
+	return LABEL_REFLECT|LABEL_DIFFUSE;
 }
 
 
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index 2b4e1c68640..1ab15eee954 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -41,9 +41,9 @@ ccl_device float3 bsdf_phong_ramp_get_color(const ShaderClosure *sc, const float
 	
 	float npos = pos * (float)(MAXCOLORS - 1);
 	int ipos = float_to_int(npos);
-	if (ipos < 0)
+	if(ipos < 0)
 		return colors[0];
-	if (ipos >= (MAXCOLORS - 1))
+	if(ipos >= (MAXCOLORS - 1))
 		return colors[MAXCOLORS - 1];
 	float offset = npos - (float)ipos;
 	return colors[ipos] * (1.0f - offset) + colors[ipos+1] * offset;
@@ -51,10 +51,10 @@ ccl_device float3 bsdf_phong_ramp_get_color(const ShaderClosure *sc, const float
 
 ccl_device int bsdf_phong_ramp_setup(ShaderClosure *sc)
 {
-	sc->data0 = max(sc->data0, 0.0f);
-	
 	sc->type = CLOSURE_BSDF_PHONG_RAMP_ID;
-	return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_GLOSSY;
+	sc->data0 = max(sc->data0, 0.0f);
+	sc->data1 = 0.0f;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device void bsdf_phong_ramp_blur(ShaderClosure *sc, float roughness)
@@ -67,11 +67,11 @@ ccl_device float3 bsdf_phong_ramp_eval_reflect(const ShaderClosure *sc, const fl
 	float cosNI = dot(sc->N, omega_in);
 	float cosNO = dot(sc->N, I);
 	
-	if (cosNI > 0 && cosNO > 0) {
+	if(cosNI > 0 && cosNO > 0) {
 		// reflect the view vector
 		float3 R = (2 * cosNO) * sc->N - I;
 		float cosRI = dot(R, omega_in);
-		if (cosRI > 0) {
+		if(cosRI > 0) {
 			float cosp = powf(cosRI, m_exponent);
 			float common = 0.5f * M_1_PI_F * cosp;
 			float out = cosNI * (m_exponent + 2) * common;
@@ -93,7 +93,7 @@ ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc, const float3 colo
 	float cosNO = dot(sc->N, I);
 	float m_exponent = sc->data0;
 	
-	if (cosNO > 0) {
+	if(cosNO > 0) {
 		// reflect the view vector
 		float3 R = (2 * cosNO) * sc->N - I;
 
@@ -111,12 +111,12 @@ ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc, const float3 colo
 		*omega_in = (cosf(phi) * sinTheta) * T +
 		            (sinf(phi) * sinTheta) * B +
 		            (            cosTheta) * R;
-		if (dot(Ng, *omega_in) > 0.0f)
+		if(dot(Ng, *omega_in) > 0.0f)
 		{
 			// common terms for pdf and eval
 			float cosNI = dot(sc->N, *omega_in);
 			// make sure the direction we chose is still in the right hemisphere
-			if (cosNI > 0)
+			if(cosNI > 0)
 			{
 				float cosp = powf(cosTheta, m_exponent);
 				float common = 0.5f * M_1_PI_F * cosp;
diff --git a/intern/cycles/kernel/closure/bsdf_reflection.h b/intern/cycles/kernel/closure/bsdf_reflection.h
index 0baccdf155c..303f4c9ce34 100644
--- a/intern/cycles/kernel/closure/bsdf_reflection.h
+++ b/intern/cycles/kernel/closure/bsdf_reflection.h
@@ -43,10 +43,6 @@ ccl_device int bsdf_reflection_setup(ShaderClosure *sc)
 	return SD_BSDF;
 }
 
-ccl_device void bsdf_reflection_blur(ShaderClosure *sc, float roughness)
-{
-}
-
 ccl_device float3 bsdf_reflection_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
@@ -70,8 +66,9 @@ ccl_device int bsdf_reflection_sample(const ShaderClosure *sc, float3 Ng, float3
 			*domega_in_dx = 2 * dot(N, dIdx) * N - dIdx;
 			*domega_in_dy = 2 * dot(N, dIdy) * N - dIdy;
 #endif
-			*pdf = 1;
-			*eval = make_float3(1, 1, 1);
+			/* Some high number for MIS. */
+			*pdf = 1e6f;
+			*eval = make_float3(1e6f, 1e6f, 1e6f);
 		}
 	}
 	return LABEL_REFLECT|LABEL_SINGULAR;
diff --git a/intern/cycles/kernel/closure/bsdf_refraction.h b/intern/cycles/kernel/closure/bsdf_refraction.h
index c4698b42060..c78a4b67134 100644
--- a/intern/cycles/kernel/closure/bsdf_refraction.h
+++ b/intern/cycles/kernel/closure/bsdf_refraction.h
@@ -43,10 +43,6 @@ ccl_device int bsdf_refraction_setup(ShaderClosure *sc)
 	return SD_BSDF;
 }
 
-ccl_device void bsdf_refraction_blur(ShaderClosure *sc, float roughness)
-{
-}
-
 ccl_device float3 bsdf_refraction_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
@@ -67,15 +63,17 @@ ccl_device int bsdf_refraction_sample(const ShaderClosure *sc, float3 Ng, float3
 	float3 dRdx, dRdy, dTdx, dTdy;
 #endif
 	bool inside;
-	fresnel_dielectric(m_eta, N, I, &R, &T,
+	float fresnel;
+	fresnel = fresnel_dielectric(m_eta, N, I, &R, &T,
 #ifdef __RAY_DIFFERENTIALS__
 		dIdx, dIdy, &dRdx, &dRdy, &dTdx, &dTdy,
 #endif
 		&inside);
-	
-	if(!inside) {
-		*pdf = 1.0f;
-		*eval = make_float3(1.0f, 1.0f, 1.0f);
+
+	if(!inside && fresnel != 1.0f) {
+		/* Some high number for MIS. */
+		*pdf = 1e6f;
+		*eval = make_float3(1e6f, 1e6f, 1e6f);
 		*omega_in = T;
 #ifdef __RAY_DIFFERENTIALS__
 		*domega_in_dx = dTdx;
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index 797fa4227ae..e5b6ab93a64 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -40,16 +40,12 @@ CCL_NAMESPACE_BEGIN
 ccl_device int bsdf_diffuse_toon_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_DIFFUSE_TOON_ID;
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f);
-	sc->data1 = clamp(sc->data1, 0.0f, 1.0f);
+	sc->data0 = saturate(sc->data0);
+	sc->data1 = saturate(sc->data1);
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device void bsdf_diffuse_toon_blur(ShaderClosure *sc, float roughness)
-{
-}
-
 ccl_device float3 bsdf_toon_get_intensity(float max_angle, float smooth, float angle)
 {
 	float is;
@@ -124,16 +120,12 @@ ccl_device int bsdf_diffuse_toon_sample(const ShaderClosure *sc, float3 Ng, floa
 ccl_device int bsdf_glossy_toon_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_GLOSSY_TOON_ID;
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f);
-	sc->data1 = clamp(sc->data1, 0.0f, 1.0f);
+	sc->data0 = saturate(sc->data0);
+	sc->data1 = saturate(sc->data1);
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device void bsdf_glossy_toon_blur(ShaderClosure *sc, float roughness)
-{
-}
-
 ccl_device float3 bsdf_glossy_toon_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	float max_angle = sc->data0*M_PI_2_F;
diff --git a/intern/cycles/kernel/closure/bsdf_transparent.h b/intern/cycles/kernel/closure/bsdf_transparent.h
index 73601d20c3a..3c2fd8004df 100644
--- a/intern/cycles/kernel/closure/bsdf_transparent.h
+++ b/intern/cycles/kernel/closure/bsdf_transparent.h
@@ -41,10 +41,6 @@ ccl_device int bsdf_transparent_setup(ShaderClosure *sc)
 	return SD_BSDF|SD_TRANSPARENT;
 }
 
-ccl_device void bsdf_transparent_blur(ShaderClosure *sc, float roughness)
-{
-}
-
 ccl_device float3 bsdf_transparent_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/closure/bsdf_westin.h b/intern/cycles/kernel/closure/bsdf_westin.h
deleted file mode 100644
index 9dc1c00bb3d..00000000000
--- a/intern/cycles/kernel/closure/bsdf_westin.h
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Adapted from Open Shading Language with this license:
- *
- * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
- * All Rights Reserved.
- *
- * Modifications Copyright 2011, Blender Foundation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- * * Redistributions of source code must retain the above copyright
- *   notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- *   notice, this list of conditions and the following disclaimer in the
- *   documentation and/or other materials provided with the distribution.
- * * Neither the name of Sony Pictures Imageworks nor the names of its
- *   contributors may be used to endorse or promote products derived from
- *   this software without specific prior written permission.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __BSDF_WESTIN_H__
-#define __BSDF_WESTIN_H__
-
-CCL_NAMESPACE_BEGIN
-
-/* WESTIN BACKSCATTER */
-
-ccl_device int bsdf_westin_backscatter_setup(ShaderClosure *sc)
-{
-	float roughness = sc->data0;
-	roughness = clamp(roughness, 1e-5f, 1.0f);
-	float m_invroughness = 1.0f/roughness;
-
-	sc->type = CLOSURE_BSDF_WESTIN_BACKSCATTER_ID;
-	sc->data0 = m_invroughness;
-
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
-}
-
-ccl_device void bsdf_westin_backscatter_blur(ShaderClosure *sc, float roughness)
-{
-	float m_invroughness = sc->data0;
-	m_invroughness = min(1.0f/roughness, m_invroughness);
-	sc->data0 = m_invroughness;
-}
-
-ccl_device float3 bsdf_westin_backscatter_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
-	float m_invroughness = sc->data0;
-	float3 N = sc->N;
-
-	// pdf is implicitly 0 (no indirect sampling)
-	float cosNO = dot(N, I);
-	float cosNI = dot(N, omega_in);
-	if(cosNO > 0 && cosNI > 0) {
-		float cosine = dot(I, omega_in);
-		*pdf = cosine > 0 ? (m_invroughness + 1) * powf(cosine, m_invroughness) : 0;
-		*pdf *= 0.5f * M_1_PI_F;
-		return make_float3 (*pdf, *pdf, *pdf);
-	}
-	return make_float3 (0, 0, 0);
-}
-
-ccl_device float3 bsdf_westin_backscatter_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
-	return make_float3(0.0f, 0.0f, 0.0f);
-}
-
-ccl_device int bsdf_westin_backscatter_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
-{
-	float m_invroughness = sc->data0;
-	float3 N = sc->N;
-
-	float cosNO = dot(N, I);
-	if(cosNO > 0) {
-#ifdef __RAY_DIFFERENTIALS__
-		*domega_in_dx = dIdx;
-		*domega_in_dy = dIdy;
-#endif
-		float3 T, B;
-		make_orthonormals (I, &T, &B);
-		float phi = M_2PI_F * randu;
-		float cosTheta = powf(randv, 1 / (m_invroughness + 1));
-		float sinTheta2 = 1 - cosTheta * cosTheta;
-		float sinTheta = sinTheta2 > 0 ? sqrtf(sinTheta2) : 0;
-		*omega_in = (cosf(phi) * sinTheta) * T +
-		            (sinf(phi) * sinTheta) * B +
-		            (cosTheta) * I;
-		if(dot(Ng, *omega_in) > 0) {
-			// common terms for pdf and eval
-			float cosNI = dot(N, *omega_in);
-			// make sure the direction we chose is still in the right hemisphere
-			if(cosNI > 0)
-			{
-				*pdf = 0.5f * M_1_PI_F * powf(cosTheta, m_invroughness);
-				*pdf = (m_invroughness + 1) * (*pdf);
-				*eval = make_float3(*pdf, *pdf, *pdf);
-			}
-		}
-	}
-	return LABEL_REFLECT|LABEL_GLOSSY;
-}
-
-/* WESTIN SHEEN */
-
-ccl_device int bsdf_westin_sheen_setup(ShaderClosure *sc)
-{
-	/* float edginess = sc->data0; */
-	sc->type = CLOSURE_BSDF_WESTIN_SHEEN_ID;
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
-}
-
-ccl_device void bsdf_westin_sheen_blur(ShaderClosure *sc, float roughness)
-{
-}
-
-ccl_device float3 bsdf_westin_sheen_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
-	float m_edginess = sc->data0;
-	float3 N = sc->N;
-
-	// pdf is implicitly 0 (no indirect sampling)
-	float cosNO = dot(N, I);
-	float cosNI = dot(N, omega_in);
-	if(cosNO > 0 && cosNI > 0) {
-		float sinNO2 = 1 - cosNO * cosNO;
-		*pdf = cosNI * M_1_PI_F;
-		float westin = sinNO2 > 0 ? powf(sinNO2, 0.5f * m_edginess) * (*pdf) : 0;
-		return make_float3 (westin, westin, westin);
-	}
-	return make_float3 (0, 0, 0);
-}
-
-ccl_device float3 bsdf_westin_sheen_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
-	return make_float3(0.0f, 0.0f, 0.0f);
-}
-
-ccl_device int bsdf_westin_sheen_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
-{
-	float m_edginess = sc->data0;
-	float3 N = sc->N;
-
-	// we are viewing the surface from the right side - send a ray out with cosine
-	// distribution over the hemisphere
-	sample_cos_hemisphere(N, randu, randv, omega_in, pdf);
-	if(dot(Ng, *omega_in) > 0) {
-		// TODO: account for sheen when sampling
-		float cosNO = dot(N, I);
-		float sinNO2 = 1 - cosNO * cosNO;
-		float westin = sinNO2 > 0 ? powf(sinNO2, 0.5f * m_edginess) * (*pdf) : 0;
-		*eval = make_float3(westin, westin, westin);
-#ifdef __RAY_DIFFERENTIALS__
-		// TODO: find a better approximation for the diffuse bounce
-		*domega_in_dx = (2 * dot(N, dIdx)) * N - dIdx;
-		*domega_in_dy = (2 * dot(N, dIdy)) * N - dIdy;
-#endif
-	}
-	else {
-		pdf = 0;
-	}
-	return LABEL_REFLECT|LABEL_DIFFUSE;
-}
-
-CCL_NAMESPACE_END
-
-#endif /* __BSDF_WESTIN_H__ */
-
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index 3849dedc3b6..f817dcd5f2d 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __KERNEL_BSSRDF_H__
@@ -30,8 +30,8 @@ ccl_device int bssrdf_setup(ShaderClosure *sc, ClosureType type)
 		return flag;
 	}
 	else {
-		sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* texture blur */
-		sc->T.x = clamp(sc->T.x, 0.0f, 1.0f); /* sharpness */
+		sc->data1 = saturate(sc->data1); /* texture blur */
+		sc->T.x = saturate(sc->T.x); /* sharpness */
 		sc->type = type;
 
 		return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF;
@@ -157,7 +157,7 @@ ccl_device float bssrdf_cubic_quintic_root_find(float xi)
 	float x = 0.25f;
 	int i;
 
-	for (i = 0; i < max_iteration_count; i++) {
+	for(i = 0; i < max_iteration_count; i++) {
 		float x2 = x*x;
 		float x3 = x2*x;
 		float nx = (1.0f - x);
@@ -168,7 +168,7 @@ ccl_device float bssrdf_cubic_quintic_root_find(float xi)
 		if(fabsf(f) < tolerance || f_ == 0.0f)
 			break;
 
-		x = clamp(x - f/f_, 0.0f, 1.0f);
+		x = saturate(x - f/f_);
 	}
 
 	return x;
diff --git a/intern/cycles/kernel/closure/volume.h b/intern/cycles/kernel/closure/volume.h
index 058c4b8408f..4d71ba50ec3 100644
--- a/intern/cycles/kernel/closure/volume.h
+++ b/intern/cycles/kernel/closure/volume.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __VOLUME_H__
@@ -26,9 +26,6 @@ CCL_NAMESPACE_BEGIN
  * uniform sphere. g=0 uniform diffuse-like, g=1 close to sharp single ray. */
 ccl_device float single_peaked_henyey_greenstein(float cos_theta, float g)
 {
-	if(fabsf(g) < 1e-3f)
-		return M_1_PI_F * 0.25f;
-	
 	return ((1.0f - g * g) / safe_powf(1.0f + g * g - 2.0f * g * cos_theta, 1.5f)) * (M_1_PI_F * 0.25f);
 };
 
@@ -39,7 +36,7 @@ ccl_device int volume_henyey_greenstein_setup(ShaderClosure *sc)
 	/* clamp anisotropy to avoid delta function */
 	sc->data0 = signf(sc->data0) * min(fabsf(sc->data0), 1.0f - 1e-3f);
 
-	return SD_SCATTER|SD_PHASE_HAS_EVAL;
+	return SD_SCATTER;
 }
 
 ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderClosure *sc, const float3 I, float3 omega_in, float *pdf)
@@ -47,9 +44,13 @@ ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderClosure *sc, c
 	float g = sc->data0;
 
 	/* note that I points towards the viewer */
-	float cos_theta = dot(-I, omega_in);
-
-	*pdf = single_peaked_henyey_greenstein(cos_theta, g);
+	if(fabsf(g) < 1e-3f) {
+		*pdf = M_1_PI_F * 0.25f;
+	}
+	else {
+		float cos_theta = dot(-I, omega_in);
+		*pdf = single_peaked_henyey_greenstein(cos_theta, g);
+	}
 
 	return make_float3(*pdf, *pdf, *pdf);
 }
@@ -63,10 +64,12 @@ ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc, float3 I
 	/* match pdf for small g */
 	if(fabsf(g) < 1e-3f) {
 		cos_theta = (1.0f - 2.0f * randu);
+		*pdf = M_1_PI_F * 0.25f;
 	}
 	else {
 		float k = (1.0f - g * g) / (1.0f - g + 2.0f * g * randu);
 		cos_theta = (1.0f + g * g - k * k) / (2.0f * g);
+		*pdf = single_peaked_henyey_greenstein(cos_theta, g);
 	}
 
 	float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta);
@@ -80,7 +83,6 @@ ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc, float3 I
 	make_orthonormals(-I, &T, &B);
 	*omega_in = sin_theta * cos_phi * T + sin_theta * sin_phi * B + cos_theta * (-I);
 
-	*pdf = single_peaked_henyey_greenstein(cos_theta, g);
 	*eval = make_float3(*pdf, *pdf, *pdf); /* perfect importance sampling */
 
 #ifdef __RAY_DIFFERENTIALS__
@@ -105,18 +107,9 @@ ccl_device int volume_absorption_setup(ShaderClosure *sc)
 
 ccl_device float3 volume_phase_eval(const ShaderData *sd, const ShaderClosure *sc, float3 omega_in, float *pdf)
 {
-	float3 eval;
-
-	switch(sc->type) {
-		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-			eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
-			break;
-		default:
-			eval = make_float3(0.0f, 0.0f, 0.0f);
-			break;
-	}
+	kernel_assert(sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID);
 
-	return eval;
+	return volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
 }
 
 ccl_device int volume_phase_sample(const ShaderData *sd, const ShaderClosure *sc, float randu,
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index 9495a2541f9..5ab900d47aa 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -20,7 +20,11 @@
 
 /* 64 object BVH + 64 mesh BVH + 64 object node splitting */
 #define BVH_STACK_SIZE 192
+#define BVH_QSTACK_SIZE 384
 #define BVH_NODE_SIZE 4
+#define BVH_NODE_LEAF_SIZE 1
+#define BVH_QNODE_SIZE 7
+#define BVH_QNODE_LEAF_SIZE 1
 #define TRI_NODE_SIZE 3
 
 /* silly workaround for float extended precision that happens when compiling
@@ -35,6 +39,7 @@
 #include "geom_attribute.h"
 #include "geom_object.h"
 #include "geom_triangle.h"
+#include "geom_triangle_intersect.h"
 #include "geom_motion_triangle.h"
 #include "geom_motion_curve.h"
 #include "geom_curve.h"
diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h
index 63ce31c492f..c7364e9edac 100644
--- a/intern/cycles/kernel/geom/geom_attribute.h
+++ b/intern/cycles/kernel/geom/geom_attribute.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -29,24 +29,27 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeElement *elem)
 {
-	if(sd->object == PRIM_NONE)
+	if(ccl_fetch(sd, object) == PRIM_NONE)
 		return (int)ATTR_STD_NOT_FOUND;
 
 	/* for SVM, find attribute by unique id */
-	uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
+	uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride;
 #ifdef __HAIR__
-	attr_offset = (sd->type & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
+	attr_offset = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
 #endif
 	uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 	
 	while(attr_map.x != id) {
+		if(UNLIKELY(attr_map.x == ATTR_STD_NONE)) {
+			return ATTR_STD_NOT_FOUND;
+		}
 		attr_offset += ATTR_PRIM_TYPES;
 		attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 	}
 
 	*elem = (AttributeElement)attr_map.y;
 	
-	if(sd->prim == PRIM_NONE && (AttributeElement)attr_map.y != ATTR_ELEMENT_MESH)
+	if(ccl_fetch(sd, prim) == PRIM_NONE && (AttributeElement)attr_map.y != ATTR_ELEMENT_MESH)
 		return ATTR_STD_NOT_FOUND;
 
 	/* return result */
diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/geom/geom_bvh.h
index dd6abe32fec..3d0d406dd0b 100644
--- a/intern/cycles/kernel/geom/geom_bvh.h
+++ b/intern/cycles/kernel/geom/geom_bvh.h
@@ -28,6 +28,13 @@
 
 CCL_NAMESPACE_BEGIN
 
+/* Don't inline intersect functions on GPU, this is faster */
+#ifdef __KERNEL_GPU__
+#define ccl_device_intersect ccl_device_noinline
+#else
+#define ccl_device_intersect ccl_device_inline
+#endif
+
 /* BVH intersection function variations */
 
 #define BVH_INSTANCING			1
@@ -35,6 +42,19 @@ CCL_NAMESPACE_BEGIN
 #define BVH_HAIR				4
 #define BVH_HAIR_MINIMUM_WIDTH	8
 
+#define BVH_NAME_JOIN(x,y) x ## _ ## y
+#define BVH_NAME_EVAL(x,y) BVH_NAME_JOIN(x,y)
+#define BVH_FUNCTION_FULL_NAME(prefix) BVH_NAME_EVAL(prefix, BVH_FUNCTION_NAME)
+
+#define BVH_FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
+
+/* Common QBVH functions. */
+#ifdef __QBVH__
+#include "geom_qbvh.h"
+#endif
+
+/* Regular BVH traversal */
+
 #define BVH_FUNCTION_NAME bvh_intersect
 #define BVH_FUNCTION_FEATURES 0
 #include "geom_bvh_traversal.h"
@@ -63,6 +83,8 @@ CCL_NAMESPACE_BEGIN
 #include "geom_bvh_traversal.h"
 #endif
 
+/* Subsurface scattering BVH traversal */
+
 #if defined(__SUBSURFACE__)
 #define BVH_FUNCTION_NAME bvh_intersect_subsurface
 #define BVH_FUNCTION_FEATURES 0
@@ -93,43 +115,108 @@ CCL_NAMESPACE_BEGIN
 #include "geom_bvh_subsurface.h"
 #endif
 
+/* Volume BVH traversal */
+
+#if defined(__VOLUME__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume
+#define BVH_FUNCTION_FEATURES 0
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__INSTANCING__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__HAIR__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_hair
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_hair_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
+#include "geom_bvh_volume.h"
+#endif
+
+/* Record all intersections - Shadow BVH traversal */
+
 #if defined(__SHADOW_RECORD_ALL__)
 #define BVH_FUNCTION_NAME bvh_intersect_shadow_all
 #define BVH_FUNCTION_FEATURES 0
 #include "geom_bvh_shadow.h"
 #endif
 
-#if defined(__SUBSURFACE__) && defined(__INSTANCING__)
+#if defined(__SHADOW_RECORD_ALL__) && defined(__INSTANCING__)
 #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing
 #define BVH_FUNCTION_FEATURES BVH_INSTANCING
 #include "geom_bvh_shadow.h"
 #endif
 
-#if defined(__SUBSURFACE__) && defined(__HAIR__)
+#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__)
 #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
 #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
 #include "geom_bvh_shadow.h"
 #endif
 
-#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__)
+#if defined(__SHADOW_RECORD_ALL__) && defined(__OBJECT_MOTION__)
 #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
 #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
 #include "geom_bvh_shadow.h"
 #endif
 
-#if defined(__SUBSURFACE__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
 #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
 #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
 #include "geom_bvh_shadow.h"
 #endif
 
-/* to work around titan bug when using arrays instead of textures */
-#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
-ccl_device_inline
-#else
-ccl_device_noinline
+/* Record all intersections - Volume BVH traversal */
+
+#if defined(__VOLUME_RECORD_ALL__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_all
+#define BVH_FUNCTION_FEATURES 0
+#include "geom_bvh_volume_all.h"
+#endif
+
+#if defined(__VOLUME_RECORD_ALL__) && defined(__INSTANCING__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING
+#include "geom_bvh_volume_all.h"
+#endif
+
+#if defined(__VOLUME_RECORD_ALL__) && defined(__HAIR__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_all_hair
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
+#include "geom_bvh_volume_all.h"
 #endif
-bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect,
+
+#if defined(__VOLUME_RECORD_ALL__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+#include "geom_bvh_volume_all.h"
+#endif
+
+#if defined(__VOLUME_RECORD_ALL__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_all_hair_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
+#include "geom_bvh_volume_all.h"
+#endif
+
+#undef BVH_FEATURE
+#undef BVH_NAME_JOIN
+#undef BVH_NAME_EVAL
+#undef BVH_FUNCTION_FULL_NAME
+
+ccl_device_intersect bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect,
 					 uint *lcg_state, float difl, float extmax)
 {
 #ifdef __OBJECT_MOTION__
@@ -167,14 +254,8 @@ bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, I
 #endif /* __KERNEL_CPU__ */
 }
 
-/* to work around titan bug when using arrays instead of textures */
 #ifdef __SUBSURFACE__
-#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
-uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
+ccl_device_intersect uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
 {
 #ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
@@ -212,14 +293,8 @@ uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection
 }
 #endif
 
-/* to work around titan bug when using arrays instead of textures */
 #ifdef __SHADOW_RECORD_ALL__
-#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
-uint scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits)
+ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits)
 {
 #ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
@@ -237,26 +312,87 @@ uint scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection
 		return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits);
 #endif /* __HAIR__ */
 
-#ifdef __KERNEL_CPU__
-
 #ifdef __INSTANCING__
 	if(kernel_data.bvh.have_instancing)
 		return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
 #endif /* __INSTANCING__ */
 
 	return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
+}
+#endif
+
+#ifdef __VOLUME__
+ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
+                            const Ray *ray,
+                            Intersection *isect)
+{
+#ifdef __OBJECT_MOTION__
+	if(kernel_data.bvh.have_motion) {
+#ifdef __HAIR__
+		if(kernel_data.bvh.have_curves)
+			return bvh_intersect_volume_hair_motion(kg, ray, isect);
+#endif /* __HAIR__ */
+
+		return bvh_intersect_volume_motion(kg, ray, isect);
+	}
+#endif /* __OBJECT_MOTION__ */
+
+#ifdef __HAIR__
+	if(kernel_data.bvh.have_curves)
+		return bvh_intersect_volume_hair(kg, ray, isect);
+#endif /* __HAIR__ */
+
+#ifdef __KERNEL_CPU__
+
+#ifdef __INSTANCING__
+	if(kernel_data.bvh.have_instancing)
+		return bvh_intersect_volume_instancing(kg, ray, isect);
+#endif /* __INSTANCING__ */
+
+	return bvh_intersect_volume(kg, ray, isect);
 #else /* __KERNEL_CPU__ */
 
 #ifdef __INSTANCING__
-	return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
+	return bvh_intersect_volume_instancing(kg, ray, isect);
 #else
-	return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
+	return bvh_intersect_volume(kg, ray, isect);
 #endif /* __INSTANCING__ */
 
 #endif /* __KERNEL_CPU__ */
 }
 #endif
 
+#ifdef __VOLUME_RECORD_ALL__
+ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg,
+                                                     const Ray *ray,
+                                                     Intersection *isect,
+                                                     const uint max_hits)
+{
+#ifdef __OBJECT_MOTION__
+	if(kernel_data.bvh.have_motion) {
+#ifdef __HAIR__
+		if(kernel_data.bvh.have_curves)
+			return bvh_intersect_volume_all_hair_motion(kg, ray, isect, max_hits);
+#endif /* __HAIR__ */
+
+		return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits);
+	}
+#endif /* __OBJECT_MOTION__ */
+
+#ifdef __HAIR__
+	if(kernel_data.bvh.have_curves)
+		return bvh_intersect_volume_all_hair(kg, ray, isect, max_hits);
+#endif /* __HAIR__ */
+
+#ifdef __INSTANCING__
+	if(kernel_data.bvh.have_instancing)
+		return bvh_intersect_volume_all_instancing(kg, ray, isect, max_hits);
+#endif /* __INSTANCING__ */
+
+	return bvh_intersect_volume_all(kg, ray, isect, max_hits);
+}
+#endif /* __VOLUME_RECORD_ALL__ */
+
 
 /* Ray offset to avoid self intersection.
  *
@@ -311,5 +447,21 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
 #endif
 }
 
+#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__)
+/* Three-way compare of two Intersection records by their t member; returns -1, 1 or 0. ToDo: Move to another file? */
+ccl_device int intersections_compare(const void *a, const void *b)
+{
+	const Intersection *isect_a = (const Intersection*)a;
+	const Intersection *isect_b = (const Intersection*)b;
+
+	if(isect_a->t < isect_b->t)
+		return -1;
+	else if(isect_a->t > isect_b->t)
+		return 1;
+	else
+		return 0;
+}
+#endif /* __SHADOW_RECORD_ALL__ || __VOLUME_RECORD_ALL__ */
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/geom/geom_bvh_shadow.h
index aee4097d77e..e4cba99dc96 100644
--- a/intern/cycles/kernel/geom/geom_bvh_shadow.h
+++ b/intern/cycles/kernel/geom/geom_bvh_shadow.h
@@ -17,6 +17,10 @@
  * limitations under the License.
  */
 
+#ifdef __QBVH__
+#include "geom_qbvh_shadow.h"
+#endif
+
 /* This is a template BVH traversal function, where various features can be
  * enabled/disabled. This way we can compile optimized versions for each case
  * without new features slowing things down.
@@ -27,10 +31,11 @@
  *
  */
 
-#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
-
-ccl_device bool BVH_FUNCTION_NAME
-(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, const uint max_hits, uint *num_hits)
+ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                            const Ray *ray,
+                                            Intersection *isect_array,
+                                            const uint max_hits,
+                                            uint *num_hits)
 {
 	/* todo:
 	 * - likely and unlikely for if() statements
@@ -53,11 +58,11 @@ ccl_device bool BVH_FUNCTION_NAME
 	int object = OBJECT_NONE;
 	float isect_t = tmax;
 
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 	Transform ob_tfm;
 #endif
 
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 	int num_hits_in_instance = 0;
 #endif
 
@@ -81,6 +86,9 @@ ccl_device bool BVH_FUNCTION_NAME
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
 	/* traversal loop */
 	do {
 		do {
@@ -174,6 +182,7 @@ ccl_device bool BVH_FUNCTION_NAME
 					}
 
 					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
 					traversalStack[stackPtr] = nodeAddrChild1;
 				}
 				else {
@@ -191,13 +200,15 @@ ccl_device bool BVH_FUNCTION_NAME
 
 			/* if node is leaf, fetch triangle list */
 			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1));
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE);
 				int primAddr = __float_as_int(leaf.x);
 
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 				if(primAddr >= 0) {
 #endif
-					int primAddr2 = __float_as_int(leaf.y);
+					const int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+					const uint p_type = type & PRIMITIVE_ALL;
 
 					/* pop */
 					nodeAddr = traversalStack[stackPtr];
@@ -205,25 +216,26 @@ ccl_device bool BVH_FUNCTION_NAME
 
 					/* primitive intersection */
 					while(primAddr < primAddr2) {
+						kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+
 						bool hit;
-						uint type = kernel_tex_fetch(__prim_type, primAddr);
 
 						/* todo: specialized intersect functions which don't fill in
 						 * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
 						 * might give a few % performance improvement */
 
-						switch(type & PRIMITIVE_ALL) {
+						switch(p_type) {
 							case PRIMITIVE_TRIANGLE: {
-								hit = triangle_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr);
+								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, PATH_RAY_SHADOW, object, primAddr);
 								break;
 							}
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 							case PRIMITIVE_MOTION_TRIANGLE: {
 								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, primAddr);
 								break;
 							}
 #endif
-#if FEATURE(BVH_HAIR)
+#if BVH_FEATURE(BVH_HAIR)
 							case PRIMITIVE_CURVE:
 							case PRIMITIVE_MOTION_CURVE: {
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
@@ -252,7 +264,7 @@ ccl_device bool BVH_FUNCTION_NAME
 							if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
 #endif
 							{
-								shader =  kernel_tex_fetch(__tri_shader, prim);
+								shader = kernel_tex_fetch(__tri_shader, prim);
 							}
 #ifdef __HAIR__
 							else {
@@ -274,7 +286,7 @@ ccl_device bool BVH_FUNCTION_NAME
 							/* move on to next entry in intersections array */
 							isect_array++;
 							(*num_hits)++;
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 							num_hits_in_instance++;
 #endif
 
@@ -284,52 +296,55 @@ ccl_device bool BVH_FUNCTION_NAME
 						primAddr++;
 					}
 				}
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 				else {
 					/* instance push */
 					object = kernel_tex_fetch(__prim_object, -primAddr-1);
 
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
 #else
 					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
 #endif
 
+					triangle_intersect_precalc(dir, &isect_precalc);
 					num_hits_in_instance = 0;
+					isect_array->t = isect_t;
 
 #if defined(__KERNEL_SSE2__)
 					Psplat[0] = ssef(P.x);
 					Psplat[1] = ssef(P.y);
 					Psplat[2] = ssef(P.z);
 
-					isect_array->t = isect_t;
 					tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
-
 					gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
 					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
 					traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
 
 					nodeAddr = kernel_tex_fetch(__object_node, object);
 				}
 			}
-#endif
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
 		} while(nodeAddr != ENTRYPOINT_SENTINEL);
 
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 		if(stackPtr >= 0) {
 			kernel_assert(object != OBJECT_NONE);
 
 			if(num_hits_in_instance) {
 				float t_fac;
 
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_tfm);
 #else
 				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
 #endif
 
+				triangle_intersect_precalc(dir, &isect_precalc);
+
 				/* scale isect->t to adjust for instancing */
 				for(int i = 0; i < num_hits_in_instance; i++)
 					(isect_array-i-1)->t *= t_fac;
@@ -337,22 +352,23 @@ ccl_device bool BVH_FUNCTION_NAME
 			else {
 				float ignore_t = FLT_MAX;
 
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_tfm);
 #else
 				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
 #endif
+				triangle_intersect_precalc(dir, &isect_precalc);
 			}
 
+			isect_t = tmax;
+			isect_array->t = isect_t;
+
 #if defined(__KERNEL_SSE2__)
 			Psplat[0] = ssef(P.x);
 			Psplat[1] = ssef(P.y);
 			Psplat[2] = ssef(P.z);
 
-			isect_t = tmax;
-			isect_array->t = isect_t;
 			tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
-
 			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
@@ -360,13 +376,37 @@ ccl_device bool BVH_FUNCTION_NAME
 			nodeAddr = traversalStack[stackPtr];
 			--stackPtr;
 		}
-#endif
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
 	} while(nodeAddr != ENTRYPOINT_SENTINEL);
 
 	return false;
 }
 
-#undef FEATURE
+ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+                                         const Ray *ray,
+                                         Intersection *isect_array,
+                                         const uint max_hits,
+                                         uint *num_hits)
+{
+#ifdef __QBVH__
+	if(kernel_data.bvh.use_qbvh) {
+		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+		                                    ray,
+		                                    isect_array,
+		                                    max_hits,
+		                                    num_hits);
+	}
+	else
+#endif  /* __QBVH__ */
+	{
+		kernel_assert(kernel_data.bvh.use_qbvh == false);
+		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+		                                   ray,
+		                                   isect_array,
+		                                   max_hits,
+		                                   num_hits);
+	}
+}
+
 #undef BVH_FUNCTION_NAME
 #undef BVH_FUNCTION_FEATURES
-
diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
index a8f57cffa78..a73139f9c88 100644
--- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h
+++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
@@ -17,6 +17,10 @@
  * limitations under the License.
  */
 
+#ifdef __QBVH__
+#include "geom_qbvh_subsurface.h"
+#endif
+
 /* This is a template BVH traversal function for subsurface scattering, where
  * various features can be enabled/disabled. This way we can compile optimized
  * versions for each case without new features slowing things down.
@@ -26,10 +30,12 @@
  *
  */
 
-#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
-
-ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersection *isect_array,
-	int subsurface_object, uint *lcg_state, int max_hits)
+ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                            const Ray *ray,
+                                            Intersection *isect_array,
+                                            int subsurface_object,
+                                            uint *lcg_state,
+                                            int max_hits)
 {
 	/* todo:
 	 * - test if pushing distance on the stack helps (for non shadow rays)
@@ -54,10 +60,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 	int object = OBJECT_NONE;
 	float isect_t = ray->t;
 
-	const uint visibility = PATH_RAY_ALL_VISIBILITY;
 	uint num_hits = 0;
 
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 	Transform ob_tfm;
 #endif
 
@@ -78,6 +83,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
 	/* traversal loop */
 	do {
 		do
@@ -118,14 +126,8 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 				NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
 
 				/* decide which nodes to traverse next */
-#ifdef __VISIBILITY_FLAG__
-				/* this visibility test gives a 5% performance hit, how to solve? */
-				traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
-				traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
-#else
 				traverseChild0 = (c0max >= c0min);
 				traverseChild1 = (c1max >= c1min);
-#endif
 
 #else // __KERNEL_SSE2__
 				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
@@ -145,14 +147,8 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 				const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
 
 				/* decide which nodes to traverse next */
-#ifdef __VISIBILITY_FLAG__
-				/* this visibility test gives a 5% performance hit, how to solve? */
-				traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
-				traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
-#else
 				traverseChild0 = (movemask(lrhit) & 1);
 				traverseChild1 = (movemask(lrhit) & 2);
-#endif
 #endif // __KERNEL_SSE2__
 
 				nodeAddr = __float_as_int(cnodes.x);
@@ -173,6 +169,7 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 					}
 
 					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
 					traversalStack[stackPtr] = nodeAddrChild1;
 				}
 				else {
@@ -190,57 +187,64 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 
 			/* if node is leaf, fetch triangle list */
 			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1));
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE);
 				int primAddr = __float_as_int(leaf.x);
 
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 				if(primAddr >= 0) {
 #endif
-					int primAddr2 = __float_as_int(leaf.y);
+					const int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
 
 					/* pop */
 					nodeAddr = traversalStack[stackPtr];
 					--stackPtr;
 
 					/* primitive intersection */
-					for(; primAddr < primAddr2; primAddr++) {
-						/* only primitives from the same object */
-						uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
-
-						if(tri_object != subsurface_object)
-							continue;
-
-						/* intersect ray against primitive */
-						uint type = kernel_tex_fetch(__prim_type, primAddr);
-
-						switch(type & PRIMITIVE_ALL) {
-							case PRIMITIVE_TRIANGLE: {
-								triangle_intersect_subsurface(kg, isect_array, P, dir, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
-								break;
+					switch(type & PRIMITIVE_ALL) {
+						case PRIMITIVE_TRIANGLE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from the same object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								if(tri_object != subsurface_object)
+									continue;
+								triangle_intersect_subsurface(kg, &isect_precalc, isect_array, P, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
 							}
-#if FEATURE(BVH_MOTION)
-							case PRIMITIVE_MOTION_TRIANGLE: {
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from the same object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								if(tri_object != subsurface_object)
+									continue;
 								motion_triangle_intersect_subsurface(kg, isect_array, P, dir, ray->time, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
-								break;
 							}
+							break;
+						}
 #endif
-							default: {
-								break;
-							}
+						default: {
+							break;
 						}
 					}
 				}
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 				else {
 					/* instance push */
 					if(subsurface_object == kernel_tex_fetch(__prim_object, -primAddr-1)) {
 						object = subsurface_object;
 
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
 #else
 						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
 #endif
+						triangle_intersect_precalc(dir, &isect_precalc);
 
 #if defined(__KERNEL_SSE2__)
 						Psplat[0] = ssef(P.x);
@@ -253,6 +257,7 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 #endif
 
 						++stackPtr;
+						kernel_assert(stackPtr < BVH_STACK_SIZE);
 						traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
 
 						nodeAddr = kernel_tex_fetch(__object_node, object);
@@ -264,20 +269,22 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 					}
 				}
 			}
-#endif
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
 		} while(nodeAddr != ENTRYPOINT_SENTINEL);
 
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 		if(stackPtr >= 0) {
 			kernel_assert(object != OBJECT_NONE);
 
 			/* instance pop */
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
 #else
 			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect_t);
 #endif
 
+			triangle_intersect_precalc(dir, &isect_precalc);
+
 #if defined(__KERNEL_SSE2__)
 			Psplat[0] = ssef(P.x);
 			Psplat[1] = ssef(P.y);
@@ -292,13 +299,40 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 			nodeAddr = traversalStack[stackPtr];
 			--stackPtr;
 		}
-#endif
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
 	} while(nodeAddr != ENTRYPOINT_SENTINEL);
 
 	return num_hits;
 }
 
-#undef FEATURE
+ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
+                                         const Ray *ray,
+                                         Intersection *isect_array,
+                                         int subsurface_object,
+                                         uint *lcg_state,
+                                         int max_hits)
+{
+#ifdef __QBVH__
+	if(kernel_data.bvh.use_qbvh) {
+		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+		                                    ray,
+		                                    isect_array,
+		                                    subsurface_object,
+		                                    lcg_state,
+		                                    max_hits);
+	}
+	else
+#endif
+	{
+		kernel_assert(kernel_data.bvh.use_qbvh == false);
+		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+		                                   ray,
+		                                   isect_array,
+		                                   subsurface_object,
+		                                   lcg_state,
+		                                   max_hits);
+	}
+}
+
 #undef BVH_FUNCTION_NAME
 #undef BVH_FUNCTION_FEATURES
-
diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h
index e39228c33de..73d79fd78ee 100644
--- a/intern/cycles/kernel/geom/geom_bvh_traversal.h
+++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h
@@ -17,6 +17,10 @@
  * limitations under the License.
  */
 
+#ifdef __QBVH__
+#include "geom_qbvh_traversal.h"
+#endif
+
 /* This is a template BVH traversal function, where various features can be
  * enabled/disabled. This way we can compile optimized versions for each case
  * without new features slowing things down.
@@ -28,14 +32,16 @@
  *
  */
 
-#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
-
-ccl_device bool BVH_FUNCTION_NAME
-(KernelGlobals *kg, const Ray *ray, Intersection *isect, const uint visibility
-#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-, uint *lcg_state, float difl, float extmax
+ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                            const Ray *ray,
+                                            Intersection *isect,
+                                            const uint visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+                                            , uint *lcg_state,
+                                            float difl,
+                                            float extmax
 #endif
-)
+                                            )
 {
 	/* todo:
 	 * - test if pushing distance on the stack helps (for non shadow rays)
@@ -58,15 +64,20 @@ ccl_device bool BVH_FUNCTION_NAME
 	float3 idir = bvh_inverse_direction(dir);
 	int object = OBJECT_NONE;
 
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 	Transform ob_tfm;
 #endif
 
 	isect->t = ray->t;
-	isect->object = OBJECT_NONE;
-	isect->prim = PRIM_NONE;
 	isect->u = 0.0f;
 	isect->v = 0.0f;
+	isect->prim = PRIM_NONE;
+	isect->object = OBJECT_NONE;
+
+#if defined(__KERNEL_DEBUG__)
+	isect->num_traversal_steps = 0;
+	isect->num_traversed_instances = 0;
+#endif
 
 #if defined(__KERNEL_SSE2__)
 	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
@@ -85,6 +96,9 @@ ccl_device bool BVH_FUNCTION_NAME
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
 	/* traversal loop */
 	do {
 		do {
@@ -122,7 +136,7 @@ ccl_device bool BVH_FUNCTION_NAME
 				NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
 				NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
 
-#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
 				if(difl != 0.0f) {
 					float hdiff = 1.0f + difl;
 					float ldiff = 1.0f - difl;
@@ -163,7 +177,7 @@ ccl_device bool BVH_FUNCTION_NAME
 				ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
 				const ssef tminmax = minmax ^ pn;
 
-#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
 				if(difl != 0.0f) {
 					float4 *tminmaxview = (float4*)&tminmax;
 					float &c0min = tminmaxview->x, &c1min = tminmaxview->y;
@@ -213,6 +227,7 @@ ccl_device bool BVH_FUNCTION_NAME
 					}
 
 					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
 					traversalStack[stackPtr] = nodeAddrChild1;
 				}
 				else {
@@ -226,80 +241,112 @@ ccl_device bool BVH_FUNCTION_NAME
 						--stackPtr;
 					}
 				}
+
+#if defined(__KERNEL_DEBUG__)
+				isect->num_traversal_steps++;
+#endif
 			}
 
 			/* if node is leaf, fetch triangle list */
 			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1));
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE);
 				int primAddr = __float_as_int(leaf.x);
 
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 				if(primAddr >= 0) {
 #endif
-					int primAddr2 = __float_as_int(leaf.y);
+					const int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
 
 					/* pop */
 					nodeAddr = traversalStack[stackPtr];
 					--stackPtr;
 
 					/* primitive intersection */
-					while(primAddr < primAddr2) {
-						bool hit;
-						uint type = kernel_tex_fetch(__prim_type, primAddr);
-
-						switch(type & PRIMITIVE_ALL) {
-							case PRIMITIVE_TRIANGLE: {
-								hit = triangle_intersect(kg, isect, P, dir, visibility, object, primAddr);
-								break;
+					switch(type & PRIMITIVE_ALL) {
+						case PRIMITIVE_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+#if defined(__KERNEL_DEBUG__)
+								isect->num_traversal_steps++;
+#endif
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) {
+									/* shadow ray early termination */
+#if defined(__KERNEL_SSE2__)
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+#else
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+#endif
+								}
 							}
-#if FEATURE(BVH_MOTION)
-							case PRIMITIVE_MOTION_TRIANGLE: {
-								hit = motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr);
-								break;
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+#if defined(__KERNEL_DEBUG__)
+								isect->num_traversal_steps++;
+#endif
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								if(motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr)) {
+									/* shadow ray early termination */
+#if defined(__KERNEL_SSE2__)
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+#else
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+#endif
+								}
 							}
+							break;
+						}
+#endif  /* BVH_FEATURE(BVH_MOTION) */
+#if BVH_FEATURE(BVH_HAIR)
+						case PRIMITIVE_CURVE:
+						case PRIMITIVE_MOTION_CURVE: {
+							for(; primAddr < primAddr2; primAddr++) {
+#if defined(__KERNEL_DEBUG__)
+								isect->num_traversal_steps++;
 #endif
-#if FEATURE(BVH_HAIR)
-							case PRIMITIVE_CURVE:
-							case PRIMITIVE_MOTION_CURVE: {
-								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								bool hit;
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
 									hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
 								else
 									hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
-								break;
-							}
-#endif
-							default: {
-								hit = false;
-								break;
-							}
-						}
-
-						/* shadow ray early termination */
+								if(hit) {
+									/* shadow ray early termination */
 #if defined(__KERNEL_SSE2__)
-						if(hit) {
-							if(visibility == PATH_RAY_SHADOW_OPAQUE)
-								return true;
-
-							tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-						}
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
 #else
-						if(hit && visibility == PATH_RAY_SHADOW_OPAQUE)
-							return true;
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
 #endif
-
-						primAddr++;
+								}
+							}
+							break;
+						}
+#endif  /* BVH_FEATURE(BVH_HAIR) */
 					}
 				}
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 				else {
 					/* instance push */
 					object = kernel_tex_fetch(__prim_object, -primAddr-1);
 
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
 #else
 					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
 #endif
+					triangle_intersect_precalc(dir, &isect_precalc);
 
 #if defined(__KERNEL_SSE2__)
 					Psplat[0] = ssef(P.x);
@@ -312,24 +359,30 @@ ccl_device bool BVH_FUNCTION_NAME
 #endif
 
 					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
 					traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
 
 					nodeAddr = kernel_tex_fetch(__object_node, object);
+
+#if defined(__KERNEL_DEBUG__)
+					isect->num_traversed_instances++;
+#endif
 				}
 			}
-#endif
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
 		} while(nodeAddr != ENTRYPOINT_SENTINEL);
 
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 		if(stackPtr >= 0) {
 			kernel_assert(object != OBJECT_NONE);
 
 			/* instance pop */
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
 #else
 			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
 #endif
+			triangle_intersect_precalc(dir, &isect_precalc);
 
 #if defined(__KERNEL_SSE2__)
 			Psplat[0] = ssef(P.x);
@@ -345,13 +398,52 @@ ccl_device bool BVH_FUNCTION_NAME
 			nodeAddr = traversalStack[stackPtr];
 			--stackPtr;
 		}
-#endif
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
 	} while(nodeAddr != ENTRYPOINT_SENTINEL);
 
 	return (isect->prim != PRIM_NONE);
 }
 
-#undef FEATURE
+ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+                                         const Ray *ray,
+                                         Intersection *isect,
+                                         const uint visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+                                         , uint *lcg_state,
+                                         float difl,
+                                         float extmax
+#endif
+                                         )
+{
+#ifdef __QBVH__
+	if(kernel_data.bvh.use_qbvh) {
+		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+		                                    ray,
+		                                    isect,
+		                                    visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+		                                    , lcg_state,
+		                                    difl,
+		                                    extmax
+#endif
+		                                    );
+	}
+	else
+#endif
+	{
+		kernel_assert(kernel_data.bvh.use_qbvh == false);
+		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+		                                   ray,
+		                                   isect,
+		                                   visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+		                                   , lcg_state,
+		                                   difl,
+		                                   extmax
+#endif
+		                                   );
+	}
+}
+
 #undef BVH_FUNCTION_NAME
 #undef BVH_FUNCTION_FEATURES
-
diff --git a/intern/cycles/kernel/geom/geom_bvh_volume.h b/intern/cycles/kernel/geom/geom_bvh_volume.h
new file mode 100644
index 00000000000..41c784869f2
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_bvh_volume.h
@@ -0,0 +1,358 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __QBVH__
+#include "geom_qbvh_volume.h"
+#endif
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                            const Ray *ray,
+                                            Intersection *isect)
+{
+	/* todo:
+	 * - test if pushing distance on the stack helps (for non shadow rays)
+	 * - separate version for shadow rays
+	 * - likely and unlikely for if() statements
+	 * - test restrict attribute for pointers
+	 */
+
+	/* traversal stack in CUDA thread-local memory */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL;
+
+	/* traversal variables in registers */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* ray parameters in registers */
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+
+	const uint visibility = PATH_RAY_ALL_VISIBILITY;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+	isect->t = ray->t;
+	isect->u = 0.0f;
+	isect->v = 0.0f;
+	isect->prim = PRIM_NONE;
+	isect->object = OBJECT_NONE;
+
+#if defined(__KERNEL_SSE2__)
+	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+	
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+	ssef Psplat[3], idirsplat[3];
+	shuffle_swap_t shufflexyz[3];
+
+	Psplat[0] = ssef(P.x);
+	Psplat[1] = ssef(P.y);
+	Psplat[2] = ssef(P.z);
+
+	ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t);
+
+	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* traversal loop */
+	do {
+		do {
+			/* traverse internal nodes */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				bool traverseChild0, traverseChild1;
+				int nodeAddrChild1;
+
+#if !defined(__KERNEL_SSE2__)
+				/* Intersect two child bounding boxes, non-SSE version */
+				float t = isect->t;
+
+				/* fetch node data */
+				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
+				float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
+				float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
+
+				/* intersect ray against child nodes */
+				NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+				NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+				/* decide which nodes to traverse next */
+				traverseChild0 = (c0max >= c0min);
+				traverseChild1 = (c1max >= c1min);
+
+#else // __KERNEL_SSE2__
+				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+
+				/* fetch node data */
+				const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+				const float4 cnodes = ((float4*)bvh_nodes)[3];
+
+				/* intersect ray against child nodes */
+				const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+				const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+				const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+
+				/* calculate { c0min, c1min, -c0max, -c1max} */
+				ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+				const ssef tminmax = minmax ^ pn;
+
+				const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
+
+				/* decide which nodes to traverse next */
+				traverseChild0 = (movemask(lrhit) & 1);
+				traverseChild1 = (movemask(lrhit) & 2);
+#endif // __KERNEL_SSE2__
+
+				nodeAddr = __float_as_int(cnodes.x);
+				nodeAddrChild1 = __float_as_int(cnodes.y);
+
+				if(traverseChild0 && traverseChild1) {
+					/* both children were intersected, push the farther one */
+#if !defined(__KERNEL_SSE2__)
+					bool closestChild1 = (c1min < c0min);
+#else
+					bool closestChild1 = tminmax[1] < tminmax[0];
+#endif
+
+					if(closestChild1) {
+						int tmp = nodeAddr;
+						nodeAddr = nodeAddrChild1;
+						nodeAddrChild1 = tmp;
+					}
+
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
+					traversalStack[stackPtr] = nodeAddrChild1;
+				}
+				else {
+					/* one child was intersected */
+					if(traverseChild1) {
+						nodeAddr = nodeAddrChild1;
+					}
+					else if(!traverseChild0) {
+						/* neither child was intersected */
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+
+			/* if node is leaf, fetch triangle list */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE);
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					const int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+
+					/* pop */
+					nodeAddr = traversalStack[stackPtr];
+					--stackPtr;
+
+					/* primitive intersection */
+					switch(type & PRIMITIVE_ALL) {
+						case PRIMITIVE_TRIANGLE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from volume object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr);
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from volume object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr);
+							}
+							break;
+						}
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+						case PRIMITIVE_CURVE:
+						case PRIMITIVE_MOTION_CURVE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from volume object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
+									bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								else
+									bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+							}
+							break;
+						}
+#endif
+						default: {
+							break;
+						}
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* instance push */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+					int object_flag = kernel_tex_fetch(__object_flag, object);
+
+					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
+#if BVH_FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
+#else
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+#endif
+
+						triangle_intersect_precalc(dir, &isect_precalc);
+
+#if defined(__KERNEL_SSE2__)
+						Psplat[0] = ssef(P.x);
+						Psplat[1] = ssef(P.y);
+						Psplat[2] = ssef(P.z);
+
+						tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+
+						gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_STACK_SIZE);
+						traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* pop */
+						object = OBJECT_NONE;
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			/* instance pop */
+#if BVH_FEATURE(BVH_MOTION)
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
+#else
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+#endif
+
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+#if defined(__KERNEL_SSE2__)
+			Psplat[0] = ssef(P.x);
+			Psplat[1] = ssef(P.y);
+			Psplat[2] = ssef(P.z);
+
+			tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+
+			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr];
+			--stackPtr;
+		}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return (isect->prim != PRIM_NONE);
+}
+
+ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+                                         const Ray *ray,
+                                         Intersection *isect)
+{
+#ifdef __QBVH__
+	if(kernel_data.bvh.use_qbvh) {
+		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+		                                    ray,
+		                                    isect);
+	}
+	else
+#endif
+	{
+		kernel_assert(kernel_data.bvh.use_qbvh == false);
+		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+		                                   ray,
+		                                   isect);
+	}
+}
+
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
diff --git a/intern/cycles/kernel/geom/geom_bvh_volume_all.h b/intern/cycles/kernel/geom/geom_bvh_volume_all.h
new file mode 100644
index 00000000000..b6db36f4b17
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_bvh_volume_all.h
@@ -0,0 +1,454 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __QBVH__
+#include "geom_qbvh_volume_all.h"
+#endif
+
+/* This is a template BVH traversal function for volumes, which records all
+ * hits (up to max_hits) in isect_array and where various features can be
+ * enabled/disabled. This way we can compile optimized versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                            const Ray *ray,
+                                            Intersection *isect_array,
+                                            const uint max_hits)
+{
+	/* todo:
+	 * - test if pushing distance on the stack helps (for non shadow rays)
+	 * - separate version for shadow rays
+	 * - likely and unlikely for if() statements
+	 * - test restrict attribute for pointers
+	 */
+
+	/* traversal stack in CUDA thread-local memory */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL;
+
+	/* traversal variables in registers */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* ray parameters in registers */
+	const float tmax = ray->t;
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = tmax;
+
+	const uint visibility = PATH_RAY_ALL_VISIBILITY;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+	int num_hits_in_instance = 0;
+#endif
+
+	uint num_hits = 0;
+	isect_array->t = tmax;
+
+#if defined(__KERNEL_SSE2__)
+	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+	
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+	ssef Psplat[3], idirsplat[3];
+	shuffle_swap_t shufflexyz[3];
+
+	Psplat[0] = ssef(P.x);
+	Psplat[1] = ssef(P.y);
+	Psplat[2] = ssef(P.z);
+
+	ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
+
+	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* traversal loop */
+	do {
+		do {
+			/* traverse internal nodes */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				bool traverseChild0, traverseChild1;
+				int nodeAddrChild1;
+
+#if !defined(__KERNEL_SSE2__)
+				/* Intersect two child bounding boxes, non-SSE version */
+				float t = isect_array->t;
+
+				/* fetch node data */
+				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
+				float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
+				float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
+
+				/* intersect ray against child nodes */
+				NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+				NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+				/* decide which nodes to traverse next */
+				traverseChild0 = (c0max >= c0min);
+				traverseChild1 = (c1max >= c1min);
+
+#else // __KERNEL_SSE2__
+				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+
+				/* fetch node data */
+				const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+				const float4 cnodes = ((float4*)bvh_nodes)[3];
+
+				/* intersect ray against child nodes */
+				const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+				const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+				const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+
+				/* calculate { c0min, c1min, -c0max, -c1max} */
+				ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+				const ssef tminmax = minmax ^ pn;
+
+				const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
+
+				/* decide which nodes to traverse next */
+				traverseChild0 = (movemask(lrhit) & 1);
+				traverseChild1 = (movemask(lrhit) & 2);
+#endif // __KERNEL_SSE2__
+
+				nodeAddr = __float_as_int(cnodes.x);
+				nodeAddrChild1 = __float_as_int(cnodes.y);
+
+				if(traverseChild0 && traverseChild1) {
+					/* both children were intersected, push the farther one */
+#if !defined(__KERNEL_SSE2__)
+					bool closestChild1 = (c1min < c0min);
+#else
+					bool closestChild1 = tminmax[1] < tminmax[0];
+#endif
+
+					if(closestChild1) {
+						int tmp = nodeAddr;
+						nodeAddr = nodeAddrChild1;
+						nodeAddrChild1 = tmp;
+					}
+
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
+					traversalStack[stackPtr] = nodeAddrChild1;
+				}
+				else {
+					/* one child was intersected */
+					if(traverseChild1) {
+						nodeAddr = nodeAddrChild1;
+					}
+					else if(!traverseChild0) {
+						/* neither child was intersected */
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+
+			/* if node is leaf, fetch triangle list */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE);
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					const int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+					bool hit;
+
+					/* pop */
+					nodeAddr = traversalStack[stackPtr];
+					--stackPtr;
+
+					/* primitive intersection */
+					switch(type & PRIMITIVE_ALL) {
+						case PRIMITIVE_TRIANGLE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from volume object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, primAddr);
+								if(hit) {
+									/* Move on to next entry in intersections array. */
+									isect_array++;
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									isect_array->t = isect_t;
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_MOTION)
+										float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir));
+#else
+										Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+										float t_fac = len(transform_direction(&tfm, 1.0f/idir));
+#endif
+										for(int i = 0; i < num_hits_in_instance; i++) {
+											(isect_array-i-1)->t *= t_fac;
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+								}
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from volume object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, primAddr);
+								if(hit) {
+									/* Move on to next entry in intersections array. */
+									isect_array++;
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									isect_array->t = isect_t;
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+#  if BVH_FEATURE(BVH_MOTION)
+										float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir));
+#  else
+										Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+										float t_fac = len(transform_direction(&tfm, 1.0f/idir));
+#endif
+										for(int i = 0; i < num_hits_in_instance; i++) {
+											(isect_array-i-1)->t *= t_fac;
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+								}
+							}
+							break;
+						}
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+						case PRIMITIVE_CURVE:
+						case PRIMITIVE_MOTION_CURVE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from volume object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
+									hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								else
+									hit = bvh_curve_intersect(kg, isect_array, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								if(hit) {
+									/* Move on to next entry in intersections array. */
+									isect_array++;
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									isect_array->t = isect_t;
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+#  if BVH_FEATURE(BVH_MOTION)
+										float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir));
+#  else
+										Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+										float t_fac = len(transform_direction(&tfm, 1.0f/idir));
+#endif
+										for(int i = 0; i < num_hits_in_instance; i++) {
+											(isect_array-i-1)->t *= t_fac;
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+								}
+							}
+							break;
+						}
+#endif
+						default: {
+							break;
+						}
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* instance push */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+					int object_flag = kernel_tex_fetch(__object_flag, object);
+
+					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
+#if BVH_FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
+#else
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+#endif
+
+						triangle_intersect_precalc(dir, &isect_precalc);
+						num_hits_in_instance = 0;
+						isect_array->t = isect_t;
+
+#if defined(__KERNEL_SSE2__)
+						Psplat[0] = ssef(P.x);
+						Psplat[1] = ssef(P.y);
+						Psplat[2] = ssef(P.z);
+
+						tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+
+						gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_STACK_SIZE);
+						traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* pop */
+						object = OBJECT_NONE;
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			if(num_hits_in_instance) {
+				float t_fac;
+#if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_tfm);
+#else
+				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+#endif
+				triangle_intersect_precalc(dir, &isect_precalc);
+				/* Scale isect->t to adjust for instancing. */
+				for(int i = 0; i < num_hits_in_instance; i++) {
+					(isect_array-i-1)->t *= t_fac;
+				}
+			}
+			else {
+				float ignore_t = FLT_MAX;
+#if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_tfm);
+#else
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+#endif
+				triangle_intersect_precalc(dir, &isect_precalc);
+			}
+
+			isect_t = tmax;
+			isect_array->t = isect_t;
+
+#if defined(__KERNEL_SSE2__)
+			Psplat[0] = ssef(P.x);
+			Psplat[1] = ssef(P.y);
+			Psplat[2] = ssef(P.z);
+
+			tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+
+			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr];
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return num_hits;
+}
+
+ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
+                                         const Ray *ray,
+                                         Intersection *isect_array,
+                                         const uint max_hits)
+{
+#ifdef __QBVH__
+	if(kernel_data.bvh.use_qbvh) {
+		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+		                                    ray,
+		                                    isect_array,
+		                                    max_hits);
+	}
+	else
+#endif
+	{
+		kernel_assert(kernel_data.bvh.use_qbvh == false);
+		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+		                                   ray,
+		                                   isect_array,
+		                                   max_hits);
+	}
+}
+
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index c4e9e2ababe..9653ad8f1bb 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -32,22 +32,22 @@ ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 		if(dy) *dy = 0.0f;
 #endif
 
-		return kernel_tex_fetch(__attributes_float, offset + sd->prim);
+		return kernel_tex_fetch(__attributes_float, offset + ccl_fetch(sd, prim));
 	}
 	else if(elem == ATTR_ELEMENT_CURVE_KEY || elem == ATTR_ELEMENT_CURVE_KEY_MOTION) {
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 		int k1 = k0 + 1;
 
 		float f0 = kernel_tex_fetch(__attributes_float, offset + k0);
 		float f1 = kernel_tex_fetch(__attributes_float, offset + k1);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*(f1 - f0);
+		if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
 		if(dy) *dy = 0.0f;
 #endif
 
-		return (1.0f - sd->u)*f0 + sd->u*f1;
+		return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1;
 	}
 	else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -71,22 +71,22 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + ccl_fetch(sd, prim)));
 	}
 	else if(elem == ATTR_ELEMENT_CURVE_KEY || elem == ATTR_ELEMENT_CURVE_KEY_MOTION) {
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 		int k1 = k0 + 1;
 
 		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + k0));
 		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + k1));
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*(f1 - f0);
+		if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
-		return (1.0f - sd->u)*f0 + sd->u*f1;
+		return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1;
 	}
 	else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -104,22 +104,22 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
 {
 	float r = 0.0f;
 
-	if(sd->type & PRIMITIVE_ALL_CURVE) {
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 		int k1 = k0 + 1;
 
 		float4 P_curve[2];
 
-		if(sd->type & PRIMITIVE_CURVE) {
+		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
 			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 		}
 		else {
-			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
+			motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve);
 		}
 
-		r = (P_curve[1].w - P_curve[0].w) * sd->u + P_curve[0].w;
+		r = (P_curve[1].w - P_curve[0].w) * ccl_fetch(sd, u) + P_curve[0].w;
 	}
 
 	return r*2.0f;
@@ -130,8 +130,8 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
 
 ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd)
 {
-	float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-	int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+	float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+	int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 	int k1 = k0 + 1;
 
 	float4 P_curve[2];
@@ -139,7 +139,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd
 	P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 	P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 
-	return float4_to_float3(P_curve[1]) * sd->u + float4_to_float3(P_curve[0]) * (1.0f - sd->u);
+	return float4_to_float3(P_curve[1]) * ccl_fetch(sd, u) + float4_to_float3(P_curve[0]) * (1.0f - ccl_fetch(sd, u));
 }
 
 /* Curve tangent normal */
@@ -148,14 +148,14 @@ ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
 {	
 	float3 tgN = make_float3(0.0f,0.0f,0.0f);
 
-	if(sd->type & PRIMITIVE_ALL_CURVE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 
-		tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu)));
+		tgN = -(-ccl_fetch(sd, I) - ccl_fetch(sd, dPdu) * (dot(ccl_fetch(sd, dPdu),-ccl_fetch(sd, I)) / len_squared(ccl_fetch(sd, dPdu))));
 		tgN = normalize(tgN);
 
 		/* need to find suitable scaled gd for corrected normal */
 #if 0
-		tgN = normalize(tgN - gd * sd->dPdu);
+		tgN = normalize(tgN - gd * ccl_fetch(sd, dPdu));
 #endif
 	}
 
@@ -442,12 +442,12 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 		float r_ext = mw_extension + r_curr;
 		float coverage = 1.0f;
 
-		if (bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
+		if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
 			/* the bounding box does not overlap the square centered at O */
 			tree += level;
 			level = tree & -tree;
 		}
-		else if (level == 1) {
+		else if(level == 1) {
 
 			/* the maximum recursion depth is reached.
 			* check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0.
@@ -459,13 +459,13 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 			if(flags & CURVE_KN_RIBBONS) {
 				float3 tg = (p_en - p_st);
 				float w = tg.x * tg.x + tg.y * tg.y;
-				if (w == 0) {
+				if(w == 0) {
 					tree++;
 					level = tree & -tree;
 					continue;
 				}
 				w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
-				w = clamp((float)w, 0.0f, 1.0f);
+				w = saturate(w);
 
 				/* compute u on the curve segment */
 				u = i_st * (1 - w) + i_en * w;
@@ -474,17 +474,17 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 				float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0];
 
 				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-				if (dot(tg, dp_st)< 0)
+				if(dot(tg, dp_st)< 0)
 					dp_st *= -1;
-				if (dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
+				if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
 					tree++;
 					level = tree & -tree;
 					continue;
 				}
 				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-				if (dot(tg, dp_en) < 0)
+				if(dot(tg, dp_en) < 0)
 					dp_en *= -1;
-				if (dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
+				if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
 					tree++;
 					level = tree & -tree;
 					continue;
@@ -500,13 +500,13 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 					float d0 = d - r_curr;
 					float d1 = d + r_curr;
 					float inv_mw_extension = 1.0f/mw_extension;
-					if (d0 >= 0)
+					if(d0 >= 0)
 						coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f;
 					else // inside
 						coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f;
 				}
 				
-				if (p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
+				if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
 					tree++;
 					level = tree & -tree;
 					continue;
@@ -548,7 +548,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 				float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1)));
 				float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd;
 				float td = tb*tb - 4*cyla*tc;
-				if (td < 0.0f) {
+				if(td < 0.0f) {
 					tree++;
 					level = tree & -tree;
 					continue;
@@ -559,10 +559,10 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 				t = tcentre + correction;
 
 				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-				if (dot(tg, dp_st)< 0)
+				if(dot(tg, dp_st)< 0)
 					dp_st *= -1;
 				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-				if (dot(tg, dp_en) < 0)
+				if(dot(tg, dp_en) < 0)
 					dp_en *= -1;
 
 				if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) {
@@ -570,14 +570,14 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 					t = tcentre + correction;
 				}			
 
-				if (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
+				if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
 					tree++;
 					level = tree & -tree;
 					continue;
 				}
 
 				float w = (zcentre + (tg.z * correction)) * invl;
-				w = clamp((float)w, 0.0f, 1.0f);
+				w = saturate(w);
 				/* compute u on the curve segment */
 				u = i_st * (1 - w) + i_en * w;
 
@@ -600,12 +600,12 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 #endif
 			{
 				/* record intersection */
+				isect->t = t;
+				isect->u = u;
+				isect->v = gd;
 				isect->prim = curveAddr;
 				isect->object = object;
 				isect->type = type;
-				isect->u = u;
-				isect->v = gd;
-				isect->t = t;
 				hit = true;
 			}
 			
@@ -646,8 +646,8 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
 	float4 P_curve[2];
 
 	if(type & PRIMITIVE_CURVE) {
-		P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
-		P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
+		P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
+		P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
 	}
 	else {
 		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
@@ -709,7 +709,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
 	const ssef sphere_dif1 = (dif + dif_second) * 0.5f;
 	const ssef dir = load4f(direction);
 	const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1);
-	const ssef sphere_dif2 = nmsub(sphere_b_tmp, dir, sphere_dif1);
+	const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1);
 #endif
 
 	float mr = max(r1, r2);
@@ -777,7 +777,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
 	float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma;
 	float td = tb*tb - 4*a*tc;
 
-	if (td < 0.0f)
+	if(td < 0.0f)
 		return false;
 
 	float rootd = 0.0f;
@@ -818,7 +818,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
 
 		if(t > 0.0f && t < isect->t && z >= 0 && z <= l) {
 
-			if (flags & CURVE_KN_ENCLOSEFILTER) {
+			if(flags & CURVE_KN_ENCLOSEFILTER) {
 				float enc_ratio = 1.01f;
 				if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) {
 					float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
@@ -835,12 +835,12 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
 #endif
 			{
 				/* record intersection */
+				isect->t = t;
+				isect->u = z*invl;
+				isect->v = gd;
 				isect->prim = curveAddr;
 				isect->object = object;
 				isect->type = type;
-				isect->u = z*invl;
-				isect->v = gd;
-				isect->t = t;
 
 				return true;
 			}
@@ -890,7 +890,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
+		Transform tfm = ccl_fetch(sd, ob_itfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
 #endif
@@ -903,7 +903,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 	int prim = kernel_tex_fetch(__prim_index, isect->prim);
 	float4 v00 = kernel_tex_fetch(__curves, prim);
 
-	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 	int k1 = k0 + 1;
 
 	float3 tg;
@@ -914,14 +914,14 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 
 		float4 P_curve[4];
 
-		if(sd->type & PRIMITIVE_CURVE) {
+		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
 			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
 			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
 			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
 			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
 		}
 		else {
-			motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
+			motion_cardinal_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), ka, k0, k1, kb, P_curve);
 		}
 
 		float3 p[4];
@@ -933,43 +933,43 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 		P = P + D*t;
 
 #ifdef __UV__
-		sd->u = isect->u;
-		sd->v = 0.0f;
+		ccl_fetch(sd, u) = isect->u;
+		ccl_fetch(sd, v) = 0.0f;
 #endif
 
 		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
 
 		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
-			sd->Ng = normalize(-(D - tg * (dot(tg, D))));
+			ccl_fetch(sd, Ng) = normalize(-(D - tg * (dot(tg, D))));
 		}
 		else {
 			/* direction from inside to surface of curve */
 			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);	
-			sd->Ng = normalize(P - p_curr);
+			ccl_fetch(sd, Ng) = normalize(P - p_curr);
 
 			/* adjustment for changing radius */
 			float gd = isect->v;
 
 			if(gd != 0.0f) {
-				sd->Ng = sd->Ng - gd * tg;
-				sd->Ng = normalize(sd->Ng);
+				ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg;
+				ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
 			}
 		}
 
 		/* todo: sometimes the normal is still so that this is detected as
 		 * backfacing even if cull backfaces is enabled */
 
-		sd->N = sd->Ng;
+		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
 	}
 	else {
 		float4 P_curve[2];
 
-		if(sd->type & PRIMITIVE_CURVE) {
+		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
 			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 		}
 		else {
-			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
+			motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve);
 		}
 
 		float l = 1.0f;
@@ -980,39 +980,39 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 		float3 dif = P - float4_to_float3(P_curve[0]);
 
 #ifdef __UV__
-		sd->u = dot(dif,tg)/l;
-		sd->v = 0.0f;
+		ccl_fetch(sd, u) = dot(dif,tg)/l;
+		ccl_fetch(sd, v) = 0.0f;
 #endif
 
-		if (flag & CURVE_KN_TRUETANGENTGNORMAL) {
-			sd->Ng = -(D - tg * dot(tg, D));
-			sd->Ng = normalize(sd->Ng);
+		if(flag & CURVE_KN_TRUETANGENTGNORMAL) {
+			ccl_fetch(sd, Ng) = -(D - tg * dot(tg, D));
+			ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
 		}
 		else {
 			float gd = isect->v;
 
 			/* direction from inside to surface of curve */
-			sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd);
+			ccl_fetch(sd, Ng) = (dif - tg * ccl_fetch(sd, u) * l) / (P_curve[0].w + ccl_fetch(sd, u) * l * gd);
 
 			/* adjustment for changing radius */
-			if (gd != 0.0f) {
-				sd->Ng = sd->Ng - gd * tg;
-				sd->Ng = normalize(sd->Ng);
+			if(gd != 0.0f) {
+				ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg;
+				ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
 			}
 		}
 
-		sd->N = sd->Ng;
+		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
 	}
 
 #ifdef __DPDU__
 	/* dPdu/dPdv */
-	sd->dPdu = tg;
-	sd->dPdv = cross(tg, sd->Ng);
+	ccl_fetch(sd, dPdu) = tg;
+	ccl_fetch(sd, dPdv) = cross(tg, ccl_fetch(sd, Ng));
 #endif
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
+		Transform tfm = ccl_fetch(sd, ob_tfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
 #endif
diff --git a/intern/cycles/kernel/geom/geom_motion_curve.h b/intern/cycles/kernel/geom/geom_motion_curve.h
index 1022a957b05..6de5aa7ea99 100644
--- a/intern/cycles/kernel/geom/geom_motion_curve.h
+++ b/intern/cycles/kernel/geom/geom_motion_curve.h
@@ -27,17 +27,22 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg, int object, uint id, AttributeElement *elem)
 {
-	/* todo: find a better (faster) solution for this, maybe store offset per object */
+	/* todo: find a better (faster) solution for this, maybe store offset per object.
+	 *
+	 * NOTE: currently it's not a bottleneck because in test scenes the loop below runs
+	 * zero iterations and rendering is really slow with motion curves. Until other
+	 * areas are sped up it's probably not so crucial to optimize this out.
+	 */
 	uint attr_offset = object*kernel_data.bvh.attributes_map_stride + ATTR_PRIM_CURVE;
 	uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
-	
+
 	while(attr_map.x != id) {
 		attr_offset += ATTR_PRIM_TYPES;
 		attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 	}
 
 	*elem = (AttributeElement)attr_map.y;
-	
+
 	/* return result */
 	return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z;
 }
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h
index b275b89a8a4..86f93f242a1 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -130,8 +130,11 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, ShaderData *s
 
 #ifdef __INTERSECTION_REFINE__
 	if(isect->object != OBJECT_NONE) {
+		if(UNLIKELY(t == 0.0f)) {
+			return P;
+		}
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
+		Transform tfm = ccl_fetch(sd, ob_itfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
 #endif
@@ -158,7 +161,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, ShaderData *s
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
+		Transform tfm = ccl_fetch(sd, ob_tfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
 #endif
@@ -184,7 +187,7 @@ ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, Sh
 #ifdef __INTERSECTION_REFINE__
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
+		Transform tfm = ccl_fetch(sd, ob_itfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
 #endif
@@ -210,7 +213,7 @@ ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, Sh
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
+		Transform tfm = ccl_fetch(sd, ob_tfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
 #endif
@@ -233,25 +236,25 @@ ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, Sh
 ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool subsurface)
 {
 	/* get shader */
-	sd->shader =  kernel_tex_fetch(__tri_shader, sd->prim);
+	ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
 
 	/* get motion info */
 	int numsteps, numverts;
-	object_motion_info(kg, sd->object, &numsteps, &numverts, NULL);
+	object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL);
 
 	/* figure out which steps we need to fetch and their interpolation factor */
 	int maxstep = numsteps*2;
-	int step = min((int)(sd->time*maxstep), maxstep-1);
-	float t = sd->time*maxstep - step;
+	int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1);
+	float t = ccl_fetch(sd, time)*maxstep - step;
 
 	/* find attribute */
 	AttributeElement elem;
-	int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_POSITION, &elem);
+	int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_POSITION, &elem);
 	kernel_assert(offset != ATTR_STD_NOT_FOUND);
 
 	/* fetch vertex coordinates */
 	float3 verts[3], next_verts[3];
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)));
 
 	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
 	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts);
@@ -265,33 +268,33 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD
 #ifdef __SUBSURFACE__
 	if(!subsurface)
 #endif
-		sd->P = motion_triangle_refine(kg, sd, isect, ray, verts);
+		ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts);
 #ifdef __SUBSURFACE__
 	else
-		sd->P = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts);
+		ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts);
 #endif
 
 	/* compute face normal */
 	float3 Ng;
-	if(sd->flag & SD_NEGATIVE_SCALE_APPLIED)
+	if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED)
 		Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0]));
 	else
 		Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
 
-	sd->Ng = Ng;
-	sd->N = Ng;
+	ccl_fetch(sd, Ng) = Ng;
+	ccl_fetch(sd, N) = Ng;
 
 	/* compute derivatives of P w.r.t. uv */
 #ifdef __DPDU__
-	sd->dPdu = (verts[0] - verts[2]);
-	sd->dPdv = (verts[1] - verts[2]);
+	ccl_fetch(sd, dPdu) = (verts[0] - verts[2]);
+	ccl_fetch(sd, dPdv) = (verts[1] - verts[2]);
 #endif
 
 	/* compute smooth normal */
-	if(sd->shader & SHADER_SMOOTH_NORMAL) {
+	if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
 		/* find attribute */
 		AttributeElement elem;
-		int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_NORMAL, &elem);
+		int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_NORMAL, &elem);
 		kernel_assert(offset != ATTR_STD_NOT_FOUND);
 
 		/* fetch vertex coordinates */
@@ -305,10 +308,10 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD
 		normals[2] = (1.0f - t)*normals[2] + t*next_normals[2];
 
 		/* interpolate between vertices */
-		float u = sd->u;
-		float v = sd->v;
+		float u = ccl_fetch(sd, u);
+		float v = ccl_fetch(sd, v);
 		float w = 1.0f - u - v;
-		sd->N = (u*normals[0] + v*normals[1] + w*normals[2]);
+		ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]);
 	}
 }
 
@@ -336,12 +339,12 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection
 		if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
 #endif
 		{
+			isect->t = t;
+			isect->u = u;
+			isect->v = v;
 			isect->prim = triAddr;
 			isect->object = object;
 			isect->type = PRIMITIVE_MOTION_TRIANGLE;
-			isect->u = u;
-			isect->v = v;
-			isect->t = t;
 		
 			return true;
 		}
@@ -388,12 +391,12 @@ ccl_device_inline void motion_triangle_intersect_subsurface(KernelGlobals *kg, I
 
 		/* record intersection */
 		Intersection *isect = &isect_array[hit];
+		isect->t = t;
+		isect->u = u;
+		isect->v = v;
 		isect->prim = triAddr;
 		isect->object = object;
 		isect->type = PRIMITIVE_MOTION_TRIANGLE;
-		isect->u = u;
-		isect->v = v;
-		isect->t = t;
 	}
 }
 #endif
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index 91edd5863ac..9d0a008fff1 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -123,9 +123,9 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg
 ccl_device_inline void object_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
-	*P = transform_point(&sd->ob_tfm, *P);
+	*P = transform_point_auto(&ccl_fetch(sd, ob_tfm), *P);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
 	*P = transform_point(&tfm, *P);
 #endif
 }
@@ -135,9 +135,9 @@ ccl_device_inline void object_position_transform(KernelGlobals *kg, const Shader
 ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
-	*P = transform_point(&sd->ob_itfm, *P);
+	*P = transform_point_auto(&ccl_fetch(sd, ob_itfm), *P);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
 	*P = transform_point(&tfm, *P);
 #endif
 }
@@ -147,9 +147,9 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, cons
 ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
-	*N = normalize(transform_direction_transposed(&sd->ob_tfm, *N));
+	*N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_tfm), *N));
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
 	*N = normalize(transform_direction_transposed(&tfm, *N));
 #endif
 }
@@ -159,9 +159,9 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const
 ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
-	*N = normalize(transform_direction_transposed(&sd->ob_itfm, *N));
+	*N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_itfm), *N));
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
 	*N = normalize(transform_direction_transposed(&tfm, *N));
 #endif
 }
@@ -171,9 +171,9 @@ ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderDa
 ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
-	*D = transform_direction(&sd->ob_tfm, *D);
+	*D = transform_direction_auto(&ccl_fetch(sd, ob_tfm), *D);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
 	*D = transform_direction(&tfm, *D);
 #endif
 }
@@ -183,9 +183,9 @@ ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData
 ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
-	*D = transform_direction(&sd->ob_itfm, *D);
+	*D = transform_direction_auto(&ccl_fetch(sd, ob_itfm), *D);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
 	*D = transform_direction(&tfm, *D);
 #endif
 }
@@ -194,13 +194,13 @@ ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const Sha
 
 ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd)
 {
-	if(sd->object == OBJECT_NONE)
+	if(ccl_fetch(sd, object) == OBJECT_NONE)
 		return make_float3(0.0f, 0.0f, 0.0f);
 
 #ifdef __OBJECT_MOTION__
-	return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w);
+	return make_float3(ccl_fetch(sd, ob_tfm).x.w, ccl_fetch(sd, ob_tfm).y.w, ccl_fetch(sd, ob_tfm).z.w);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
 	return make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
 #endif
 }
@@ -243,7 +243,7 @@ ccl_device_inline float object_random_number(KernelGlobals *kg, int object)
 ccl_device_inline int object_particle_id(KernelGlobals *kg, int object)
 {
 	if(object == OBJECT_NONE)
-		return 0.0f;
+		return 0;
 
 	int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES;
 	float4 f = kernel_tex_fetch(__objects, offset);
@@ -296,7 +296,7 @@ ccl_device_inline void object_motion_info(KernelGlobals *kg, int object, int *nu
 
 ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
 {
-	return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2 + 1);
+	return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2 + 1);
 }
 
 /* Particle data from which object was instanced */
@@ -377,7 +377,7 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir)
 
 /* Transform ray into object space to enter static object in BVH */
 
-ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
+ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
 {
 	Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
 
@@ -391,9 +391,41 @@ ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ra
 		*t *= len;
 }
 
+#ifdef __QBVH__
+/* Same as above, but optimized for QBVH scene intersection,
+ * which needs to modify two max distances.
+ *
+ * TODO(sergey): Investigate if passing NULL instead of t1 gets optimized
+ * so we can avoid having this duplication.
+ */
+ccl_device_inline void qbvh_instance_push(KernelGlobals *kg,
+                                          int object,
+                                          const Ray *ray,
+                                          float3 *P,
+                                          float3 *dir,
+                                          float3 *idir,
+                                          float *t,
+                                          float *t1)
+{
+	Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+
+	*P = transform_point(&tfm, ray->P);
+
+	float len;
+	*dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len));
+	*idir = bvh_inverse_direction(*dir);
+
+	if(*t != FLT_MAX)
+		*t *= len;
+
+	if(*t1 != -FLT_MAX)
+		*t1 *= len;
+}
+#endif
+
 /* Transorm ray to exit static object in BVH */
 
-ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
+ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
 {
 	if(*t != FLT_MAX) {
 		Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
@@ -421,7 +453,7 @@ ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg, int object, co
 #ifdef __OBJECT_MOTION__
 /* Transform ray into object space to enter motion blurred object in BVH */
 
-ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm)
+ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t, Transform *tfm)
 {
 	Transform itfm;
 	*tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm);
@@ -436,9 +468,36 @@ ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, c
 		*t *= len;
 }
 
+#ifdef __QBVH__
+/* Same as above, but optimized for QBVH scene intersection,
+ * which needs to modify two max distances.
+ *
+ * TODO(sergey): Investigate if passing NULL instead of t1 gets optimized
+ * so we can avoid having this duplication.
+ */
+ccl_device_inline void qbvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, float *t1, Transform *tfm)
+{
+	Transform itfm;
+	*tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm);
+
+	*P = transform_point(&itfm, ray->P);
+
+	float len;
+	*dir = bvh_clamp_direction(normalize_len(transform_direction(&itfm, ray->D), &len));
+	*idir = bvh_inverse_direction(*dir);
+
+
+	if(*t != FLT_MAX)
+		*t *= len;
+
+	if(*t1 != -FLT_MAX)
+		*t1 *= len;
+}
+#endif
+
 /* Transorm ray to exit motion blurred object in BVH */
 
-ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm)
+ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t, Transform *tfm)
 {
 	if(*t != FLT_MAX)
 		*t *= len(transform_direction(tfm, 1.0f/(*idir)));
@@ -461,5 +520,38 @@ ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg, int obj
 
 #endif
 
+/* TODO(sergey): This is only needed until we've got OpenCL 2.0
+ * on all devices we consider supported. It'll be replaced with
+ * the generic address space.
+ */
+
+#ifdef __KERNEL_OPENCL__
+ccl_device_inline void object_dir_transform_addrspace(KernelGlobals *kg,
+                                                      const ShaderData *sd,
+                                                      ccl_addr_space float3 *D)
+{
+	float3 private_D = *D;
+	object_dir_transform(kg, sd, &private_D);
+	*D = private_D;
+}
+
+ccl_device_inline void object_normal_transform_addrspace(KernelGlobals *kg,
+                                                         const ShaderData *sd,
+                                                         ccl_addr_space float3 *N)
+{
+	float3 private_N = *N;
+	object_normal_transform(kg, sd, &private_N);
+	*N = private_N;
+}
+#endif
+
+#ifndef __KERNEL_OPENCL__
+#  define object_dir_transform_auto object_dir_transform
+#  define object_normal_transform_auto object_normal_transform
+#else
+#  define object_dir_transform_auto object_dir_transform_addrspace
+#  define object_normal_transform_auto object_normal_transform_addrspace
+#endif
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index 5df6c75df86..30f12d32355 100644
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* Primitive Utilities
@@ -25,16 +25,16 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
 {
-	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
 		return triangle_attribute_float(kg, sd, elem, offset, dx, dy);
 	}
 #ifdef __HAIR__
-	else if(sd->type & PRIMITIVE_ALL_CURVE) {
+	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 		return curve_attribute_float(kg, sd, elem, offset, dx, dy);
 	}
 #endif
 #ifdef __VOLUME__
-	else if(sd->object != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
+	else if(ccl_fetch(sd, object) != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
 		return volume_attribute_float(kg, sd, elem, offset, dx, dy);
 	}
 #endif
@@ -47,16 +47,16 @@ ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *
 
 ccl_device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
 {
-	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
 		return triangle_attribute_float3(kg, sd, elem, offset, dx, dy);
 	}
 #ifdef __HAIR__
-	else if(sd->type & PRIMITIVE_ALL_CURVE) {
+	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 		return curve_attribute_float3(kg, sd, elem, offset, dx, dy);
 	}
 #endif
 #ifdef __VOLUME__
-	else if(sd->object != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
+	else if(ccl_fetch(sd, object) != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
 		return volume_attribute_float3(kg, sd, elem, offset, dx, dy);
 	}
 #endif
@@ -108,9 +108,9 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in
 ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __HAIR__
-	if(sd->type & PRIMITIVE_ALL_CURVE)
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)
 #ifdef __DPDU__
-		return normalize(sd->dPdu);
+		return normalize(ccl_fetch(sd, dPdu));
 #else
 		return make_float3(0.0f, 0.0f, 0.0f);
 #endif
@@ -124,12 +124,12 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 		float3 data = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL);
 		data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f);
 		object_normal_transform(kg, sd, &data);
-		return cross(sd->N, normalize(cross(data, sd->N)));
+		return cross(ccl_fetch(sd, N), normalize(cross(data, ccl_fetch(sd, N))));
 	}
 	else {
 		/* otherwise use surface derivatives */
 #ifdef __DPDU__
-		return normalize(sd->dPdu);
+		return normalize(ccl_fetch(sd, dPdu));
 #else
 		return make_float3(0.0f, 0.0f, 0.0f);
 #endif
@@ -144,15 +144,16 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 	float3 center;
 
 #ifdef __HAIR__
-	if(sd->type & PRIMITIVE_ALL_CURVE) {
+	bool is_curve_primitive = ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE;
+	if(is_curve_primitive) {
 		center = curve_motion_center_location(kg, sd);
 
-		if(!(sd->flag & SD_TRANSFORM_APPLIED))
+		if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED))
 			object_position_transform(kg, sd, &center);
 	}
 	else
 #endif
-		center = sd->P;
+		center = ccl_fetch(sd, P);
 
 	float3 motion_pre = center, motion_post = center;
 
@@ -163,30 +164,37 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 	if(offset != ATTR_STD_NOT_FOUND) {
 		/* get motion info */
 		int numverts, numkeys;
-		object_motion_info(kg, sd->object, NULL, &numverts, &numkeys);
+		object_motion_info(kg, ccl_fetch(sd, object), NULL, &numverts, &numkeys);
 
 		/* lookup attributes */
-		int offset_next = (sd->type & PRIMITIVE_ALL_TRIANGLE)? offset + numverts: offset + numkeys;
+		int offset_next = (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? offset + numverts: offset + numkeys;
 
 		motion_pre = primitive_attribute_float3(kg, sd, elem, offset, NULL, NULL);
 		motion_post = primitive_attribute_float3(kg, sd, elem, offset_next, NULL, NULL);
+
+#ifdef __HAIR__
+		if(is_curve_primitive && (ccl_fetch(sd, flag) & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
+			object_position_transform(kg, sd, &motion_pre);
+			object_position_transform(kg, sd, &motion_post);
+		}
+#endif
 	}
 
 	/* object motion. note that depending on the mesh having motion vectors, this
 	 * transformation was set match the world/object space of motion_pre/post */
 	Transform tfm;
 	
-	tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_PRE);
+	tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_PRE);
 	motion_pre = transform_point(&tfm, motion_pre);
 
-	tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_POST);
+	tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_POST);
 	motion_post = transform_point(&tfm, motion_post);
 
 	float3 motion_center;
 
 	/* camera motion, for perspective/orthographic motion.pre/post will be a
 	 * world-to-raster matrix, for panorama it's world-to-camera */
-	if (kernel_data.cam.type != CAMERA_PANORAMA) {
+	if(kernel_data.cam.type != CAMERA_PANORAMA) {
 		tfm = kernel_data.cam.worldtoraster;
 		motion_center = transform_perspective(&tfm, center);
 
diff --git a/intern/cycles/kernel/geom/geom_qbvh.h b/intern/cycles/kernel/geom/geom_qbvh.h
new file mode 100644
index 00000000000..37deaac0800
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_qbvh.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+struct QBVHStackItem {
+	int addr;
+	float dist;
+};
+
+/* TODO(sergey): Investigate if using intrinsics helps for both
+ * stack item swap and float comparison.
+ */
+ccl_device_inline void qbvh_item_swap(QBVHStackItem *__restrict a,
+                                      QBVHStackItem *__restrict b)
+{
+	QBVHStackItem tmp = *a;
+	*a = *b;
+	*b = tmp;
+}
+
+ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
+                                       QBVHStackItem *__restrict s2,
+                                       QBVHStackItem *__restrict s3)
+{
+	if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); }
+	if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); }
+	if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); }
+}
+
+ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
+                                       QBVHStackItem *__restrict s2,
+                                       QBVHStackItem *__restrict s3,
+                                       QBVHStackItem *__restrict s4)
+{
+	if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); }
+	if(s4->dist < s3->dist) { qbvh_item_swap(s4, s3); }
+	if(s3->dist < s1->dist) { qbvh_item_swap(s3, s1); }
+	if(s4->dist < s2->dist) { qbvh_item_swap(s4, s2); }
+	if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); }
+}
+
+ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg,
+                                          const ssef& tnear,
+                                          const ssef& tfar,
+#ifdef __KERNEL_AVX2__
+                                          const sse3f& org_idir,
+#else
+                                          const sse3f& org,
+#endif
+                                          const sse3f& idir,
+                                          const int near_x,
+                                          const int near_y,
+                                          const int near_z,
+                                          const int far_x,
+                                          const int far_y,
+                                          const int far_z,
+                                          const int nodeAddr,
+                                          ssef *__restrict dist)
+{
+	const int offset = nodeAddr*BVH_QNODE_SIZE;
+#ifdef __KERNEL_AVX2__
+	const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, org_idir.x);
+	const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, org_idir.y);
+	const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, org_idir.z);
+	const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, org_idir.x);
+	const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, org_idir.y);
+	const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, org_idir.z);
+#else
+	const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - org.x) * idir.x;
+	const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - org.y) * idir.y;
+	const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - org.z) * idir.z;
+	const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - org.x) * idir.x;
+	const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - org.y) * idir.y;
+	const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - org.z) * idir.z;
+#endif
+
+#ifdef __KERNEL_SSE41__
+	const ssef tNear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, tnear));
+	const ssef tFar = mini(mini(tfar_x, tfar_y), mini(tfar_z, tfar));
+	const sseb vmask = cast(tNear) > cast(tFar);
+	int mask = (int)movemask(vmask)^0xf;
+#else
+	const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
+	const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
+	const sseb vmask = tNear <= tFar;
+	int mask = (int)movemask(vmask);
+#endif
+	*dist = tNear;
+	return mask;
+}
+
+ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *__restrict kg,
+                                                 const ssef& tnear,
+                                                 const ssef& tfar,
+#ifdef __KERNEL_AVX2__
+                                                 const sse3f& P_idir,
+#else
+                                                 const sse3f& P,
+#endif
+                                                 const sse3f& idir,
+                                                 const int near_x,
+                                                 const int near_y,
+                                                 const int near_z,
+                                                 const int far_x,
+                                                 const int far_y,
+                                                 const int far_z,
+                                                 const int nodeAddr,
+                                                 const float difl,
+                                                 ssef *__restrict dist)
+{
+	const int offset = nodeAddr*BVH_QNODE_SIZE;
+#ifdef __KERNEL_AVX2__
+	const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, P_idir.x);
+	const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, P_idir.y);
+	const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, P_idir.z);
+	const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, P_idir.x);
+	const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, P_idir.y);
+	const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, P_idir.z);
+#else
+	const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - P.x) * idir.x;
+	const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - P.y) * idir.y;
+	const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - P.z) * idir.z;
+	const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - P.x) * idir.x;
+	const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - P.y) * idir.y;
+	const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - P.z) * idir.z;
+#endif
+
+	const float round_down = 1.0f - difl;
+	const float round_up = 1.0f + difl;
+	const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
+	const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
+	const sseb vmask = round_down*tNear <= round_up*tFar;
+	*dist = tNear;
+	return (int)movemask(vmask);
+}
diff --git a/intern/cycles/kernel/geom/geom_qbvh_shadow.h b/intern/cycles/kernel/geom/geom_qbvh_shadow.h
new file mode 100644
index 00000000000..dc37e6ecfa4
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_qbvh_shadow.h
@@ -0,0 +1,403 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Template QBVH traversal function for shadow rays, where various features can
+ * be enabled/disabled, so optimized versions get compiled for each case without
+ * new features slowing things down. Returns true when all light is blocked.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect_array,
+                                             const uint max_hits,
+                                             uint *num_hits)
+{
+	/* TODO(sergey):
+	 * - Likely and unlikely for if() statements.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack (local array); slot 0 holds the terminating sentinel. */
+	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
+	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
+
+	/* Traversal variables in registers. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* Ray parameters in registers. */
+	const float tmax = ray->t;
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = tmax;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+	*num_hits = 0;
+	isect_array->t = tmax;
+
+#ifndef __KERNEL_SSE41__
+	if(!isfinite(P.x)) {
+		return false;
+	}
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+	int num_hits_in_instance = 0;
+#endif
+
+	ssef tnear(0.0f), tfar(tmax);
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+	/* Offsets to select the side that becomes the lower or upper bound. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop. */
+	do {
+		do {
+			/* Traverse internal nodes (non-negative addresses, leaves are negative). */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				ssef dist;
+				int traverseChild = qbvh_node_intersect(kg,
+				                                        tnear,
+				                                        tfar,
+#ifdef __KERNEL_AVX2__
+				                                        P_idir4,
+#else
+				                                        org,
+#endif
+				                                        idir4,
+				                                        near_x, near_y, near_z,
+				                                        far_x, far_y, far_z,
+				                                        nodeAddr,
+				                                        &dist);
+
+				if(traverseChild != 0) {
+					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+
+					/* One child is hit (mask left empty by __bscf), continue with that child. */
+					int r = __bscf(traverseChild);
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					float d0 = ((float*)&dist)[r];
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c0;
+							traversalStack[stackPtr].dist = d0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c1;
+							traversalStack[stackPtr].dist = d1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c1;
+					traversalStack[stackPtr].dist = d1;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c0;
+					traversalStack[stackPtr].dist = d0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = c2;
+						traversalStack[stackPtr].dist = d2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2]);
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c3;
+					traversalStack[stackPtr].dist = d3;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c2;
+					traversalStack[stackPtr].dist = d2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3]);
+				}
+
+				nodeAddr = traversalStack[stackPtr].addr;
+				--stackPtr;
+			}
+
+			/* If node is leaf, fetch triangle list. */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE);
+#ifdef __VISIBILITY_FLAG__
+				if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) {
+					/* Pop. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					--stackPtr;
+					continue;
+				}
+#endif
+
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+					const uint p_type = type & PRIMITIVE_ALL;
+
+					/* Pop next node up front, primitives of this leaf are tested below. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					--stackPtr;
+
+					/* Primitive intersection. */
+					while(primAddr < primAddr2) {
+						kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+
+						bool hit;
+
+						/* todo: specialized intersect functions which don't fill in
+						 * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
+						 * might give a few % performance improvement */
+
+						switch(p_type) {
+							case PRIMITIVE_TRIANGLE: {
+								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, PATH_RAY_SHADOW, object, primAddr);
+								break;
+							}
+#if BVH_FEATURE(BVH_MOTION)
+							case PRIMITIVE_MOTION_TRIANGLE: {
+								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, primAddr);
+								break;
+							}
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+							case PRIMITIVE_CURVE:
+							case PRIMITIVE_MOTION_CURVE: {
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
+									hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
+								else
+									hit = bvh_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
+								break;
+							}
+#endif
+							default: {
+								hit = false;
+								break;
+							}
+						}
+
+						/* Shadow ray early termination. */
+						if(hit) {
+							/* detect if this surface has a shader with transparent shadows */
+
+							/* todo: optimize so primitive visibility flag indicates if
+							 * the primitive has a transparent shadow shader? */
+							int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
+							int shader = 0;
+
+#ifdef __HAIR__
+							if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
+#endif
+							{
+								shader = kernel_tex_fetch(__tri_shader, prim);
+							}
+#ifdef __HAIR__
+							else {
+								float4 str = kernel_tex_fetch(__curves, prim);
+								shader = __float_as_int(str.z);
+							}
+#endif
+							int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*2);
+
+							/* if no transparent shadows, all light is blocked */
+							if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+								return true;
+							}
+							/* if maximum number of hits reached, block all light */
+							else if(*num_hits == max_hits) {
+								return true;
+							}
+
+							/* move on to next entry in intersections array */
+							isect_array++;
+							(*num_hits)++;
+#if BVH_FEATURE(BVH_INSTANCING)
+							num_hits_in_instance++;
+#endif
+
+							isect_array->t = isect_t;
+						}
+
+						primAddr++;
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* Instance push. */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+
+#if BVH_FEATURE(BVH_MOTION)
+					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
+#else
+					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+#endif
+
+					num_hits_in_instance = 0;
+					isect_array->t = isect_t;
+
+					if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+					if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+					if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+					tfar = ssef(isect_t);
+					idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+					P_idir = P*idir;
+					P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+					org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+					triangle_intersect_precalc(dir, &isect_precalc);
+
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
+
+					nodeAddr = kernel_tex_fetch(__object_node, object);
+
+				}
+			}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			if(num_hits_in_instance) {
+				float t_fac;
+
+#if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_tfm);
+#else
+				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+#endif
+
+				/* scale t of the hits recorded inside the instance to adjust for instancing */
+				for(int i = 0; i < num_hits_in_instance; i++)
+					(isect_array-i-1)->t *= t_fac;
+			}
+			else {
+				float ignore_t = FLT_MAX;
+
+#if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_tfm);
+#else
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+#endif
+			}
+
+			isect_t = tmax;
+			isect_array->t = isect_t;
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			tfar = ssef(tmax);
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+			org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr].addr;
+			--stackPtr;
+		}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return false;
+}
diff --git a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h
new file mode 100644
index 00000000000..d85e1a4691e
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h
@@ -0,0 +1,326 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Template QBVH traversal for subsurface scattering, where features can be
+ * enabled/disabled so each case compiles to an optimized version without new
+ * features slowing it down. Only intersects primitives of subsurface_object.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect_array,
+                                             int subsurface_object,
+                                             uint *lcg_state,
+                                             int max_hits)
+{
+	/* TODO(sergey):
+	 * - Test if pushing distance on the stack helps (for non shadow rays).
+	 * - Separate version for shadow rays.
+	 * - Likely and unlikely for if() statements.
+	 * - SSE for hair.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack (local array); slot 0 holds the terminating sentinel. */
+	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
+	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
+
+	/* Traversal variables in registers. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* Ray parameters in registers. */
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = ray->t;
+	uint num_hits = 0;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+#ifndef __KERNEL_SSE41__
+	if(!isfinite(P.x)) {
+		return 0;
+	}
+#endif
+
+	ssef tnear(0.0f), tfar(isect_t);
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+	/* Offsets to select the side that becomes the lower or upper bound. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop. */
+	do {
+		do {
+			/* Traverse internal nodes (non-negative addresses, leaves are negative). */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				ssef dist;
+				int traverseChild = qbvh_node_intersect(kg,
+				                                        tnear,
+				                                        tfar,
+#ifdef __KERNEL_AVX2__
+				                                        P_idir4,
+#else
+				                                        org,
+#endif
+				                                        idir4,
+				                                        near_x, near_y, near_z,
+				                                        far_x, far_y, far_z,
+				                                        nodeAddr,
+				                                        &dist);
+
+				if(traverseChild != 0) {
+					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+
+					/* One child is hit (mask left empty by __bscf), continue with that child. */
+					int r = __bscf(traverseChild);
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					float d0 = ((float*)&dist)[r];
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c0;
+							traversalStack[stackPtr].dist = d0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c1;
+							traversalStack[stackPtr].dist = d1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c1;
+					traversalStack[stackPtr].dist = d1;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c0;
+					traversalStack[stackPtr].dist = d0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = c2;
+						traversalStack[stackPtr].dist = d2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2]);
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c3;
+					traversalStack[stackPtr].dist = d3;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c2;
+					traversalStack[stackPtr].dist = d2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3]);
+				}
+
+				nodeAddr = traversalStack[stackPtr].addr;
+				--stackPtr;
+			}
+
+			/* If node is leaf, fetch triangle list. */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE);
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+
+					/* Pop next node up front, primitives of this leaf are tested below. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					--stackPtr;
+
+					/* Primitive intersection. */
+					switch(type & PRIMITIVE_ALL) {
+						case PRIMITIVE_TRIANGLE: {
+							/* Intersect ray against primitive. */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Only primitives from the same object. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								if(tri_object != subsurface_object) {
+									continue;
+								}
+								triangle_intersect_subsurface(kg, &isect_precalc, isect_array, P, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							/* Intersect ray against primitive. */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Only primitives from the same object. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								if(tri_object != subsurface_object) {
+									continue;
+								}
+								motion_triangle_intersect_subsurface(kg, isect_array, P, dir, ray->time, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
+							}
+							break;
+						}
+#endif
+						default:
+							break;
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* Instance push, only into the subsurface object; others are skipped. */
+					if(subsurface_object == kernel_tex_fetch(__prim_object, -primAddr-1)) {
+						object = subsurface_object;
+
+#if BVH_FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
+#else
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+#endif
+
+						if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+						if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+						if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+						tfar = ssef(isect_t);
+						idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+						P_idir = P*idir;
+						P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+						org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+						triangle_intersect_precalc(dir, &isect_precalc);
+
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* Pop. */
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+					}
+
+				}
+			}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			/* Instance pop. */
+#if BVH_FEATURE(BVH_MOTION)
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
+#else
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect_t);
+#endif
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			tfar = ssef(isect_t);
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+			org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr].addr;
+			--stackPtr;
+		}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return num_hits;
+}
diff --git a/intern/cycles/kernel/geom/geom_qbvh_traversal.h b/intern/cycles/kernel/geom/geom_qbvh_traversal.h
new file mode 100644
index 00000000000..7e356ea062b
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_qbvh_traversal.h
@@ -0,0 +1,425 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function, where various features can be
+ * enabled/disabled. This way we can compile optimized versions for each case
+ * without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect,
+                                             const uint visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+                                             ,uint *lcg_state,
+                                             float difl,
+                                             float extmax
+#endif
+                                             )
+{
+	/* TODO(sergey):
+	 * - Test if pushing distance on the stack helps (for non shadow rays).
+	 * - Separate version for shadow rays.
+	 * - Likely and unlikely for if() statements.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack in CUDA thread-local memory. */
+	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
+	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
+	traversalStack[0].dist = -FLT_MAX;
+
+	/* Traversal variables in registers. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+	float nodeDist = -FLT_MAX;
+
+	/* Ray parameters in registers. */
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+#ifndef __KERNEL_SSE41__
+	if(!isfinite(P.x)) {
+		return false;
+	}
+#endif
+
+	isect->t = ray->t;
+	isect->u = 0.0f;
+	isect->v = 0.0f;
+	isect->prim = PRIM_NONE;
+	isect->object = OBJECT_NONE;
+
+#if defined(__KERNEL_DEBUG__)
+	isect->num_traversal_steps = 0;
+	isect->num_traversed_instances = 0;
+#endif
+
+	ssef tnear(0.0f), tfar(ray->t);
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+	/* Offsets to select the side that becomes the lower or upper bound. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop. */
+	do {
+		do {
+			/* Traverse internal nodes. */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				if(UNLIKELY(nodeDist > isect->t)) {
+					/* Pop. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					nodeDist = traversalStack[stackPtr].dist;
+					--stackPtr;
+					continue;
+				}
+
+				int traverseChild;
+				ssef dist;
+
+#if defined(__KERNEL_DEBUG__)
+				isect->num_traversal_steps++;
+#endif
+
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+				if(difl != 0.0f) {
+					/* NOTE: We extend all the child BB instead of fetching
+					 * and checking visibility flags for each of the,
+					 *
+					 * Need to test if doing opposite would be any faster.
+					 */
+					traverseChild = qbvh_node_intersect_robust(kg,
+					                                           tnear,
+					                                           tfar,
+#ifdef __KERNEL_AVX2__
+					                                           P_idir4,
+#else
+					                                           org,
+#endif
+					                                           idir4,
+					                                           near_x, near_y, near_z,
+					                                           far_x, far_y, far_z,
+					                                           nodeAddr,
+					                                           difl,
+					                                           &dist);
+				}
+				else
+#endif
+				{
+					traverseChild = qbvh_node_intersect(kg,
+					                                    tnear,
+					                                    tfar,
+#ifdef __KERNEL_AVX2__
+					                                    P_idir4,
+#else
+					                                    org,
+#endif
+					                                    idir4,
+					                                    near_x, near_y, near_z,
+					                                    far_x, far_y, far_z,
+					                                    nodeAddr,
+					                                    &dist);
+				}
+
+				if(traverseChild != 0) {
+					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+
+					/* One child is hit, continue with that child. */
+					int r = __bscf(traverseChild);
+					float d0 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						nodeDist = d0;
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							nodeDist = d1;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c0;
+							traversalStack[stackPtr].dist = d0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							nodeDist = d0;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c1;
+							traversalStack[stackPtr].dist = d1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c1;
+					traversalStack[stackPtr].dist = d1;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c0;
+					traversalStack[stackPtr].dist = d0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = c2;
+						traversalStack[stackPtr].dist = d2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2]);
+						nodeAddr = traversalStack[stackPtr].addr;
+						nodeDist = traversalStack[stackPtr].dist;
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c3;
+					traversalStack[stackPtr].dist = d3;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c2;
+					traversalStack[stackPtr].dist = d2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3]);
+				}
+
+				nodeAddr = traversalStack[stackPtr].addr;
+				nodeDist = traversalStack[stackPtr].dist;
+				--stackPtr;
+			}
+
+			/* If node is leaf, fetch triangle list. */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE);
+
+#ifdef __VISIBILITY_FLAG__
+				if(UNLIKELY((nodeDist > isect->t) || ((__float_as_uint(leaf.z) & visibility) == 0)))
+#else
+				if(UNLIKELY((nodeDist > isect->t)))
+#endif
+				{
+					/* Pop. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					nodeDist = traversalStack[stackPtr].dist;
+					--stackPtr;
+					continue;
+				}
+
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+
+					/* Pop. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					nodeDist = traversalStack[stackPtr].dist;
+					--stackPtr;
+
+					/* Primitive intersection. */
+					switch(type & PRIMITIVE_ALL) {
+						case PRIMITIVE_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+#if defined(__KERNEL_DEBUG__)
+								isect->num_traversal_steps++;
+#endif
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) {
+									tfar = ssef(isect->t);
+									/* Shadow ray early termination. */
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+								}
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+#if defined(__KERNEL_DEBUG__)
+								isect->num_traversal_steps++;
+#endif
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								if(motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr)) {
+									tfar = ssef(isect->t);
+									/* Shadow ray early termination. */
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+								}
+							}
+							break;
+						}
+#endif  /* BVH_FEATURE(BVH_MOTION) */
+#if BVH_FEATURE(BVH_HAIR)
+						case PRIMITIVE_CURVE:
+						case PRIMITIVE_MOTION_CURVE: {
+							for(; primAddr < primAddr2; primAddr++) {
+#if defined(__KERNEL_DEBUG__)
+								isect->num_traversal_steps++;
+#endif
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								bool hit;
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
+									hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
+								else
+									hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
+								if(hit) {
+									tfar = ssef(isect->t);
+									/* Shadow ray early termination. */
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+								}
+							}
+							break;
+						}
+#endif  /* BVH_FEATURE(BVH_HAIR) */
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* Instance push. */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+
+#if BVH_FEATURE(BVH_MOTION)
+					qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &nodeDist, &ob_tfm);
+#else
+					qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &nodeDist);
+#endif
+
+					if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+					if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+					if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+					tfar = ssef(isect->t);
+					idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+					P_idir = P*idir;
+					P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+					org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+					triangle_intersect_precalc(dir, &isect_precalc);
+
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
+					traversalStack[stackPtr].dist = -FLT_MAX;
+
+					nodeAddr = kernel_tex_fetch(__object_node, object);
+
+#if defined(__KERNEL_DEBUG__)
+					isect->num_traversed_instances++;
+#endif
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			/* Instance pop. */
+#if BVH_FEATURE(BVH_MOTION)
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
+#else
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+#endif
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			tfar = ssef(isect->t);
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+			org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr].addr;
+			nodeDist = traversalStack[stackPtr].dist;
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return (isect->prim != PRIM_NONE);
+}
diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume.h b/intern/cycles/kernel/geom/geom_qbvh_volume.h
new file mode 100644
index 00000000000..d8cfa3a4061
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_qbvh_volume.h
@@ -0,0 +1,351 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect)
+{
+	/* TODO(sergey):
+	 * - Test if pushing distance on the stack helps.
+	 * - Likely and unlikely for if() statements.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack; slot 0 holds the sentinel that terminates the loops. */
+	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
+	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
+
+	/* Traversal variables in registers. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* Ray parameters in registers. */
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+
+	const uint visibility = PATH_RAY_ALL_VISIBILITY;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+#ifndef __KERNEL_SSE41__
+	if(!isfinite(P.x)) {
+		return false;
+	}
+#endif
+
+	isect->t = ray->t;
+	isect->u = 0.0f;
+	isect->v = 0.0f;
+	isect->prim = PRIM_NONE;
+	isect->object = OBJECT_NONE;
+
+	ssef tnear(0.0f), tfar(ray->t);
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+	/* Offsets to select the side that becomes the lower or upper bound. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop. */
+	do {
+		do {
+			/* Traverse internal nodes (non-negative addresses other than the sentinel). */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				ssef dist;
+				int traverseChild = qbvh_node_intersect(kg,
+				                                        tnear,
+				                                        tfar,
+#ifdef __KERNEL_AVX2__
+				                                        P_idir4,
+#else
+				                                        org,
+#endif
+				                                        idir4,
+				                                        near_x, near_y, near_z,
+				                                        far_x, far_y, far_z,
+				                                        nodeAddr,
+				                                        &dist);
+
+				if(traverseChild != 0) {
+					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+
+					/* One child is hit, continue with that child. */
+					int r = __bscf(traverseChild);
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					float d0 = ((float*)&dist)[r];
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c0;
+							traversalStack[stackPtr].dist = d0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c1;
+							traversalStack[stackPtr].dist = d1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c1;
+					traversalStack[stackPtr].dist = d1;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c0;
+					traversalStack[stackPtr].dist = d0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = c2;
+						traversalStack[stackPtr].dist = d2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2]);
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c3;
+					traversalStack[stackPtr].dist = d3;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c2;
+					traversalStack[stackPtr].dist = d2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3]);
+				}
+
+				nodeAddr = traversalStack[stackPtr].addr;
+				--stackPtr;
+			}
+
+			/* A negative address is a leaf: fetch its primitive range or instance. */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE);
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+					const uint p_type = type & PRIMITIVE_ALL;
+
+					/* Pop the next node before testing this leaf's primitives. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					--stackPtr;
+
+					/* Primitive intersection. */
+					switch(p_type) {
+						case PRIMITIVE_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Skip primitives whose object lacks SD_OBJECT_HAS_VOLUME. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								/* Intersect ray against primitive. */
+								triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr);
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Skip primitives whose object lacks SD_OBJECT_HAS_VOLUME. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								/* Intersect ray against primitive. */
+								motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr);
+							}
+							break;
+						}
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+						case PRIMITIVE_CURVE:
+						case PRIMITIVE_MOTION_CURVE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Skip primitives whose object lacks SD_OBJECT_HAS_VOLUME. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								/* Intersect ray against primitive. */
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
+									bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								else
+									bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+							}
+							break;
+						}
+#endif
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* Instance push, only for objects flagged SD_OBJECT_HAS_VOLUME. */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+					int object_flag = kernel_tex_fetch(__object_flag, object);
+
+					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
+#if BVH_FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
+#else
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+#endif
+
+						if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+						if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+						if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+						tfar = ssef(isect->t);
+						idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+						P_idir = P*idir;
+						P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+						org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+						triangle_intersect_precalc(dir, &isect_precalc);
+
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* Not a volume object: skip the instance and pop. */
+						object = OBJECT_NONE;
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+					}
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			/* Instance pop, then refresh the values derived from P, dir and idir. */
+#if BVH_FEATURE(BVH_MOTION)
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
+#else
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+#endif
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			tfar = ssef(isect->t);
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+			org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr].addr;
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return (isect->prim != PRIM_NONE);
+}
diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume_all.h b/intern/cycles/kernel/geom/geom_qbvh_volume_all.h
new file mode 100644
index 00000000000..056ca9a1ad9
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_qbvh_volume_all.h
@@ -0,0 +1,446 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template QBVH traversal function for volumes that records all
+ * hits along the ray (up to max_hits) and returns how many were found. Various
+ * features can be enabled/disabled, so optimized versions can be compiled for
+ * each case without new features slowing things down.
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect_array,
+                                             const uint max_hits)
+{
+	/* TODO(sergey):
+	 * - Test if pushing distance on the stack helps.
+	 * - Likely and unlikely for if() statements.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack; slot 0 holds the sentinel that terminates the loops. */
+	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
+	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
+
+	/* Traversal variables in registers. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* Ray parameters in registers. */
+	const float tmax = ray->t;
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = tmax;
+
+	const uint visibility = PATH_RAY_ALL_VISIBILITY;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+	uint num_hits = 0;
+	isect_array->t = tmax;
+
+#ifndef __KERNEL_SSE41__
+	if(!isfinite(P.x)) {
+		return 0;  /* No hits; the return value is a hit count, not a bool. */
+	}
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+	int num_hits_in_instance = 0;
+#endif
+
+	ssef tnear(0.0f), tfar(isect_t);
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+	/* Offsets to select the side that becomes the lower or upper bound. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop. */
+	do {
+		do {
+			/* Traverse internal nodes (non-negative addresses other than the sentinel). */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				ssef dist;
+				int traverseChild = qbvh_node_intersect(kg,
+				                                        tnear,
+				                                        tfar,
+#ifdef __KERNEL_AVX2__
+				                                        P_idir4,
+#else
+				                                        org,
+#endif
+				                                        idir4,
+				                                        near_x, near_y, near_z,
+				                                        far_x, far_y, far_z,
+				                                        nodeAddr,
+				                                        &dist);
+
+				if(traverseChild != 0) {
+					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+
+					/* One child is hit, continue with that child. */
+					int r = __bscf(traverseChild);
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					float d0 = ((float*)&dist)[r];
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c0;
+							traversalStack[stackPtr].dist = d0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c1;
+							traversalStack[stackPtr].dist = d1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c1;
+					traversalStack[stackPtr].dist = d1;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c0;
+					traversalStack[stackPtr].dist = d0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = c2;
+						traversalStack[stackPtr].dist = d2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2]);
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c3;
+					traversalStack[stackPtr].dist = d3;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c2;
+					traversalStack[stackPtr].dist = d2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3]);
+				}
+
+				nodeAddr = traversalStack[stackPtr].addr;
+				--stackPtr;
+			}
+
+			/* A negative address is a leaf: fetch its primitive range or instance. */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE);
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+					const uint p_type = type & PRIMITIVE_ALL;
+					bool hit;
+
+					/* Pop the next node before testing this leaf's primitives. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					--stackPtr;
+
+					/* Primitive intersection. */
+					switch(p_type) {
+						case PRIMITIVE_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Skip primitives whose object lacks SD_OBJECT_HAS_VOLUME. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								/* Intersect ray against primitive. */
+								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, primAddr);
+								if(hit) {
+									/* Move on to next entry in intersections array. */
+									isect_array++;
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+#  if BVH_FEATURE(BVH_MOTION)
+										float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir));
+#  else
+										Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+										float t_fac = len(transform_direction(&tfm, 1.0f/idir));
+#  endif
+										for(int i = 0; i < num_hits_in_instance; i++) {  /* NOTE(review): this rescale has no object != OBJECT_NONE guard -- confirm. */
+											(isect_array-i-1)->t *= t_fac;
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+									isect_array->t = isect_t;  /* Seed the next slot; skipped once max_hits is reached. */
+								}
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Skip primitives whose object lacks SD_OBJECT_HAS_VOLUME. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								/* Intersect ray against primitive. */
+								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, primAddr);
+								if(hit) {
+									/* Move on to next entry in intersections array. */
+									isect_array++;
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+#  if BVH_FEATURE(BVH_MOTION)
+										float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir));
+#  else
+										Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+										float t_fac = len(transform_direction(&tfm, 1.0f/idir));
+#  endif
+										for(int i = 0; i < num_hits_in_instance; i++) {
+											(isect_array-i-1)->t *= t_fac;
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+									isect_array->t = isect_t;  /* Seed the next slot; skipped once max_hits is reached. */
+								}
+							}
+							break;
+						}
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+						case PRIMITIVE_CURVE:
+						case PRIMITIVE_MOTION_CURVE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Skip primitives whose object lacks SD_OBJECT_HAS_VOLUME. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								/* Intersect ray against primitive. */
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
+									hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								else
+									hit = bvh_curve_intersect(kg, isect_array, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								if(hit) {
+									/* Move on to next entry in intersections array. */
+									isect_array++;
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+#  if BVH_FEATURE(BVH_MOTION)
+										float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir));
+#  else
+										Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+										float t_fac = len(transform_direction(&tfm, 1.0f/idir));
+#  endif
+										for(int i = 0; i < num_hits_in_instance; i++) {
+											(isect_array-i-1)->t *= t_fac;
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+									isect_array->t = isect_t;  /* Seed the next slot; skipped once max_hits is reached. */
+								}
+							}
+							break;
+						}
+#endif
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* Instance push, only for objects flagged SD_OBJECT_HAS_VOLUME. */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+					int object_flag = kernel_tex_fetch(__object_flag, object);
+
+					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
+#if BVH_FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
+#else
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+#endif
+
+						if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+						if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+						if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+						tfar = ssef(isect_t);
+						idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+						P_idir = P*idir;
+						P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+						org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+						triangle_intersect_precalc(dir, &isect_precalc);
+						num_hits_in_instance = 0;
+						isect_array->t = isect_t;
+
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* Not a volume object: skip the instance and pop. */
+						object = OBJECT_NONE;
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+					}
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			/* Instance pop. */
+			if(num_hits_in_instance) {
+				float t_fac;
+#if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_tfm);
+#else
+				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+#endif
+				triangle_intersect_precalc(dir, &isect_precalc);
+				/* Scale the t of every hit recorded inside this instance by t_fac. */
+				for(int i = 0; i < num_hits_in_instance; i++) {
+					(isect_array-i-1)->t *= t_fac;
+				}
+			}
+			else {
+				float ignore_t = FLT_MAX;
+#if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_tfm);
+#else
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+#endif
+				triangle_intersect_precalc(dir, &isect_precalc);
+			}
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			isect_t = tmax;  /* Reset before tfar: isect_t was handed to the instance push for adjustment. */
+			isect_array->t = isect_t;
+			tfar = ssef(isect_t);
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+			org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr].addr;
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return num_hits;
+}
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index 3d3a5e72485..995dfac5b09 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -23,111 +23,18 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Refine triangle intersection to more precise hit point. For rays that travel
- * far the precision is often not so good, this reintersects the primitive from
- * a closer distance. */
-
-ccl_device_inline float3 triangle_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
-{
-	float3 P = ray->P;
-	float3 D = ray->D;
-	float t = isect->t;
-
-#ifdef __INTERSECTION_REFINE__
-	if(isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D*t);
-		D = normalize_len(D, &t);
-	}
-
-	P = P + D*t;
-
-	float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0);
-	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
-	float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z);
-	float rt = Oz * invDz;
-
-	P = P + D*rt;
-
-	if(isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-#else
-	return P + D*t;
-#endif
-}
-
-/* same as above, except that isect->t is assumed to be in object space for instancing */
-ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
-{
-	float3 P = ray->P;
-	float3 D = ray->D;
-	float t = isect->t;
-
-#ifdef __INTERSECTION_REFINE__
-	if(isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D);
-		D = normalize(D);
-	}
-
-	P = P + D*t;
-
-	float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0);
-	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
-	float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z);
-	float rt = Oz * invDz;
-
-	P = P + D*rt;
-
-	if(isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-#else
-	return P + D*t;
-#endif
-}
-
 /* normal on triangle  */
 ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
 {
 	/* load triangle vertices */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
 
 	float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
 	float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
 	float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
 	
 	/* return normal */
-	if(sd->flag & SD_NEGATIVE_SCALE_APPLIED)
+	if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED)
 		return normalize(cross(v2 - v0, v1 - v0));
 	else
 		return normalize(cross(v1 - v0, v2 - v0));
@@ -137,7 +44,7 @@ ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
 ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int object, int prim, float u, float v, float3 *P, float3 *Ng, int *shader)
 {
 	/* load triangle vertices */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
 
 	float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
 	float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
@@ -164,7 +71,7 @@ ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int object, int
 
 ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3])
 {
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
 
 	P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
 	P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
@@ -176,7 +83,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
 ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v)
 {
 	/* load triangle vertices */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
 
 	float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.x)));
 	float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.y)));
@@ -187,10 +94,10 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo
 
 /* Ray differentials on triangle */
 
-ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, float3 *dPdu, float3 *dPdv)
+ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, ccl_addr_space float3 *dPdu, ccl_addr_space float3 *dPdv)
 {
 	/* fetch triangle vertex coordinates */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
 
 	float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
 	float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
@@ -209,34 +116,34 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s
 		if(dx) *dx = 0.0f;
 		if(dy) *dy = 0.0f;
 
-		return kernel_tex_fetch(__attributes_float, offset + sd->prim);
+		return kernel_tex_fetch(__attributes_float, offset + ccl_fetch(sd, prim));
 	}
 	else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
-		float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+		float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
 
 		float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x));
 		float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y));
 		float f2 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.z));
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif
 
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
 	}
 	else if(elem == ATTR_ELEMENT_CORNER) {
-		int tri = offset + sd->prim*3;
+		int tri = offset + ccl_fetch(sd, prim)*3;
 		float f0 = kernel_tex_fetch(__attributes_float, tri + 0);
 		float f1 = kernel_tex_fetch(__attributes_float, tri + 1);
 		float f2 = kernel_tex_fetch(__attributes_float, tri + 2);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif
 
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
 	}
 	else {
 		if(dx) *dx = 0.0f;
@@ -252,24 +159,24 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + ccl_fetch(sd, prim)));
 	}
 	else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
-		float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+		float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
 
 		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x)));
 		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y)));
 		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z)));
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif
 
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
 	}
 	else if(elem == ATTR_ELEMENT_CORNER || elem == ATTR_ELEMENT_CORNER_BYTE) {
-		int tri = offset + sd->prim*3;
+		int tri = offset + ccl_fetch(sd, prim)*3;
 		float3 f0, f1, f2;
 
 		if(elem == ATTR_ELEMENT_CORNER) {
@@ -284,11 +191,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 		}
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif
 
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
 	}
 	else {
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
@@ -298,116 +205,4 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 	}
 }
 
-/* Ray-Triangle intersection for BVH traversal
- *
- * Based on Sven Woop's algorithm with precomputed triangle storage */
-
-ccl_device_inline bool triangle_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 dir, uint visibility, int object, int triAddr)
-{
-	/* compute and check intersection t-value */
-	float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0);
-	float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1);
-
-	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
-	float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z);
-	float t = Oz * invDz;
-
-	if(t > 0.0f && t < isect->t) {
-		/* compute and check barycentric u */
-		float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z;
-		float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z;
-		float u = Ox + t*Dx;
-
-		if(u >= 0.0f) {
-			/* compute and check barycentric v */
-			float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
-			float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z;
-			float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z;
-			float v = Oy + t*Dy;
-
-			if(v >= 0.0f && u + v <= 1.0f) {
-#ifdef __VISIBILITY_FLAG__
-				/* visibility flag test. we do it here under the assumption
-				 * that most triangles are culled by node flags */
-				if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
-#endif
-				{
-					/* record intersection */
-					isect->prim = triAddr;
-					isect->object = object;
-					isect->type = PRIMITIVE_TRIANGLE;
-					isect->u = u;
-					isect->v = v;
-					isect->t = t;
-					return true;
-				}
-			}
-		}
-	}
-
-	return false;
-}
-
-/* Special ray intersection routines for subsurface scattering. In that case we
- * only want to intersect with primitives in the same object, and if case of
- * multiple hits we pick a single random primitive as the intersection point. */
-
-#ifdef __SUBSURFACE__
-ccl_device_inline void triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect_array,
-	float3 P, float3 dir, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits)
-{
-	/* compute and check intersection t-value */
-	float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0);
-	float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1);
-
-	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
-	float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z);
-	float t = Oz * invDz;
-
-	if(t > 0.0f && t < tmax) {
-		/* compute and check barycentric u */
-		float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z;
-		float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z;
-		float u = Ox + t*Dx;
-
-		if(u >= 0.0f) {
-			/* compute and check barycentric v */
-			float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
-			float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z;
-			float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z;
-			float v = Oy + t*Dy;
-
-			if(v >= 0.0f && u + v <= 1.0f) {
-				(*num_hits)++;
-
-				int hit;
-
-				if(*num_hits <= max_hits) {
-					hit = *num_hits - 1;
-				}
-				else {
-					/* reservoir sampling: if we are at the maximum number of
-					 * hits, randomly replace element or skip it */
-					hit = lcg_step_uint(lcg_state) % *num_hits;
-
-					if(hit >= max_hits)
-						return;
-				}
-
-				/* record intersection */
-				Intersection *isect = &isect_array[hit];
-				isect->prim = triAddr;
-				isect->object = object;
-				isect->type = PRIMITIVE_TRIANGLE;
-				isect->u = u;
-				isect->v = v;
-				isect->t = t;
-			}
-		}
-	}
-}
-#endif
-
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
new file mode 100644
index 00000000000..ba309a1dc53
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -0,0 +1,431 @@
+/*
+ * Copyright 2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Triangle/Ray intersections.
+ *
+ * For BVH ray intersection we use a precomputed triangle storage to accelerate
+ * intersection at the cost of more memory usage.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Work around a CUDA/OpenCL limitation: components of a float3 value cannot
+ * be accessed by index there, so the component is selected explicitly.
+ */
+#ifndef __KERNEL_CPU__
+#  define IDX(vec, idx) \
+    ((idx == 0) ? ((vec).x) : ( (idx == 1) ? ((vec).y) : ((vec).z) ))
+#else
+#  define IDX(vec, idx) ((vec)[idx])
+#endif
+
+/* Ray-Triangle intersection for BVH traversal
+ *
+ * Sven Woop
+ * Watertight Ray/Triangle Intersection
+ *
+ * http://jcgt.org/published/0002/01/05/paper.pdf
+ */
+
+/* Precalculated data for the ray->tri intersection. */
+typedef struct IsectPrecalc {
+	/* Maximal dimension kz, and orthogonal dimensions. */
+	int kx, ky, kz;
+
+	/* Shear constants. */
+	float Sx, Sy, Sz;
+} IsectPrecalc;
+
+#if defined(__KERNEL_CUDA__)
+#  if (defined(i386) || defined(_M_IX86))
+#    if __CUDA_ARCH__ > 500
+ccl_device_noinline
+#    else  /* __CUDA_ARCH__ > 500 */
+ccl_device_inline
+#    endif  /* __CUDA_ARCH__ > 500 */
+#  else  /* (defined(i386) || defined(_M_IX86)) */
+#    if defined(__KERNEL_EXPERIMENTAL__) && (__CUDA_ARCH__ >= 500)
+ccl_device_noinline
+#    else
+ccl_device_inline
+#    endif
+#  endif  /* (defined(i386) || defined(_M_IX86)) */
+#elif defined(__KERNEL_OPENCL_APPLE__)
+ccl_device_noinline
+#else  /* defined(__KERNEL_OPENCL_APPLE__) */
+ccl_device_inline
+#endif  /* defined(__KERNEL_OPENCL_APPLE__) */
+void triangle_intersect_precalc(float3 dir,
+                                IsectPrecalc *isect_precalc)
+{
+	/* Calculate dimension where the ray direction is maximal. */
+	int kz = util_max_axis(make_float3(fabsf(dir.x),
+	                                   fabsf(dir.y),
+	                                   fabsf(dir.z)));
+	int kx = kz + 1; if(kx == 3) kx = 0;
+	int ky = kx + 1; if(ky == 3) ky = 0;
+
+	/* Swap kx and ky dimensions to preserve winding direction of triangles. */
+	if(IDX(dir, kz) < 0.0f) {
+		int tmp = kx;
+		kx = ky;
+		ky = tmp;
+	}
+
+	/* Calculate the shear constants. */
+	float inv_dir_z = 1.0f / IDX(dir, kz);
+	isect_precalc->Sx = IDX(dir, kx) * inv_dir_z;
+	isect_precalc->Sy = IDX(dir, ky) * inv_dir_z;
+	isect_precalc->Sz = inv_dir_z;
+
+	/* Store the dimensions. */
+	isect_precalc->kx = kx;
+	isect_precalc->ky = ky;
+	isect_precalc->kz = kz;
+}
+
+/* TODO(sergey): Make it general utility function. */
+ccl_device_inline float xor_signmask(float x, int y)
+{
+	return __int_as_float(__float_as_int(x) ^ y);
+}
+
+ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
+                                          const IsectPrecalc *isect_precalc,
+                                          Intersection *isect,
+                                          float3 P,
+                                          uint visibility,
+                                          int object,
+                                          int triAddr)
+{
+	const int kx = isect_precalc->kx;
+	const int ky = isect_precalc->ky;
+	const int kz = isect_precalc->kz;
+	const float Sx = isect_precalc->Sx;
+	const float Sy = isect_precalc->Sy;
+	const float Sz = isect_precalc->Sz;
+
+	/* Calculate vertices relative to ray origin. */
+	const float4 tri_a = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0),
+	             tri_b = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1),
+	             tri_c = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
+	const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z);
+	const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z);
+	const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z);
+
+	const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz);
+	const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz);
+	const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz);
+
+	/* Perform shear and scale of vertices. */
+	const float Ax = A_kx - Sx * A_kz;
+	const float Ay = A_ky - Sy * A_kz;
+	const float Bx = B_kx - Sx * B_kz;
+	const float By = B_ky - Sy * B_kz;
+	const float Cx = C_kx - Sx * C_kz;
+	const float Cy = C_ky - Sy * C_kz;
+
+	/* Calculate scaled barycentric coordinates. */
+	float U = Cx * By - Cy * Bx;
+	float V = Ax * Cy - Ay * Cx;
+	float W = Bx * Ay - By * Ax;
+	const int sign_mask = (__float_as_int(U) & 0x80000000);
+	/* TODO(sergey): Check if multiplication plus sign check is faster
+	 * or at least the same speed (while being robust across endianness).
+	 */
+	if(sign_mask != (__float_as_int(V) & 0x80000000) ||
+	   sign_mask != (__float_as_int(W) & 0x80000000))
+	{
+		return false;
+	}
+
+	/* Calculate determinant. */
+	float det = U + V + W;
+	if(UNLIKELY(det == 0.0f)) {
+		return false;
+	}
+
+	/* Calculate scaled z-coordinates of vertices and use them to calculate
+	 * the hit distance.
+	 */
+	const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz;
+	const float sign_T = xor_signmask(T, sign_mask);
+	if((sign_T < 0.0f) ||
+	   (sign_T > isect->t * xor_signmask(det, sign_mask)))
+	{
+		return false;
+	}
+
+#ifdef __VISIBILITY_FLAG__
+	/* visibility flag test. we do it here under the assumption
+	 * that most triangles are culled by node flags */
+	if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
+#endif
+	{
+#ifdef __KERNEL_GPU__
+		float4 a = tri_b - tri_a, b = tri_c - tri_a;
+		if(len_squared(make_float3(a.y*b.z - a.z*b.y,
+		                           a.z*b.x - a.x*b.z,
+		                           a.x*b.y - a.y*b.x)) == 0.0f)
+		{
+			return false;
+		}
+#endif
+
+		/* Normalize U, V, W, and T. */
+		const float inv_det = 1.0f / det;
+		isect->prim = triAddr;
+		isect->object = object;
+		isect->type = PRIMITIVE_TRIANGLE;
+		isect->u = U * inv_det;
+		isect->v = V * inv_det;
+		isect->t = T * inv_det;
+		return true;
+	}
+	return false;
+}
+
+/* Special ray intersection routines for subsurface scattering. In that case we
+ * only want to intersect with primitives in the same object, and in case of
+ * multiple hits we pick a single random primitive as the intersection point.
+ */
+
+#ifdef __SUBSURFACE__
+ccl_device_inline void triangle_intersect_subsurface(
+        KernelGlobals *kg,
+        const IsectPrecalc *isect_precalc,
+        Intersection *isect_array,
+        float3 P,
+        int object,
+        int triAddr,
+        float tmax,
+        uint *num_hits,
+        uint *lcg_state,
+        int max_hits)
+{
+	const int kx = isect_precalc->kx;
+	const int ky = isect_precalc->ky;
+	const int kz = isect_precalc->kz;
+	const float Sx = isect_precalc->Sx;
+	const float Sy = isect_precalc->Sy;
+	const float Sz = isect_precalc->Sz;
+
+	/* Calculate vertices relative to ray origin. */
+	const float4 tri_a = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0),
+	             tri_b = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1),
+	             tri_c = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
+	const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z);
+	const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z);
+	const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z);
+
+	const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz);
+	const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz);
+	const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz);
+
+	/* Perform shear and scale of vertices. */
+	const float Ax = A_kx - Sx * A_kz;
+	const float Ay = A_ky - Sy * A_kz;
+	const float Bx = B_kx - Sx * B_kz;
+	const float By = B_ky - Sy * B_kz;
+	const float Cx = C_kx - Sx * C_kz;
+	const float Cy = C_ky - Sy * C_kz;
+
+	/* Calculate scaled barycentric coordinates. */
+	float U = Cx * By - Cy * Bx;
+	int sign_mask = (__float_as_int(U) & 0x80000000);
+	float V = Ax * Cy - Ay * Cx;
+	if(sign_mask != (__float_as_int(V) & 0x80000000)) {
+		return;
+	}
+	float W = Bx * Ay - By * Ax;
+	if(sign_mask != (__float_as_int(W) & 0x80000000)) {
+		return;
+	}
+
+	/* Calculate determinant. */
+	float det = U + V + W;
+	if(UNLIKELY(det == 0.0f)) {
+		return;
+	}
+
+	/* Calculate scaled z-coordinates of vertices and use them to calculate
+	 * the hit distance.
+	 */
+	const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz;
+	const float sign_T = xor_signmask(T, sign_mask);
+	if((sign_T < 0.0f) ||
+	   (sign_T > tmax * xor_signmask(det, sign_mask)))
+	{
+		return;
+	}
+
+	/* Normalize U, V, W, and T. */
+	const float inv_det = 1.0f / det;
+
+	(*num_hits)++;
+	int hit;
+
+	if(*num_hits <= max_hits) {
+		hit = *num_hits - 1;
+	}
+	else {
+		/* reservoir sampling: if we are at the maximum number of
+		 * hits, randomly replace element or skip it */
+		hit = lcg_step_uint(lcg_state) % *num_hits;
+
+		if(hit >= max_hits)
+			return;
+	}
+
+	/* record intersection */
+	Intersection *isect = &isect_array[hit];
+	isect->prim = triAddr;
+	isect->object = object;
+	isect->type = PRIMITIVE_TRIANGLE;
+	isect->u = U * inv_det;
+	isect->v = V * inv_det;
+	isect->t = T * inv_det;
+}
+#endif
+
+/* Refine triangle intersection to a more precise hit point. For rays that
+ * travel far, the precision is often poor; this reintersects the primitive
+ * from a closer distance. */
+
+/* Reintersection uses the paper:
+ *
+ * Tomas Moeller
+ * Fast, minimum storage ray/triangle intersection
+ * http://www.cs.virginia.edu/~gfx/Courses/2003/ImageSynthesis/papers/Acceleration/Fast%20MinimumStorage%20RayTriangle%20Intersection.pdf
+ */
+
+ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
+                                         ShaderData *sd,
+                                         const Intersection *isect,
+                                         const Ray *ray)
+{
+	float3 P = ray->P;
+	float3 D = ray->D;
+	float t = isect->t;
+
+#ifdef __INTERSECTION_REFINE__
+	if(isect->object != OBJECT_NONE) {
+		if(UNLIKELY(t == 0.0f)) {
+			return P;
+		}
+#ifdef __OBJECT_MOTION__
+		Transform tfm = ccl_fetch(sd, ob_itfm);
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D*t);
+		D = normalize_len(D, &t);
+	}
+
+	P = P + D*t;
+
+	const float4 tri_a = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0),
+	             tri_b = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+1),
+	             tri_c = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+2);
+	float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z);
+	float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z);
+	float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
+	float3 qvec = cross(tvec, edge1);
+	float3 pvec = cross(D, edge2);
+	float rt = dot(edge2, qvec) / dot(edge1, pvec);
+
+	P = P + D*rt;
+
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = ccl_fetch(sd, ob_tfm);
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+#else
+	return P + D*t;
+#endif
+}
+
+/* Same as above, except that isect->t is assumed to be in object space for
+ * instancing.
+ */
+ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
+                                                    ShaderData *sd,
+                                                    const Intersection *isect,
+                                                    const Ray *ray)
+{
+	float3 P = ray->P;
+	float3 D = ray->D;
+	float t = isect->t;
+
+#ifdef __INTERSECTION_REFINE__
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = ccl_fetch(sd, ob_itfm);
+#else
+		Transform tfm = object_fetch_transform(kg,
+		                                       isect->object,
+		                                       OBJECT_INVERSE_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D);
+		D = normalize(D);
+	}
+
+	P = P + D*t;
+
+	const float4 tri_a = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0),
+	             tri_b = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+1),
+	             tri_c = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+2);
+	float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z);
+	float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z);
+	float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
+	float3 qvec = cross(tvec, edge1);
+	float3 pvec = cross(D, edge2);
+	float rt = dot(edge2, qvec) / dot(edge1, pvec);
+
+	P = P + D*rt;
+
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = ccl_fetch(sd, ob_tfm);
+#else
+		Transform tfm = object_fetch_transform(kg,
+		                                       isect->object,
+		                                       OBJECT_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+#else
+	return P + D*t;
+#endif
+}
+
+#undef IDX
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 33a20494966..c72afa2a3a4 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* Volume Primitive
@@ -52,11 +52,15 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 #ifdef __KERNEL_GPU__
 	float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 #else
-	float4 r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
+	float4 r;
+	if(sd->flag & SD_VOLUME_CUBIC)
+		r = kernel_tex_image_interp_3d_ex(id, P.x, P.y, P.z, INTERPOLATION_CUBIC);
+	else
+		r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
 #endif
 
 	if(dx) *dx = 0.0f;
-	if(dx) *dy = 0.0f;
+	if(dy) *dy = 0.0f;
 
 	/* todo: support float textures to lower memory usage for single floats */
 	return average(float4_to_float3(r));
@@ -68,7 +72,11 @@ ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *s
 #ifdef __KERNEL_GPU__
 	float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 #else
-	float4 r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
+	float4 r;
+	if(sd->flag & SD_VOLUME_CUBIC)
+		r = kernel_tex_image_interp_3d_ex(id, P.x, P.y, P.z, INTERPOLATION_CUBIC);
+	else
+		r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
 #endif
 
 	if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 19e06b88797..b2596d10ee7 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __KERNEL_H__
@@ -32,7 +32,14 @@ void *kernel_osl_memory(KernelGlobals *kg);
 bool kernel_osl_use(KernelGlobals *kg);
 
 void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size);
-void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t width, size_t height, size_t depth, InterpolationType interpolation=INTERPOLATION_LINEAR);
+void kernel_tex_copy(KernelGlobals *kg,
+                     const char *name,
+                     device_ptr mem,
+                     size_t width,
+                     size_t height,
+                     size_t depth,
+                     InterpolationType interpolation=INTERPOLATION_LINEAR,
+                     ExtensionType extension = EXTENSION_REPEAT);
 
 void kernel_cpu_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
 	int sample, int x, int y, int offset, int stride);
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index b0efcdc66a7..2dc87fffcbc 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -176,7 +176,7 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
 #endif
 }
 
-ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, float3 *throughput,
+ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
 	BsdfEval *bsdf_eval, float bsdf_pdf, int bounce, int bsdf_label)
 {
 	float inverse_pdf = 1.0f/bsdf_pdf;
@@ -341,12 +341,12 @@ ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L)
 
 ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadiance *L)
 {
-	float3 L_sum, L_direct, L_indirect;
-	float clamp_direct = kernel_data.integrator.sample_clamp_direct;
-	float clamp_indirect = kernel_data.integrator.sample_clamp_indirect;
-
+	float3 L_sum;
 	/* Light Passes are used */
 #ifdef __PASSES__
+	float3 L_direct, L_indirect;
+	float clamp_direct = kernel_data.integrator.sample_clamp_direct;
+	float clamp_indirect = kernel_data.integrator.sample_clamp_indirect;
 	if(L->use_light_pass) {
 		path_radiance_sum_indirect(L);
 
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index a1ec080e3d3..2b305e5488d 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -11,11 +11,13 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
 
+#undef USE_BAKE_JITTER
+
 ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, RNG rng,
                                    const bool is_combined, const bool is_ao, const bool is_sss, int sample)
 {
@@ -29,6 +31,13 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
 	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
 	bool is_sss_sample = is_sss;
 
+	ray.P = sd->P + sd->Ng;
+	ray.D = -sd->Ng;
+	ray.t = FLT_MAX;
+#ifdef __CAMERA_MOTION__
+	ray.time = TIME_INVALID;
+#endif
+
 	/* init radiance */
 	path_radiance_init(&L_sample, kernel_data.film.use_light_pass);
 
@@ -55,7 +64,7 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
 		/* sample subsurface scattering */
 		if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
 			/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
-			if (kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput))
+			if(kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput))
 				is_sss_sample = true;
 		}
 #endif
@@ -159,7 +168,8 @@ ccl_device bool is_light_pass(ShaderEvalType type)
 	}
 }
 
-#if 0
+/* This helps with anti-aliasing, but it is not the real solution because it does
+ * not anti-alias the geometry; it is better than nothing, so it is enabled. */
 ccl_device_inline float bake_clamp_mirror_repeat(float u)
 {
 	/* use mirror repeat (like opengl texture) so that if the barycentric
@@ -170,7 +180,6 @@ ccl_device_inline float bake_clamp_mirror_repeat(float u)
 
 	return (((int)fu) & 1)? 1.0f - u: u;
 }
-#endif
 
 ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output,
                                      ShaderEvalType type, int i, int offset, int sample)
@@ -198,12 +207,16 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	int num_samples = kernel_data.integrator.aa_samples;
 
 	/* random number generator */
-	RNG rng = cmj_hash(offset + i, 0);
+	RNG rng = cmj_hash(offset + i, kernel_data.integrator.seed);
 
-#if 0
-	uint rng_state = cmj_hash(i, 0);
+#ifdef USE_BAKE_JITTER
 	float filter_x, filter_y;
-	path_rng_init(kg, &rng_state, sample, num_samples, &rng, 0, 0, &filter_x, &filter_y);
+	if(sample == 0) {
+		filter_x = filter_y = 0.5f;
+	}
+	else {
+		path_rng_2D(kg, &rng, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y);
+	}
 
 	/* subpixel u/v offset */
 	if(sample > 0) {
@@ -253,6 +266,10 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		/* data passes */
 		case SHADER_EVAL_NORMAL:
 		{
+			if((sd.flag & SD_HAS_BUMP)) {
+				shader_eval_surface(kg, &sd, 0.f, 0, SHADER_CONTEXT_MAIN);
+			}
+
 			/* compression: normal = (2 * color) - 1 */
 			out = sd.N * 0.5f + make_float3(0.5f, 0.5f, 0.5f);
 			break;
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index d1217ae0abc..2d531fdc96e 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -41,11 +41,34 @@ ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v)
 	return bokeh;
 }
 
-ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
 {
 	/* create ray form raster position */
 	Transform rastertocamera = kernel_data.cam.rastertocamera;
-	float3 Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f));
+	float3 raster = make_float3(raster_x, raster_y, 0.0f);
+	float3 Pcamera = transform_perspective(&rastertocamera, raster);
+
+#ifdef __CAMERA_MOTION__
+	if(kernel_data.cam.have_perspective_motion) {
+		/* TODO(sergey): Currently we interpolate the projected coordinate, which
+		 * is simple and gives a nice-looking result, but is in fact slightly
+		 * different from constructing the projective matrix from an
+		 * interpolated field of view.
+		 */
+		if(ray->time < 0.5f) {
+			Transform rastertocamera_pre = kernel_data.cam.perspective_motion.pre;
+			float3 Pcamera_pre =
+			        transform_perspective(&rastertocamera_pre, raster);
+			Pcamera = interp(Pcamera_pre, Pcamera, ray->time * 2.0f);
+		}
+		else {
+			Transform rastertocamera_post = kernel_data.cam.perspective_motion.post;
+			float3 Pcamera_post =
+			        transform_perspective(&rastertocamera_post, raster);
+			Pcamera = interp(Pcamera, Pcamera_post, (ray->time - 0.5f) * 2.0f);
+		}
+	}
+#endif
 
 	ray->P = make_float3(0.0f, 0.0f, 0.0f);
 	ray->D = Pcamera;
@@ -70,8 +93,18 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo
 	Transform cameratoworld = kernel_data.cam.cameratoworld;
 
 #ifdef __CAMERA_MOTION__
-	if(kernel_data.cam.have_motion)
-		transform_motion_interpolate(&cameratoworld, (const DecompMotionTransform*)&kernel_data.cam.motion, ray->time);
+	if(kernel_data.cam.have_motion) {
+#ifdef __KERNEL_OPENCL__
+		const MotionTransform tfm = kernel_data.cam.motion;
+		transform_motion_interpolate(&cameratoworld,
+		                             ((const DecompMotionTransform*)&tfm),
+		                             ray->time);
+#else
+		transform_motion_interpolate(&cameratoworld,
+		                             ((const DecompMotionTransform*)&kernel_data.cam.motion),
+		                             ray->time);
+#endif
+	}
 #endif
 
 	ray->P = transform_point(&cameratoworld, ray->P);
@@ -90,16 +123,17 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo
 
 #ifdef __CAMERA_CLIPPING__
 	/* clipping */
-	ray->P += kernel_data.cam.nearclip*ray->D;
-	ray->t = kernel_data.cam.cliplength;
+	float3 Pclip = normalize(Pcamera);
+	float z_inv = 1.0f / Pclip.z;
+	ray->P += kernel_data.cam.nearclip*ray->D * z_inv;
+	ray->t = kernel_data.cam.cliplength * z_inv;
 #else
 	ray->t = FLT_MAX;
 #endif
 }
 
 /* Orthographic Camera */
-
-ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
 {
 	/* create ray form raster position */
 	Transform rastertocamera = kernel_data.cam.rastertocamera;
@@ -129,8 +163,18 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl
 	Transform cameratoworld = kernel_data.cam.cameratoworld;
 
 #ifdef __CAMERA_MOTION__
-	if(kernel_data.cam.have_motion)
-		transform_motion_interpolate(&cameratoworld, (const DecompMotionTransform*)&kernel_data.cam.motion, ray->time);
+	if(kernel_data.cam.have_motion) {
+#ifdef __KERNEL_OPENCL__
+		const MotionTransform tfm = kernel_data.cam.motion;
+		transform_motion_interpolate(&cameratoworld,
+		                             (const DecompMotionTransform*)&tfm,
+		                             ray->time);
+#else
+		transform_motion_interpolate(&cameratoworld,
+		                             (const DecompMotionTransform*)&kernel_data.cam.motion,
+		                             ray->time);
+#endif
+	}
 #endif
 
 	ray->P = transform_point(&cameratoworld, ray->P);
@@ -155,7 +199,7 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl
 
 /* Panorama Camera */
 
-ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
 {
 	Transform rastertocamera = kernel_data.cam.rastertocamera;
 	float3 Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f));
@@ -203,8 +247,18 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float
 	Transform cameratoworld = kernel_data.cam.cameratoworld;
 
 #ifdef __CAMERA_MOTION__
-	if(kernel_data.cam.have_motion)
-		transform_motion_interpolate(&cameratoworld, (const DecompMotionTransform*)&kernel_data.cam.motion, ray->time);
+	if(kernel_data.cam.have_motion) {
+#ifdef __KERNEL_OPENCL__
+		const MotionTransform tfm = kernel_data.cam.motion;
+		transform_motion_interpolate(&cameratoworld,
+		                             (const DecompMotionTransform*)&tfm,
+		                             ray->time);
+#else
+		transform_motion_interpolate(&cameratoworld,
+		                             (const DecompMotionTransform*)&kernel_data.cam.motion,
+		                             ray->time);
+#endif
+	}
 #endif
 
 	ray->P = transform_point(&cameratoworld, ray->P);
@@ -215,18 +269,21 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float
 	/* ray differential */
 	ray->dP = differential3_zero();
 
+	Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f));
+	float3 Ddiff = normalize(transform_direction(&cameratoworld, panorama_to_direction(kg, Pcamera.x, Pcamera.y)));
+
 	Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x + 1.0f, raster_y, 0.0f));
-	ray->dD.dx = normalize(transform_direction(&cameratoworld, panorama_to_direction(kg, Pcamera.x, Pcamera.y))) - ray->D;
+	ray->dD.dx = normalize(transform_direction(&cameratoworld, panorama_to_direction(kg, Pcamera.x, Pcamera.y))) - Ddiff;
 
 	Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y + 1.0f, 0.0f));
-	ray->dD.dy = normalize(transform_direction(&cameratoworld, panorama_to_direction(kg, Pcamera.x, Pcamera.y))) - ray->D;
+	ray->dD.dy = normalize(transform_direction(&cameratoworld, panorama_to_direction(kg, Pcamera.x, Pcamera.y))) - Ddiff;
 #endif
 }
 
 /* Common */
 
 ccl_device void camera_sample(KernelGlobals *kg, int x, int y, float filter_u, float filter_v,
-	float lens_u, float lens_v, float time, Ray *ray)
+	float lens_u, float lens_v, float time, ccl_addr_space Ray *ray)
 {
 	/* pixel filter */
 	int filter_table_offset = kernel_data.film.filter_table_offset;
@@ -303,7 +360,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 {
 	if(kernel_data.cam.type != CAMERA_PANORAMA) {
 		/* perspective / ortho */
-		if(sd->object == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
+		if(ccl_fetch(sd, object) == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
 			P += camera_position(kg);
 
 		Transform tfm = kernel_data.cam.worldtondc;
@@ -313,7 +370,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 		/* panorama */
 		Transform tfm = kernel_data.cam.worldtocamera;
 
-		if(sd->object != OBJECT_NONE)
+		if(ccl_fetch(sd, object) != OBJECT_NONE)
 			P = normalize(transform_point(&tfm, P));
 		else
 			P = normalize(transform_direction(&tfm, P));
@@ -325,4 +382,3 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index c2aab93c87b..ed145b4a967 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __KERNEL_COMPAT_CPU_H__
@@ -19,12 +19,39 @@
 
 #define __KERNEL_CPU__
 
+/* Release kernel has too many false-positive maybe-uninitialized warnings,
+ * which makes it easy to miss actual warnings.
+ */
+#if defined(__GNUC__) && defined(NDEBUG)
+#  pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#  pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+
+/* Selective nodes compilation. */
+#ifndef __NODES_MAX_GROUP__
+#  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
+#endif
+#ifndef __NODES_FEATURES__
+#  define __NODES_FEATURES__ NODE_FEATURE_ALL
+#endif
+
 #include "util_debug.h"
 #include "util_math.h"
 #include "util_simd.h"
 #include "util_half.h"
 #include "util_types.h"
 
+#define ccl_addr_space
+
+/* On x86_64, versions of glibc < 2.16 have an issue where expf is much slower
+ * than the double version (fixed in glibc 2.16), so route expf through exp there.
+ */
+#if !defined(__KERNEL_GPU__)  && defined(__x86_64__) && \
+     defined(__GNU_LIBRARY__) && defined(__GLIBC__ ) && defined(__GLIBC_MINOR__) && \
+     (__GLIBC__ <= 2 && __GLIBC_MINOR__ < 16)
+#  define expf(x) ((float)exp((double)(x)))
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 /* Assertions inside the kernel only work for the CPU device, so we wrap it in
@@ -43,7 +70,7 @@ template<typename T> struct texture  {
 		return data[index];
 	}
 
-#if 0
+#ifdef __KERNEL_SSE2__
 	ccl_always_inline ssef fetch_ssef(int index)
 	{
 		kernel_assert(index >= 0 && index < width);
@@ -62,6 +89,14 @@ template<typename T> struct texture  {
 };
 
 template<typename T> struct texture_image  {
+#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
+	{ \
+		u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
+		u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
+		u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
+		u[3] = (1.0f / 6.0f) * t * t * t; \
+	} (void)0
+
 	ccl_always_inline float4 read(float4 r)
 	{
 		return r;
@@ -93,7 +128,7 @@ template<typename T> struct texture_image  {
 		return x - (float)i;
 	}
 
-	ccl_always_inline float4 interp(float x, float y, bool periodic = true)
+	ccl_always_inline float4 interp(float x, float y)
 	{
 		if(UNLIKELY(!data))
 			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
@@ -103,34 +138,47 @@ template<typename T> struct texture_image  {
 		if(interpolation == INTERPOLATION_CLOSEST) {
 			frac(x*(float)width, &ix);
 			frac(y*(float)height, &iy);
-			if(periodic) {
-				ix = wrap_periodic(ix, width);
-				iy = wrap_periodic(iy, height);
-
-			}
-			else {
-				ix = wrap_clamp(ix, width);
-				iy = wrap_clamp(iy, height);
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					break;
+				case EXTENSION_CLIP:
+					if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					break;
 			}
 			return read(data[ix + iy*width]);
 		}
-		else {
+		else if(interpolation == INTERPOLATION_LINEAR) {
 			float tx = frac(x*(float)width - 0.5f, &ix);
 			float ty = frac(y*(float)height - 0.5f, &iy);
 
-			if(periodic) {
-				ix = wrap_periodic(ix, width);
-				iy = wrap_periodic(iy, height);
-
-				nix = wrap_periodic(ix+1, width);
-				niy = wrap_periodic(iy+1, height);
-			}
-			else {
-				ix = wrap_clamp(ix, width);
-				iy = wrap_clamp(iy, height);
-
-				nix = wrap_clamp(ix+1, width);
-				niy = wrap_clamp(iy+1, height);
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+					break;
+				case EXTENSION_CLIP:
+					if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					break;
 			}
 
 			float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]);
@@ -140,9 +188,79 @@ template<typename T> struct texture_image  {
 
 			return r;
 		}
+		else {
+			/* Bicubic b-spline interpolation. */
+			float tx = frac(x*(float)width - 0.5f, &ix);
+			float ty = frac(y*(float)height - 0.5f, &iy);
+			int pix, piy, nnix, nniy;
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+
+					pix = wrap_periodic(ix-1, width);
+					piy = wrap_periodic(iy-1, height);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+
+					nnix = wrap_periodic(ix+2, width);
+					nniy = wrap_periodic(iy+2, height);
+					break;
+				case EXTENSION_CLIP:
+					if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					pix = wrap_clamp(ix-1, width);
+					piy = wrap_clamp(iy-1, height);
+
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+
+					nnix = wrap_clamp(ix+2, width);
+					nniy = wrap_clamp(iy+2, height);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					break;
+			}
+
+			const int xc[4] = {pix, ix, nix, nnix};
+			const int yc[4] = {width * piy,
+			                   width * iy,
+			                   width * niy,
+			                   width * nniy};
+			float u[4], v[4];
+			/* Helper macros to keep the code a reasonable size;
+			 * let the compiler inline all the matrix multiplications.
+			 */
+#define DATA(x, y) (read(data[xc[x] + yc[y]]))
+#define TERM(col) \
+			(v[col] * (u[0] * DATA(0, col) + \
+			           u[1] * DATA(1, col) + \
+			           u[2] * DATA(2, col) + \
+			           u[3] * DATA(3, col)))
+
+			SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+			SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+
+			/* Actual interpolation. */
+			return TERM(0) + TERM(1) + TERM(2) + TERM(3);
+
+#undef TERM
+#undef DATA
+		}
 	}
 
-	ccl_always_inline float4 interp_3d(float x, float y, float z, bool periodic = false)
+	ccl_always_inline float4 interp_3d(float x, float y, float z)
+	{
+		return interp_3d_ex(x, y, z, interpolation);  /* Forward with this texture's own `interpolation` member. */
+	}
+
+	ccl_always_inline float4 interp_3d_ex(float x, float y, float z,
+	                                      int interpolation = INTERPOLATION_LINEAR)
 	{
 		if(UNLIKELY(!data))
 			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
@@ -154,41 +272,55 @@ template<typename T> struct texture_image  {
 			frac(y*(float)height, &iy);
 			frac(z*(float)depth, &iz);
 
-			if(periodic) {
-				ix = wrap_periodic(ix, width);
-				iy = wrap_periodic(iy, height);
-				iz = wrap_periodic(iz, depth);
-			}
-			else {
-				ix = wrap_clamp(ix, width);
-				iy = wrap_clamp(iy, height);
-				iz = wrap_clamp(iz, depth);
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					iz = wrap_periodic(iz, depth);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);  /* Outside [0, 1] on any of the three axes. */
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					iz = wrap_clamp(iz, depth);
+					break;
 			}
 
 			return read(data[ix + iy*width + iz*width*height]);
 		}
-		else {
+		else if(interpolation == INTERPOLATION_LINEAR) {
 			float tx = frac(x*(float)width - 0.5f, &ix);
 			float ty = frac(y*(float)height - 0.5f, &iy);
 			float tz = frac(z*(float)depth - 0.5f, &iz);
 
-			if(periodic) {
-				ix = wrap_periodic(ix, width);
-				iy = wrap_periodic(iy, height);
-				iz = wrap_periodic(iz, depth);
-
-				nix = wrap_periodic(ix+1, width);
-				niy = wrap_periodic(iy+1, height);
-				niz = wrap_periodic(iz+1, depth);
-			}
-			else {
-				ix = wrap_clamp(ix, width);
-				iy = wrap_clamp(iy, height);
-				iz = wrap_clamp(iz, depth);
-
-				nix = wrap_clamp(ix+1, width);
-				niy = wrap_clamp(iy+1, height);
-				niz = wrap_clamp(iz+1, depth);
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					iz = wrap_periodic(iz, depth);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+					niz = wrap_periodic(iz+1, depth);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);  /* Outside [0, 1] on any of the three axes. */
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+					niz = wrap_clamp(iz+1, depth);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					iz = wrap_clamp(iz, depth);
+					break;
 			}
 
 			float4 r;
@@ -205,6 +337,92 @@ template<typename T> struct texture_image  {
 
 			return r;
 		}
+		else {
+			/* Tricubic b-spline interpolation. */
+			const float tx = frac(x*(float)width - 0.5f, &ix);
+			const float ty = frac(y*(float)height - 0.5f, &iy);
+			const float tz = frac(z*(float)depth - 0.5f, &iz);
+			int pix, piy, piz, nnix, nniy, nniz;
+
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					iz = wrap_periodic(iz, depth);
+
+					pix = wrap_periodic(ix-1, width);
+					piy = wrap_periodic(iy-1, height);
+					piz = wrap_periodic(iz-1, depth);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+					niz = wrap_periodic(iz+1, depth);
+
+					nnix = wrap_periodic(ix+2, width);
+					nniy = wrap_periodic(iy+2, height);
+					nniz = wrap_periodic(iz+2, depth);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);  /* Outside [0, 1] on any of the three axes. */
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					pix = wrap_clamp(ix-1, width);
+					piy = wrap_clamp(iy-1, height);
+					piz = wrap_clamp(iz-1, depth);
+
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+					niz = wrap_clamp(iz+1, depth);
+
+					nnix = wrap_clamp(ix+2, width);
+					nniy = wrap_clamp(iy+2, height);
+					nniz = wrap_clamp(iz+2, depth);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					iz = wrap_clamp(iz, depth);
+					break;
+			}
+
+			const int xc[4] = {pix, ix, nix, nnix};
+			const int yc[4] = {width * piy,
+			                   width * iy,
+			                   width * niy,
+			                   width * nniy};
+			const int zc[4] = {width * height * piz,
+			                   width * height * iz,
+			                   width * height * niz,
+			                   width * height * nniz};
+			float u[4], v[4], w[4];
+
+			/* Helper macros to keep the code a reasonable size;
+			 * let the compiler inline all the matrix multiplications.
+			 */
+#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
+#define COL_TERM(col, row) \
+			(v[col] * (u[0] * DATA(0, col, row) + \
+			           u[1] * DATA(1, col, row) + \
+			           u[2] * DATA(2, col, row) + \
+			           u[3] * DATA(3, col, row)))
+#define ROW_TERM(row) \
+			(w[row] * (COL_TERM(0, row) + \
+			           COL_TERM(1, row) + \
+			           COL_TERM(2, row) + \
+			           COL_TERM(3, row)))
+
+			SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+			SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+			SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+			/* Actual interpolation. */
+			return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+#undef COL_TERM
+#undef ROW_TERM
+#undef DATA
+		}
 	}
 
 	ccl_always_inline void dimensions_set(int width_, int height_, int depth_)
@@ -216,7 +434,9 @@ template<typename T> struct texture_image  {
 
 	T *data;
 	int interpolation;
+	ExtensionType extension;
 	int width, height, depth;
+#undef SET_CUBIC_SPLINE_WEIGHTS
 };
 
 typedef texture<float4> texture_float4;
@@ -237,9 +457,38 @@ typedef texture_image<uchar4> texture_image_uchar4;
 #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
 #define kernel_tex_image_interp(tex, x, y) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp(x, y) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp(x, y))
 #define kernel_tex_image_interp_3d(tex, x, y, z) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d(x, y, z) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d(x, y, z))
+#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d_ex(x, y, z, interpolation) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d_ex(x, y, z, interpolation))
 
 #define kernel_data (kg->__data)
 
+#ifdef __KERNEL_SSE2__
+typedef vector3<sseb> sse3b;
+typedef vector3<ssef> sse3f;
+typedef vector3<ssei> sse3i;
+
+ccl_device_inline void print_sse3b(const char *label, sse3b& a)  /* Forwards a.x, a.y, a.z (in that order) to print_sseb with the same label. */
+{
+	print_sseb(label, a.x);
+	print_sseb(label, a.y);
+	print_sseb(label, a.z);
+}
+
+ccl_device_inline void print_sse3f(const char *label, sse3f& a)  /* Forwards a.x, a.y, a.z (in that order) to print_ssef with the same label. */
+{
+	print_ssef(label, a.x);
+	print_ssef(label, a.y);
+	print_ssef(label, a.z);
+}
+
+ccl_device_inline void print_sse3i(const char *label, sse3i& a)  /* Forwards a.x, a.y, a.z (in that order) to print_ssei with the same label. */
+{
+	print_ssei(label, a.x);
+	print_ssei(label, a.y);
+	print_ssei(label, a.z);
+}
+
+#endif
+
 CCL_NAMESPACE_END
 
 #endif /* __KERNEL_COMPAT_CPU_H__ */
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index e4c20d26ff1..9fdd3abfec3 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __KERNEL_COMPAT_CUDA_H__
@@ -22,6 +22,14 @@
 #define CCL_NAMESPACE_BEGIN
 #define CCL_NAMESPACE_END
 
+/* Selective nodes compilation. */
+#ifndef __NODES_MAX_GROUP__
+#  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
+#endif
+#ifndef __NODES_FEATURES__
+#  define __NODES_FEATURES__ NODE_FEATURE_ALL
+#endif
+
 #include <cuda.h>
 #include <float.h>
 
@@ -33,6 +41,7 @@
 #define ccl_global
 #define ccl_constant
 #define ccl_may_alias
+#define ccl_addr_space
 
 /* No assert supported for CUDA */
 
@@ -75,12 +84,11 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
 
 /* Use fast math functions */
 
-#define cosf(x) __cosf(((float)x))
-#define sinf(x) __sinf(((float)x))
-#define powf(x, y) __powf(((float)x), ((float)y))
-#define tanf(x) __tanf(((float)x))
-#define logf(x) __logf(((float)x))
-#define expf(x) __expf(((float)x))
+#define cosf(x) __cosf(((float)(x)))
+#define sinf(x) __sinf(((float)(x)))
+#define powf(x, y) __powf(((float)(x)), ((float)(y)))
+#define tanf(x) __tanf(((float)(x)))
+#define logf(x) __logf(((float)(x)))
+#define expf(x) __expf(((float)(x)))
 
 #endif /* __KERNEL_COMPAT_CUDA_H__ */
-
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index 9e58ebff599..e8b36d2605d 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __KERNEL_COMPAT_OPENCL_H__
@@ -24,14 +24,6 @@
 #define CCL_NAMESPACE_BEGIN
 #define CCL_NAMESPACE_END
 
-#ifdef __KERNEL_OPENCL_AMD__
-#define __CL_NO_FLOAT3__
-#endif
-
-#ifdef __CL_NO_FLOAT3__
-#define float3 float4
-#endif
-
 #ifdef __CL_NOINLINE__
 #define ccl_noinline __attribute__((noinline))
 #else
@@ -45,6 +37,22 @@
 #define ccl_may_alias
 #define ccl_constant __constant
 #define ccl_global __global
+#define ccl_local __local
+#define ccl_private __private
+
+#ifdef __SPLIT_KERNEL__
+#define ccl_addr_space __global
+#else
+#define ccl_addr_space
+#endif
+
+/* Selective nodes compilation. */
+#ifndef __NODES_MAX_GROUP__
+#  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
+#endif
+#ifndef __NODES_FEATURES__
+#  define __NODES_FEATURES__ NODE_FEATURE_ALL
+#endif
 
 /* no assert in opencl */
 #define kernel_assert(cond)
@@ -73,11 +81,7 @@
 #endif
 
 #define make_float2(x, y) ((float2)(x, y))
-#ifdef __CL_NO_FLOAT3__
-#define make_float3(x, y, z) ((float4)(x, y, z, 0.0f))
-#else
 #define make_float3(x, y, z) ((float3)(x, y, z))
-#endif
 #define make_float4(x, y, z, w) ((float4)(x, y, z, w))
 #define make_int2(x, y) ((int2)(x, y))
 #define make_int3(x, y, z) ((int3)(x, y, z))
@@ -89,34 +93,34 @@
 #define __float_as_uint(x) as_uint(x)
 #define __int_as_float(x) as_float(x)
 #define __float_as_int(x) as_int(x)
-#define powf(x, y) pow(((float)x), ((float)y))
-#define fabsf(x) fabs(((float)x))
-#define copysignf(x, y) copysign(((float)x), ((float)y))
-#define asinf(x) asin(((float)x))
-#define acosf(x) acos(((float)x))
-#define atanf(x) atan(((float)x))
-#define floorf(x) floor(((float)x))
-#define ceilf(x) ceil(((float)x))
-#define hypotf(x, y) hypot(((float)x), ((float)y))
-#define atan2f(x, y) atan2(((float)x), ((float)y))
-#define fmaxf(x, y) fmax(((float)x), ((float)y))
-#define fminf(x, y) fmin(((float)x), ((float)y))
-#define fmodf(x, y) fmod((float)x, (float)y)
+#define powf(x, y) pow(((float)(x)), ((float)(y)))
+#define fabsf(x) fabs(((float)(x)))
+#define copysignf(x, y) copysign(((float)(x)), ((float)(y)))
+#define asinf(x) asin(((float)(x)))
+#define acosf(x) acos(((float)(x)))
+#define atanf(x) atan(((float)(x)))
+#define floorf(x) floor(((float)(x)))
+#define ceilf(x) ceil(((float)(x)))
+#define hypotf(x, y) hypot(((float)(x)), ((float)(y)))
+#define atan2f(x, y) atan2(((float)(x)), ((float)(y)))
+#define fmaxf(x, y) fmax(((float)(x)), ((float)(y)))
+#define fminf(x, y) fmin(((float)(x)), ((float)(y)))
+#define fmodf(x, y) fmod((float)(x), (float)(y))
 
 #ifndef __CL_USE_NATIVE__
-#define sinf(x) native_sin(((float)x))
-#define cosf(x) native_cos(((float)x))
-#define tanf(x) native_tan(((float)x))
-#define expf(x) native_exp(((float)x))
-#define sqrtf(x) native_sqrt(((float)x))
-#define logf(x) native_log(((float)x))
+#define sinf(x) native_sin(((float)(x)))
+#define cosf(x) native_cos(((float)(x)))
+#define tanf(x) native_tan(((float)(x)))
+#define expf(x) native_exp(((float)(x)))
+#define sqrtf(x) native_sqrt(((float)(x)))
+#define logf(x) native_log(((float)(x)))
 #else
-#define sinf(x) sin(((float)x))
-#define cosf(x) cos(((float)x))
-#define tanf(x) tan(((float)x))
-#define expf(x) exp(((float)x))
-#define sqrtf(x) sqrt(((float)x))
-#define logf(x) log(((float)x))
+#define sinf(x) sin(((float)(x)))
+#define cosf(x) cos(((float)(x)))
+#define tanf(x) tan(((float)(x)))
+#define expf(x) exp(((float)(x)))
+#define sqrtf(x) sqrt(((float)(x)))
+#define logf(x) log(((float)(x)))
 #endif
 
 /* data lookup defines */
diff --git a/intern/cycles/kernel/kernel_debug.h b/intern/cycles/kernel/kernel_debug.h
new file mode 100644
index 00000000000..24d6458567e
--- /dev/null
+++ b/intern/cycles/kernel/kernel_debug.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+/* Reset all three debug counters stored in debug_data to zero. */
+ccl_device_inline void debug_data_init(DebugData *debug_data)
+{
+	debug_data->num_bvh_traversal_steps = 0;
+	debug_data->num_bvh_traversed_instances = 0;
+	debug_data->num_ray_bounces = 0;
+}
+/* For each debug pass enabled in film.pass_flag, write its counter into buffer at that pass's offset. */
+ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
+                                                 ccl_global float *buffer,
+                                                 ccl_addr_space PathState *state,  /* not read in this function */
+                                                 DebugData *debug_data,
+                                                 int sample)
+{
+	int flag = kernel_data.film.pass_flag;  /* tested below as a bitmask of PASS_* bits */
+	if(flag & PASS_BVH_TRAVERSAL_STEPS) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversal_steps,
+		                        sample,
+		                        debug_data->num_bvh_traversal_steps);
+	}
+	if(flag & PASS_BVH_TRAVERSED_INSTANCES) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances,
+		                        sample,
+		                        debug_data->num_bvh_traversed_instances);
+	}
+	if(flag & PASS_RAY_BOUNCES) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces,
+		                        sample,
+		                        debug_data->num_ray_bounces);
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_differential.h b/intern/cycles/kernel/kernel_differential.h
index daba2d927b7..ae1e70f0167 100644
--- a/intern/cycles/kernel/kernel_differential.h
+++ b/intern/cycles/kernel/kernel_differential.h
@@ -11,14 +11,14 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
 
 /* See "Tracing Ray Differentials", Homan Igehy, 1999. */
 
-ccl_device void differential_transfer(differential3 *dP_, const differential3 dP, float3 D, const differential3 dD, float3 Ng, float t)
+ccl_device void differential_transfer(ccl_addr_space differential3 *dP_, const differential3 dP, float3 D, const differential3 dD, float3 Ng, float t)
 {
 	/* ray differential transfer through homogeneous medium, to
 	 * compute dPdx/dy at a shading point from the incoming ray */
@@ -31,7 +31,7 @@ ccl_device void differential_transfer(differential3 *dP_, const differential3 dP
 	dP_->dy = tmpy - dot(tmpy, Ng)*tmp;
 }
 
-ccl_device void differential_incoming(differential3 *dI, const differential3 dD)
+ccl_device void differential_incoming(ccl_addr_space differential3 *dI, const differential3 dD)
 {
 	/* compute dIdx/dy at a shading point, we just need to negate the
 	 * differential of the ray direction */
@@ -40,7 +40,7 @@ ccl_device void differential_incoming(differential3 *dI, const differential3 dD)
 	dI->dy = -dD.dy;
 }
 
-ccl_device void differential_dudv(differential *du, differential *dv, float3 dPdu, float3 dPdv, differential3 dP, float3 Ng)
+ccl_device void differential_dudv(ccl_addr_space differential *du, ccl_addr_space differential *dv, float3 dPdu, float3 dPdv, differential3 dP, float3 Ng)
 {
 	/* now we have dPdx/dy from the ray differential transfer, and dPdu/dv
 	 * from the primitive, we can compute dudx/dy and dvdx/dy. these are
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index 4b2bb723ab6..de9e8d77ec8 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -11,18 +11,26 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
 
 /* Direction Emission */
-
 ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
-	LightSample *ls, float3 I, differential3 dI, float t, float time, int bounce, int transparent_bounce)
+	LightSample *ls, float3 I, differential3 dI, float t, float time, int bounce, int transparent_bounce
+#ifdef __SPLIT_KERNEL__
+	,ShaderData *sd_input
+#endif
+)
 {
 	/* setup shading at emitter */
-	ShaderData sd;
+#ifdef __SPLIT_KERNEL__
+	ShaderData *sd = sd_input;
+#else
+	ShaderData sd_object;
+	ShaderData *sd = &sd_object;
+#endif
 	float3 eval;
 
 #ifdef __BACKGROUND_MIS__
@@ -37,23 +45,23 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 		ray.dP = differential3_zero();
 		ray.dD = dI;
 
-		shader_setup_from_background(kg, &sd, &ray, bounce+1, transparent_bounce);
-		eval = shader_eval_background(kg, &sd, 0, SHADER_CONTEXT_EMISSION);
+		shader_setup_from_background(kg, sd, &ray, bounce+1, transparent_bounce);
+		eval = shader_eval_background(kg, sd, 0, SHADER_CONTEXT_EMISSION);
 	}
 	else
 #endif
 	{
-		shader_setup_from_sample(kg, &sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, bounce+1, transparent_bounce);
+		shader_setup_from_sample(kg, sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, bounce+1, transparent_bounce);
 
-		ls->Ng = sd.Ng;
+		ls->Ng = ccl_fetch(sd, Ng);
 
 		/* no path flag, we're evaluating this for all closures. that's weak but
 		 * we'd have to do multiple evaluations otherwise */
-		shader_eval_surface(kg, &sd, 0.0f, 0, SHADER_CONTEXT_EMISSION);
+		shader_eval_surface(kg, sd, 0.0f, 0, SHADER_CONTEXT_EMISSION);
 
 		/* evaluate emissive closure */
-		if(sd.flag & SD_EMISSION)
-			eval = shader_emissive_eval(kg, &sd);
+		if(ccl_fetch(sd, flag) & SD_EMISSION)
+			eval = shader_emissive_eval(kg, sd);
 		else
 			eval = make_float3(0.0f, 0.0f, 0.0f);
 	}
@@ -65,7 +73,11 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 
 ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 	LightSample *ls, Ray *ray, BsdfEval *eval, bool *is_lamp,
-	int bounce, int transparent_bounce)
+	int bounce, int transparent_bounce
+#ifdef __SPLIT_KERNEL__
+	, ShaderData *sd_DL
+#endif
+	)
 {
 	if(ls->pdf == 0.0f)
 		return false;
@@ -74,7 +86,14 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 	differential3 dD = differential3_zero();
 
 	/* evaluate closure */
-	float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, sd->time, bounce, transparent_bounce);
+
+	float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, ccl_fetch(sd, time),
+	                                         bounce,
+	                                         transparent_bounce
+#ifdef __SPLIT_KERNEL__
+	                                         ,sd_DL
+#endif
+	                                         );
 
 	if(is_zero(light_eval))
 		return false;
@@ -83,7 +102,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 	float bsdf_pdf;
 
 #ifdef __VOLUME__
-	if(sd->prim != PRIM_NONE)
+	if(ccl_fetch(sd, prim) != PRIM_NONE)
 		shader_bsdf_eval(kg, sd, ls->D, eval, &bsdf_pdf);
 	else
 		shader_volume_phase_eval(kg, sd, ls->D, eval, &bsdf_pdf);
@@ -118,8 +137,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 
 	if(ls->shader & SHADER_CAST_SHADOW) {
 		/* setup ray */
-		bool transmit = (dot(sd->Ng, ls->D) < 0.0f);
-		ray->P = ray_offset(sd->P, (transmit)? -sd->Ng: sd->Ng);
+		bool transmit = (dot(ccl_fetch(sd, Ng), ls->D) < 0.0f);
+		ray->P = ray_offset(ccl_fetch(sd, P), (transmit)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
 
 		if(ls->t == FLT_MAX) {
 			/* distant light */
@@ -132,7 +151,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 			ray->D = normalize_len(ray->D, &ray->t);
 		}
 
-		ray->dP = sd->dP;
+		ray->dP = ccl_fetch(sd, dP);
 		ray->dD = differential3_zero();
 	}
 	else {
@@ -154,14 +173,14 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
 	float3 L = shader_emissive_eval(kg, sd);
 
 #ifdef __HAIR__
-	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE))
+	if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS) && (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE))
 #else
-	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
+	if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS))
 #endif
 	{
 		/* multiple importance sampling, get triangle light pdf,
 		 * and compute weight with respect to BSDF pdf */
-		float pdf = triangle_light_pdf(kg, sd->Ng, sd->I, t);
+		float pdf = triangle_light_pdf(kg, ccl_fetch(sd, Ng), ccl_fetch(sd, I), t);
 		float mis_weight = power_heuristic(bsdf_pdf, pdf);
 
 		return L*mis_weight;
@@ -172,7 +191,11 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
 
 /* Indirect Lamp Emission */
 
-ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *state, Ray *ray, float3 *emission)
+ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *state, Ray *ray, float3 *emission
+#ifdef __SPLIT_KERNEL__
+                                                ,ShaderData *sd
+#endif
+                                                )
 {
 	bool hit_lamp = false;
 
@@ -188,14 +211,21 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st
 		/* use visibility flag to skip lights */
 		if(ls.shader & SHADER_EXCLUDE_ANY) {
 			if(((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
-			   ((ls.shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & PATH_RAY_GLOSSY)) ||
+			   ((ls.shader & SHADER_EXCLUDE_GLOSSY) &&
+			    ((state->flag & (PATH_RAY_GLOSSY|PATH_RAY_REFLECT)) == (PATH_RAY_GLOSSY|PATH_RAY_REFLECT))) ||
 			   ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
 			   ((ls.shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
 				continue;
 		}
 #endif
 
-		float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time, state->bounce, state->transparent_bounce);
+		float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time,
+		                                state->bounce,
+		                                state->transparent_bounce
+#ifdef __SPLIT_KERNEL__
+		                                ,sd
+#endif
+		                                );
 
 #ifdef __VOLUME__
 		if(state->volume_stack[0].shader != SHADER_NONE) {
@@ -224,7 +254,11 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st
 
 /* Indirect Background */
 
-ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *state, Ray *ray)
+ccl_device_noinline float3 indirect_background(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space Ray *ray
+#ifdef __SPLIT_KERNEL__
+                                               ,ShaderData *sd_global
+#endif
+                                               )
 {
 #ifdef __BACKGROUND__
 	int shader = kernel_data.background.surface_shader;
@@ -232,18 +266,25 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *sta
 	/* use visibility flag to skip lights */
 	if(shader & SHADER_EXCLUDE_ANY) {
 		if(((shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
-		   ((shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & PATH_RAY_GLOSSY)) ||
+		   ((shader & SHADER_EXCLUDE_GLOSSY) &&
+		    ((state->flag & (PATH_RAY_GLOSSY|PATH_RAY_REFLECT)) == (PATH_RAY_GLOSSY|PATH_RAY_REFLECT))) ||
 		   ((shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
 		   ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA)) ||
 		   ((shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
 			return make_float3(0.0f, 0.0f, 0.0f);
 	}
 
+#ifdef __SPLIT_KERNEL__
 	/* evaluate background closure */
+	Ray priv_ray = *ray;
+	shader_setup_from_background(kg, sd_global, &priv_ray, state->bounce+1, state->transparent_bounce);
+	float3 L = shader_eval_background(kg, sd_global, state->flag, SHADER_CONTEXT_EMISSION);
+#else
 	ShaderData sd;
 	shader_setup_from_background(kg, &sd, ray, state->bounce+1, state->transparent_bounce);
 
 	float3 L = shader_eval_background(kg, &sd, state->flag, SHADER_CONTEXT_EMISSION);
+#endif
 
 #ifdef __BACKGROUND_MIS__
 	/* check if background light exists or if we should skip pdf */
@@ -252,7 +293,7 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *sta
 	if(!(state->flag & PATH_RAY_MIS_SKIP) && res) {
 		/* multiple importance sampling, get background light pdf for ray
 		 * direction, and compute weight with respect to BSDF pdf */
-		float pdf = background_light_pdf(kg, ray->D);
+		float pdf = background_light_pdf(kg, ray->P, ray->D);
 		float mis_weight = power_heuristic(state->ray_pdf, pdf);
 
 		return L*mis_weight;
diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h
index dc5f6e7ce38..f9e9b413898 100644
--- a/intern/cycles/kernel/kernel_film.h
+++ b/intern/cycles/kernel/kernel_film.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -27,7 +27,7 @@ ccl_device float4 film_map(KernelGlobals *kg, float4 irradiance, float scale)
 	result.z = color_scene_linear_to_srgb(result.z*exposure);
 
 	/* clamp since alpha might be > 1.0 due to russian roulette */
-	result.w = clamp(result.w, 0.0f, 1.0f);
+	result.w = saturate(result.w);
 
 	return result;
 }
@@ -37,10 +37,10 @@ ccl_device uchar4 film_float_to_byte(float4 color)
 	uchar4 result;
 
 	/* simple float to byte conversion */
-	result.x = (uchar)clamp(color.x*255.0f, 0.0f, 255.0f);
-	result.y = (uchar)clamp(color.y*255.0f, 0.0f, 255.0f);
-	result.z = (uchar)clamp(color.z*255.0f, 0.0f, 255.0f);
-	result.w = (uchar)clamp(color.w*255.0f, 0.0f, 255.0f);
+	result.x = (uchar)(saturate(color.x)*255.0f);
+	result.y = (uchar)(saturate(color.y)*255.0f);
+	result.z = (uchar)(saturate(color.z)*255.0f);
+	result.w = (uchar)(saturate(color.w)*255.0f);
 
 	return result;
 }
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index 6bd2ec0662c..17fa18909c4 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* Constant Globals */
@@ -80,7 +80,7 @@ typedef struct KernelGlobals {} KernelGlobals;
 
 #ifdef __KERNEL_OPENCL__
 
-typedef struct KernelGlobals {
+typedef ccl_addr_space struct KernelGlobals {
 	ccl_constant KernelData *data;
 
 #define KERNEL_TEX(type, ttype, name) \
@@ -94,7 +94,7 @@ typedef struct KernelGlobals {
 
 ccl_device float lookup_table_read(KernelGlobals *kg, float x, int offset, int size)
 {
-	x = clamp(x, 0.0f, 1.0f)*(size-1);
+	x = saturate(x)*(size-1);
 
 	int index = min(float_to_int(x), size-1);
 	int nindex = min(index+1, size-1);
@@ -110,7 +110,7 @@ ccl_device float lookup_table_read(KernelGlobals *kg, float x, int offset, int s
 
 ccl_device float lookup_table_read_2D(KernelGlobals *kg, float x, float y, int offset, int xsize, int ysize)
 {
-	y = clamp(y, 0.0f, 1.0f)*(ysize-1);
+	y = saturate(y)*(ysize-1);
 
 	int index = min(float_to_int(y), ysize-1);
 	int nindex = min(index+1, ysize-1);
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index 2a5b7689e57..9ba41635b9e 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* TODO(sergey): Consider moving portable ctz/clz stuff to util. */
@@ -47,6 +47,8 @@ ccl_device_inline int cmj_fast_div_pow2(int a, int b)
 #  else
 	return a >> __builtin_ctz(b);
 #  endif
+#elif defined(__KERNEL_CUDA__)
+	return a >> (__ffs(b) - 1);
 #else
 	return a/b;
 #endif
@@ -63,6 +65,8 @@ ccl_device_inline uint cmj_w_mask(uint w)
 #  else
 	return ((1 << (32 - __builtin_clz(w))) - 1);
 #  endif
+#elif defined(__KERNEL_CUDA__)
+	return ((1 << (32 - __clz(w))) - 1);
 #else
 	w |= w >> 1;
 	w |= w >> 2;
@@ -124,7 +128,7 @@ ccl_device_inline uint cmj_permute(uint i, uint l, uint p)
 			i *= 0xc860a3df;
 			i &= w;
 			i ^= i >> 5;
-		} while (i >= l);
+		} while(i >= l);
 
 		return (i + p) % l;
 	}
@@ -167,7 +171,11 @@ ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
 {
 	kernel_assert(s < N);
 
+#if defined(__KERNEL_CUDA__)
+	int m = float_to_int(__fsqrt_ru(N));
+#else
 	int m = float_to_int(sqrtf(N));
+#endif
 	int n = (N + m - 1)/m;
 	float invN = 1.0f/N;
 	float invm = 1.0f/m;
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index e7f62f230f8..7590ec2d706 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -33,11 +33,112 @@ typedef struct LightSample {
 	LightType type;		/* type of light */
 } LightSample;
 
+/* Area light sampling */
+
+/* Uses the following paper:
+ *
+ * Carlos Urena et al.
+ * An Area-Preserving Parametrization for Spherical Rectangles.
+ *
+ * https://www.solidangle.com/research/egsr2013_spherical_rectangle.pdf
+ *
+ * Note: light_p is modified when sample_coord is true.
+ */
+ccl_device float area_light_sample(float3 P,
+                                   float3 *light_p,
+                                   float3 axisu, float3 axisv,
+                                   float randu, float randv,
+                                   bool sample_coord)
+{
+	/* P is the center (o in the paper). The return value is the pdf 1/S,
+	 * with S the solid angle computed below, or 0 when S is 0. If
+	 * sample_coord is false only the pdf is computed: randu/randv are
+	 * unused and *light_p is read (as rectangle center) but not written. */
+	float3 corner = *light_p - axisu * 0.5f - axisv * 0.5f;
+	float axisu_len, axisv_len;
+	/* Compute local reference system R. */
+	float3 x = normalize_len(axisu, &axisu_len);
+	float3 y = normalize_len(axisv, &axisv_len);
+	float3 z = cross(x, y);
+	/* Compute rectangle coords in local reference system. */
+	float3 dir = corner - P;
+	float z0 = dot(dir, z);
+	/* Flip 'z' to make it point against Q. */
+	if(z0 > 0.0f) {
+		z *= -1.0f;
+		z0 *= -1.0f;
+	}
+	float x0 = dot(dir, x);
+	float y0 = dot(dir, y);
+	float x1 = x0 + axisu_len;
+	float y1 = y0 + axisv_len;
+	/* Create vectors to four vertices. */
+	float3 v00 = make_float3(x0, y0, z0);
+	float3 v01 = make_float3(x0, y1, z0);
+	float3 v10 = make_float3(x1, y0, z0);
+	float3 v11 = make_float3(x1, y1, z0);
+	/* Compute normals to edges. */
+	float3 n0 = normalize(cross(v00, v10));
+	float3 n1 = normalize(cross(v10, v11));
+	float3 n2 = normalize(cross(v11, v01));
+	float3 n3 = normalize(cross(v01, v00));
+	/* Compute internal angles (gamma_i). */
+	float g0 = safe_acosf(-dot(n0, n1));
+	float g1 = safe_acosf(-dot(n1, n2));
+	float g2 = safe_acosf(-dot(n2, n3));
+	float g3 = safe_acosf(-dot(n3, n0));
+	/* Compute predefined constants. */
+	float b0 = n0.z;
+	float b1 = n2.z;
+	float b0sq = b0 * b0;
+	float k = M_2PI_F - g2 - g3;
+	/* Compute solid angle from internal angles. */
+	float S = g0 + g1 - k;
+
+	if(sample_coord) {
+		/* Compute cu. */
+		float au = randu * S + k;
+		float fu = (cosf(au) * b0 - b1) / sinf(au);
+		float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
+		cu = clamp(cu, -1.0f, 1.0f);
+		/* Compute xu; cu may be clamped to exactly +/-1, so keep the denominator away from 0. */
+		float xu = -(cu * z0) / fmaxf(sqrtf(1.0f - cu * cu), 1e-7f);
+		xu = clamp(xu, x0, x1);
+		/* Compute yv. */
+		float z0sq = z0 * z0;
+		float y0sq = y0 * y0;
+		float y1sq = y1 * y1;
+		float d = sqrtf(xu * xu + z0sq);
+		float h0 = y0 / sqrtf(d * d + y0sq);
+		float h1 = y1 / sqrtf(d * d + y1sq);
+		float hv = h0 + randv * (h1 - h0), hv2 = hv * hv;
+		float yv = (hv2 < 1.0f - 1e-6f) ? (hv * d) / sqrtf(1.0f - hv2) : y1;
+
+		/* Transform (xu, yv, z0) to world coords. */
+		*light_p = P + xu * x + yv * y + z0 * z;
+	}
+
+	/* return pdf */
+	if(S != 0.0f)
+		return 1.0f / S;
+	else
+		return 0.0f;
+}
+
 /* Background Light */
 
 #ifdef __BACKGROUND_MIS__
 
-ccl_device float3 background_light_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
+/* TODO(sergey): In theory it should be all fine to use noinline for all
+ * devices, but we're so close to the release so better not screw things
+ * up for CPU at least.
+ */
+#ifdef __KERNEL_GPU__
+ccl_device_noinline
+#else
+ccl_device
+#endif
+float3 background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
 {
 	/* for the following, the CDF values are actually a pair of floats, with the
 	 * function value as X and the actual CDF as Y.  The last entry's function
@@ -107,13 +208,19 @@ ccl_device float3 background_light_sample(KernelGlobals *kg, float randu, float
 	else
 		*pdf = (cdf_u.x * cdf_v.x)/(M_2PI_F * M_PI_F * sin_theta * denom);
 
-	*pdf *= kernel_data.integrator.pdf_lights;
-
 	/* compute direction */
-	return -equirectangular_to_direction(u, v);
+	return equirectangular_to_direction(u, v);
 }
 
-ccl_device float background_light_pdf(KernelGlobals *kg, float3 direction)
+/* TODO(sergey): Same as above, after the release we should consider using
+ * 'noinline' for all devices.
+ */
+#ifdef __KERNEL_GPU__
+ccl_device_noinline
+#else
+ccl_device
+#endif
+float background_map_pdf(KernelGlobals *kg, float3 direction)
 {
 	float2 uv = direction_to_equirectangular(direction);
 	int res = kernel_data.integrator.pdf_background_res;
@@ -139,9 +246,223 @@ ccl_device float background_light_pdf(KernelGlobals *kg, float3 direction)
 	float2 cdf_u = kernel_tex_fetch(__light_background_conditional_cdf, index_v * (res + 1) + index_u);
 	float2 cdf_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v);
 
-	float pdf = (cdf_u.x * cdf_v.x)/(M_2PI_F * M_PI_F * sin_theta * denom);
+	return (cdf_u.x * cdf_v.x)/(M_2PI_F * M_PI_F * sin_theta * denom);
+}
+
+/* Load position and direction of portal 'index' from __light_data.
+ * Returns true when P lies on the side of the portal that 'dir' points to. */
+ccl_device_inline bool background_portal_data_fetch_and_check_side(KernelGlobals *kg,
+                                                                   float3 P,
+                                                                   int index,
+                                                                   float3 *lightpos,
+                                                                   float3 *dir)
+{
+	int offset = (index + kernel_data.integrator.portal_offset)*LIGHT_SIZE;
+	float4 data0 = kernel_tex_fetch(__light_data, offset + 0);
+	float4 data3 = kernel_tex_fetch(__light_data, offset + 3);
+
+	*lightpos = make_float3(data0.y, data0.z, data0.w);
+	*dir = make_float3(data3.y, data3.z, data3.w);
+
+	/* The small positive threshold also rejects points lying (almost) in the portal plane. */
+	return (dot(*dir, P - *lightpos) > 1e-5f);
+}
+/* Portal-sampling pdf of 'direction' from P: area pdfs of the crossed portals, averaged over the sampleable portals. */
+ccl_device float background_portal_pdf(KernelGlobals *kg,
+                                       float3 P,              /* origin of the ray being evaluated */
+                                       float3 direction,      /* direction of the ray being evaluated */
+                                       int ignore_portal,     /* portal index to skip, callers pass -1 for none */
+                                       bool *is_possible)     /* optional (may be NULL): set to true if any portal passes the side check */
+{
+	float portal_pdf = 0.0f;
+	int num_possible = (ignore_portal >= 0)? 1: 0;  /* count the skipped portal too; the caller got it from background_portal_sample() */
+	for(int p = 0; p < kernel_data.integrator.num_portals; p++) {
+		if(p == ignore_portal)
+			continue;
+
+		float3 lightpos, dir;
+		if(!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
+			continue;
+
+		if(is_possible) {
+			/* There's a portal that could be sampled from this position. */
+			*is_possible = true;
+		}
+		num_possible++;
+		float t = -(dot(P, dir) - dot(lightpos, dir)) / dot(direction, dir);
+		if(t <= 1e-5f) {
+			/* Either behind the portal or too close. */
+			continue;
+		}
+
+		float4 data1 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 1);
+		float4 data2 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 2);
+
+		float3 axisu = make_float3(data1.y, data1.z, data1.w);
+		float3 axisv = make_float3(data2.y, data2.z, data2.w);
+
+		float3 hit = P + t*direction;
+		float3 inplane = hit - lightpos;
+		/* Skip if the ray doesn't pass through the portal. */
+		if(fabsf(dot(inplane, axisu) / dot(axisu, axisu)) > 0.5f)
+			continue;
+		if(fabsf(dot(inplane, axisv) / dot(axisv, axisv)) > 0.5f)
+			continue;
+
+		portal_pdf += area_light_sample(P, &lightpos, axisu, axisv, 0.0f, 0.0f, false);
+	}
 
-	return pdf * kernel_data.integrator.pdf_lights;
+	return (num_possible > 0)? portal_pdf / num_possible: 0.0f;  /* same divisor as background_portal_sample() */
+}
+
+/* Count how many portals pass the side check for P, i.e. how many
+ * background_portal_data_fetch_and_check_side() reports true for. */
+ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P)
+{
+	int count = 0;
+	float3 lightpos, dir;  /* outputs of the helper, not needed here */
+	for(int p = 0; p < kernel_data.integrator.num_portals; p++)
+		count += background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir)? 1: 0;
+	return count;
+}
+/* Pick one of the portals that pass the side check for P, sample a point on it and return the unit direction from P to it. */
+ccl_device float3 background_portal_sample(KernelGlobals *kg,
+                                           float3 P,
+                                           float randu,
+                                           float randv,              /* also selects the portal, then is re-normalized */
+                                           int num_possible,         /* number of portals passing the side check for P */
+                                           int *sampled_portal,      /* out: index of the portal that was sampled */
+                                           float *pdf)               /* out: area_light_sample() pdf divided by num_possible */
+{
+	/* Pick a portal, then re-normalize randv. */
+	randv *= num_possible;
+	int portal = (int)randv;
+	randv -= portal;
+
+	/* TODO(sergey): Some smarter way of finding portal to sample
+	 * is welcome.
+	 */
+	for(int p = 0; p < kernel_data.integrator.num_portals; p++) {
+		/* Search for the sampled portal. */
+		float3 lightpos, dir;
+		if(!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
+			continue;
+
+		if(portal == 0) {
+			/* p is the portal to be sampled. */
+			float4 data1 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 1);
+			float4 data2 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 2);
+			float3 axisu = make_float3(data1.y, data1.z, data1.w);
+			float3 axisv = make_float3(data2.y, data2.z, data2.w);
+
+			*pdf = area_light_sample(P, &lightpos,
+			                         axisu, axisv,
+			                         randu, randv,
+			                         true);
+
+			*pdf /= num_possible;
+			*sampled_portal = p;
+			return normalize(lightpos - P);
+		}
+
+		portal--;
+	}
+
+	return make_float3(0.0f, 0.0f, 0.0f);  /* no portal selected; *pdf and *sampled_portal are left unwritten */
+}
+/* Sample a background direction from P, via portals and/or the importance map, and store its combined pdf in *pdf. */
+ccl_device float3 background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf)
+{
+	/* Probability of sampling portals instead of the map. */
+	float portal_sampling_pdf = kernel_data.integrator.portal_pdf;
+
+	/* Check if there are portals in the scene which we can sample. */
+	if(portal_sampling_pdf > 0.0f) {
+		int num_portals = background_num_possible_portals(kg, P);
+		if(num_portals > 0) {
+			if(portal_sampling_pdf == 1.0f || randu < portal_sampling_pdf) {
+				if(portal_sampling_pdf < 1.0f) {
+					randu /= portal_sampling_pdf;
+				}
+				int portal;  /* written by background_portal_sample() */
+				float3 D = background_portal_sample(kg, P, randu, randv, num_portals, &portal, pdf);
+				if(num_portals > 1) {
+					/* Ignore the chosen portal, its pdf is already included. */
+					*pdf += background_portal_pdf(kg, P, D, portal, NULL);
+				}
+				/* We could also have sampled the map, so combine with MIS. */
+				if(portal_sampling_pdf < 1.0f) {
+					float cdf_pdf = background_map_pdf(kg, D);
+					*pdf = (portal_sampling_pdf * (*pdf)
+					     + (1.0f - portal_sampling_pdf) * cdf_pdf);
+				}
+				return D;
+			} else {
+				/* Sample map, but with nonzero portal_sampling_pdf for MIS. */
+				randu = (randu - portal_sampling_pdf) / (1.0f - portal_sampling_pdf);
+			}
+		} else {
+			/* We can't sample a portal.
+			 * Check if we can sample the map instead.
+			 */
+			if(portal_sampling_pdf == 1.0f) {
+				/* Use uniform as a fallback if we can't sample the map. */
+				*pdf = 1.0f / M_4PI_F;
+				return sample_uniform_sphere(randu, randv);
+			}
+			else {
+				portal_sampling_pdf = 0.0f;
+			}
+		}
+	}
+
+	float3 D = background_map_sample(kg, randu, randv, pdf);
+	/* Use MIS if portals could be sampled as well. */
+	if(portal_sampling_pdf > 0.0f) {
+		float portal_pdf = background_portal_pdf(kg, P, D, -1, NULL);
+		*pdf = (portal_sampling_pdf * portal_pdf
+		     + (1.0f - portal_sampling_pdf) * (*pdf));
+	}
+	return D;  /* *pdf is not scaled by integrator.pdf_lights in this function */
+}
+/* Pdf, scaled by integrator.pdf_lights, matching how background_light_sample() would generate 'direction' from P. */
+ccl_device float background_light_pdf(KernelGlobals *kg, float3 P, float3 direction)
+{
+	/* Probability of sampling portals instead of the map. */
+	float portal_sampling_pdf = kernel_data.integrator.portal_pdf;
+
+	if(portal_sampling_pdf > 0.0f) {
+		bool is_possible = false;
+		float portal_pdf = background_portal_pdf(kg, P, direction, -1, &is_possible);
+		if(portal_pdf == 0.0f) {
+			if(portal_sampling_pdf == 1.0f) {
+				/* If there are no possible portals at this point,
+				 * the fallback sampling would have been used.
+				 * Otherwise, the direction would not be sampled at all => pdf = 0
+				 */
+				return is_possible? 0.0f: kernel_data.integrator.pdf_lights / M_4PI_F;
+			}
+			else {
+				/* Map only; if portals are sampleable from P the map is picked with probability (1 - portal_sampling_pdf). */
+				return background_map_pdf(kg, direction) * (is_possible? 1.0f - portal_sampling_pdf: 1.0f) * kernel_data.integrator.pdf_lights;
+			}
+		} else {
+			if(portal_sampling_pdf == 1.0f) {
+				/* We can only sample portals. */
+				return portal_pdf * kernel_data.integrator.pdf_lights;
+			}
+			else {
+				/* We can sample both, so combine with MIS. */
+				return (background_map_pdf(kg, direction) * (1.0f - portal_sampling_pdf)
+				      + portal_pdf * portal_sampling_pdf) * kernel_data.integrator.pdf_lights;
+			}
+		}
+	}
+
+	/* No portals in the scene, so must sample the map.
+	 * At least one of them is always possible if we have a LIGHT_BACKGROUND.
+	 */
+	return background_map_pdf(kg, direction) * kernel_data.integrator.pdf_lights;
 }
 #endif
 
@@ -167,14 +488,6 @@ ccl_device float3 sphere_light_sample(float3 P, float3 center, float radius, flo
 	return disk_light_sample(normalize(P - center), randu, randv)*radius;
 }
 
-ccl_device float3 area_light_sample(float3 axisu, float3 axisv, float randu, float randv)
-{
-	randu = randu - 0.5f;
-	randv = randv - 0.5f;
-
-	return axisu*randu + axisv*randv;
-}
-
 ccl_device float spot_light_attenuation(float4 data1, float4 data2, LightSample *ls)
 {
 	float3 dir = make_float3(data2.y, data2.z, data2.w);
@@ -245,13 +558,14 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
 #ifdef __BACKGROUND_MIS__
 	else if(type == LIGHT_BACKGROUND) {
 		/* infinite area light (e.g. light dome or env light) */
-		float3 D = background_light_sample(kg, randu, randv, &ls->pdf);
+		float3 D = -background_light_sample(kg, P, randu, randv, &ls->pdf);
 
 		ls->P = D;
 		ls->Ng = D;
 		ls->D = -D;
 		ls->t = FLT_MAX;
 		ls->eval_fac = 1.0f;
+		ls->pdf *= kernel_data.integrator.pdf_lights;
 	}
 #endif
 	else {
@@ -276,6 +590,7 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
 				float4 data2 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 2);
 				ls->eval_fac *= spot_light_attenuation(data1, data2, ls);
 			}
+			ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 		}
 		else {
 			/* area light */
@@ -286,22 +601,31 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
 			float3 axisv = make_float3(data2.y, data2.z, data2.w);
 			float3 D = make_float3(data3.y, data3.z, data3.w);
 
-			ls->P += area_light_sample(axisu, axisv, randu, randv);
+			ls->pdf = area_light_sample(P, &ls->P,
+			                          axisu, axisv,
+			                          randu, randv,
+			                          true);
+
 			ls->Ng = D;
 			ls->D = normalize_len(ls->P - P, &ls->t);
 
 			float invarea = data2.x;
-
 			ls->eval_fac = 0.25f*invarea;
-			ls->pdf = invarea;
+
+			if(dot(ls->D, D) > 0.0f)
+				ls->pdf = 0.0f;
 		}
 
 		ls->eval_fac *= kernel_data.integrator.inv_pdf_lights;
-		ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 	}
 }
 
-ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, float t, LightSample *ls)
+#if defined(__KERNEL_CUDA__) && (__CUDA_ARCH__ >= 500) && (defined(i386) || defined(_M_IX86))
+ccl_device_noinline
+#else
+ccl_device
+#endif
+bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, float t, LightSample *ls)
 {
 	float4 data0 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 0);
 	float4 data1 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 1);
@@ -355,6 +679,7 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 		ls->D = D;
 		ls->t = FLT_MAX;
 
+		/* compute pdf */
 		float invarea = data1.w;
 		ls->pdf = invarea/(costheta*costheta*costheta);
 		ls->eval_fac = ls->pdf;
@@ -386,6 +711,10 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 			if(ls->eval_fac == 0.0f)
 				return false;
 		}
+
+		/* compute pdf */
+		if(ls->t != FLT_MAX)
+			ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 	}
 	else if(type == LIGHT_AREA) {
 		/* area light */
@@ -404,24 +733,20 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 		if(dot(D, Ng) >= 0.0f)
 			return false;
 
-		ls->P = make_float3(data0.y, data0.z, data0.w);
+		float3 light_P = make_float3(data0.y, data0.z, data0.w);
 
 		if(!ray_quad_intersect(P, D, t,
-			ls->P, axisu, axisv, &ls->P, &ls->t))
+			light_P, axisu, axisv, &ls->P, &ls->t))
 			return false;
 
 		ls->D = D;
 		ls->Ng = Ng;
-		ls->pdf = invarea;
-		ls->eval_fac = 0.25f*ls->pdf;
+		ls->pdf = area_light_sample(P, &light_P, axisu, axisv, 0, 0, false);
+		ls->eval_fac = 0.25f*invarea;
 	}
 	else
 		return false;
 
-	/* compute pdf */
-	if(ls->t != FLT_MAX)
-		ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
-
 	return true;
 }
 
@@ -514,7 +839,13 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float randt)
 
 /* Generic Light */
 
-ccl_device void light_sample(KernelGlobals *kg, float randt, float randu, float randv, float time, float3 P, LightSample *ls)
+ccl_device bool light_select_reached_max_bounces(KernelGlobals *kg, int index, int bounce)
+{
+	float4 data4 = kernel_tex_fetch(__light_data, index*LIGHT_SIZE + 4);
+	return (bounce > __float_as_int(data4.x));
+}
+
+ccl_device void light_sample(KernelGlobals *kg, float randt, float randu, float randv, float time, float3 P, int bounce, LightSample *ls)
 {
 	/* sample index */
 	int index = light_distribution_sample(kg, randt);
@@ -536,6 +867,12 @@ ccl_device void light_sample(KernelGlobals *kg, float randt, float randu, float
 	}
 	else {
 		int lamp = -prim-1;
+
+		if(UNLIKELY(light_select_reached_max_bounces(kg, lamp, bounce))) {
+			ls->pdf = 0.0f;
+			return;
+		}
+
 		lamp_light_sample(kg, lamp, randu, randv, P, ls);
 	}
 }
@@ -546,22 +883,5 @@ ccl_device int light_select_num_samples(KernelGlobals *kg, int index)
 	return __float_as_int(data3.x);
 }
 
-ccl_device int lamp_light_eval_sample(KernelGlobals *kg, float randt)
-{
-	/* sample index */
-	int index = light_distribution_sample(kg, randt);
-
-	/* fetch light data */
-	float4 l = kernel_tex_fetch(__light_distribution, index);
-	int prim = __float_as_int(l.y);
-
-	if(prim < 0) {
-		int lamp = -prim-1;
-		return lamp;
-	}
-	else
-		return LAMP_NONE;
-}
-
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h
index d95a5c76309..9e14d8cc7cb 100644
--- a/intern/cycles/kernel/kernel_math.h
+++ b/intern/cycles/kernel/kernel_math.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __KERNEL_MATH_H__
@@ -19,6 +19,7 @@
 
 #include "util_color.h"
 #include "util_math.h"
+#include "util_math_fast.h"
 #include "util_transform.h"
 #include "util_distort.h"
 
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index b3b6fc02894..20cf3fa931b 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -19,23 +19,49 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value)
 {
 	ccl_global float *buf = buffer;
+#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+	atomic_add_float(buf, value);
+#else
 	*buf = (sample == 0)? value: *buf + value;
+#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
 }
 
 ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value)
 {
+#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+	ccl_global float *buf_x = buffer + 0;
+	ccl_global float *buf_y = buffer + 1;
+	ccl_global float *buf_z = buffer + 2;
+
+	atomic_add_float(buf_x, value.x);
+	atomic_add_float(buf_y, value.y);
+	atomic_add_float(buf_z, value.z);
+#else
 	ccl_global float3 *buf = (ccl_global float3*)buffer;
 	*buf = (sample == 0)? value: *buf + value;
+#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
 }
 
 ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value)
 {
+#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+	ccl_global float *buf_x = buffer + 0;
+	ccl_global float *buf_y = buffer + 1;
+	ccl_global float *buf_z = buffer + 2;
+	ccl_global float *buf_w = buffer + 3;
+
+	atomic_add_float(buf_x, value.x);
+	atomic_add_float(buf_y, value.y);
+	atomic_add_float(buf_z, value.z);
+	atomic_add_float(buf_w, value.w);
+#else
 	ccl_global float4 *buf = (ccl_global float4*)buffer;
 	*buf = (sample == 0)? value: *buf + value;
+#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
 }
 
 ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
-	ShaderData *sd, int sample, PathState *state, float3 throughput)
+	ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
 {
 #ifdef __PASSES__
 	int path_flag = state->flag;
@@ -49,18 +75,18 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 		return;
 	
 	if(!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) {
-		if(!(sd->flag & SD_TRANSPARENT) ||
+		if(!(ccl_fetch(sd, flag) & SD_TRANSPARENT) ||
 		   kernel_data.film.pass_alpha_threshold == 0.0f ||
 		   average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold)
 		{
 
 			if(sample == 0) {
 				if(flag & PASS_DEPTH) {
-					float depth = camera_distance(kg, sd->P);
+					float depth = camera_distance(kg, ccl_fetch(sd, P));
 					kernel_write_pass_float(buffer + kernel_data.film.pass_depth, sample, depth);
 				}
 				if(flag & PASS_OBJECT_ID) {
-					float id = object_pass_id(kg, sd->object);
+					float id = object_pass_id(kg, ccl_fetch(sd, object));
 					kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, sample, id);
 				}
 				if(flag & PASS_MATERIAL_ID) {
@@ -70,7 +96,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 			}
 
 			if(flag & PASS_NORMAL) {
-				float3 normal = sd->N;
+				float3 normal = ccl_fetch(sd, N);
 				kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal);
 			}
 			if(flag & PASS_UV) {
@@ -101,8 +127,8 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 		float mist_start = kernel_data.film.mist_start;
 		float mist_inv_depth = kernel_data.film.mist_inv_depth;
 
-		float depth = camera_distance(kg, sd->P);
-		float mist = clamp((depth - mist_start)*mist_inv_depth, 0.0f, 1.0f);
+		float depth = camera_distance(kg, ccl_fetch(sd, P));
+		float mist = saturate((depth - mist_start)*mist_inv_depth);
 
 		/* falloff */
 		float mist_falloff = kernel_data.film.mist_falloff;
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index b8994d940fd..9794ad1d180 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifdef __OSL__
@@ -42,9 +42,14 @@
 #include "kernel_path_state.h"
 #include "kernel_shadow.h"
 #include "kernel_emission.h"
+#include "kernel_path_common.h"
 #include "kernel_path_surface.h"
 #include "kernel_path_volume.h"
 
+#ifdef __KERNEL_DEBUG__
+#include "kernel_debug.h"
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
@@ -113,7 +118,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 
 					/* direct light sampling */
 					kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
-						throughput, &state, L, 1.0f, all, &volume_ray, &volume_segment);
+						throughput, &state, L, all, &volume_ray, &volume_segment);
 
 					/* indirect sample. if we use distance sampling and take just
 					 * one sample for direct and indirect light, we could share
@@ -126,9 +131,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 						rphase, rscatter, &volume_segment, NULL, true);
 				}
 
-				if(result != VOLUME_PATH_SCATTERED)
-					throughput *= volume_segment.accum_transmittance;
-
 				/* free cached steps */
 				kernel_volume_decoupled_free(kg, &volume_segment);
 
@@ -138,6 +140,9 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 					else
 						break;
 				}
+				else {
+					throughput *= volume_segment.accum_transmittance;
+				}
 			}
 			else
 #endif
@@ -269,8 +274,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 				float bssrdf_u, bssrdf_v;
 				path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
 				subsurface_scatter_step(kg, &sd, state.flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
-
-				state.flag |= PATH_RAY_BSSRDF_ANCESTOR;
 			}
 		}
 #endif
@@ -303,17 +306,17 @@ ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *
 
 	sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
 
-	if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
+	if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
 		Ray light_ray;
 		float3 ao_shadow;
 
-		light_ray.P = ray_offset(sd->P, sd->Ng);
+		light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
 		light_ray.D = ao_D;
 		light_ray.t = kernel_data.background.ao_distance;
 #ifdef __OBJECT_MOTION__
-		light_ray.time = sd->time;
+		light_ray.time = ccl_fetch(sd, time);
 #endif
-		light_ray.dP = sd->dP;
+		light_ray.dP = ccl_fetch(sd, dP);
 		light_ray.dD = differential3_zero();
 
 		if(!shadow_blocked(kg, state, &light_ray, &ao_shadow))
@@ -321,78 +324,8 @@ ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *
 	}
 }
 
-ccl_device void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, float3 throughput)
-{
-	int num_samples = kernel_data.integrator.ao_samples;
-	float num_samples_inv = 1.0f/num_samples;
-	float ao_factor = kernel_data.background.ao_factor;
-	float3 ao_N;
-	float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
-	float3 ao_alpha = shader_bsdf_alpha(kg, sd);
-
-	for(int j = 0; j < num_samples; j++) {
-		float bsdf_u, bsdf_v;
-		path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
-		float3 ao_D;
-		float ao_pdf;
-
-		sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
-		if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
-			Ray light_ray;
-			float3 ao_shadow;
-
-			light_ray.P = ray_offset(sd->P, sd->Ng);
-			light_ray.D = ao_D;
-			light_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
-			light_ray.time = sd->time;
-#endif
-			light_ray.dP = sd->dP;
-			light_ray.dD = differential3_zero();
-
-			if(!shadow_blocked(kg, state, &light_ray, &ao_shadow))
-				path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
-		}
-	}
-}
-
 #ifdef __SUBSURFACE__
 
-#ifdef __VOLUME__
-ccl_device void kernel_path_subsurface_update_volume_stack(KernelGlobals *kg,
-                                                           Ray *ray,
-                                                           VolumeStack *stack)
-{
-	kernel_assert(kernel_data.integrator.use_volumes);
-
-	Ray volume_ray = *ray;
-	Intersection isect;
-	const float3 Pend = volume_ray.P + volume_ray.D*volume_ray.t;
-
-	while(
-		scene_intersect(kg, &volume_ray, PATH_RAY_ALL_VISIBILITY,
-		                &isect, NULL, 0.0f, 0.0f)) {
-		ShaderData sd;
-		shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0);
-		kernel_volume_stack_enter_exit(kg, &sd, stack);
-
-		/* Move ray forward. */
-		volume_ray.P = ray_offset(sd.P, -sd.Ng);
-		volume_ray.D = normalize_len(Pend - volume_ray.P,
-		                             &volume_ray.t);
-
-		/* TODO(sergey): Find a faster way detecting that ray_offset moved
-		 * us pass through the end point.
-		 */
-		if(dot(ray->D, volume_ray.D) < 0.0f) {
-			break;
-		}
-	}
-}
-#endif
-
 ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, Ray *ray, float3 *throughput)
 {
 	float bssrdf_probability;
@@ -411,6 +344,8 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
 		int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
 #ifdef __VOLUME__
 		Ray volume_ray = *ray;
+		bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
+		                                ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
 #endif
 
 		/* compute lighting with the BSDF closure */
@@ -419,7 +354,6 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
 			PathState hit_state = *state;
 			Ray hit_ray = *ray;
 
-			hit_state.flag |= PATH_RAY_BSSRDF_ANCESTOR;
 			hit_state.rng_offset += PRNG_BOUNCE_NUM;
 			
 			kernel_path_surface_connect_light(kg, rng, &bssrdf_sd[hit], tp, state, L);
@@ -430,12 +364,12 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
 #endif
 
 #ifdef __VOLUME__
-				if(kernel_data.integrator.use_volumes) {
+				if(need_update_volume_stack) {
 					/* Setup ray from previous surface point to the new one. */
 					volume_ray.D = normalize_len(hit_ray.P - volume_ray.P,
 					                             &volume_ray.t);
 
-					kernel_path_subsurface_update_volume_stack(
+					kernel_volume_stack_update_for_subsurface(
 					    kg,
 					    &volume_ray,
 					    hit_state.volume_stack);
@@ -471,6 +405,11 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 	PathState state;
 	path_state_init(kg, &state, rng, sample, &ray);
 
+#ifdef __KERNEL_DEBUG__
+	DebugData debug_data;
+	debug_data_init(&debug_data);
+#endif
+
 	/* path iteration */
 	for(;;) {
 		/* intersect scene */
@@ -497,6 +436,14 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 		bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
 #endif
 
+#ifdef __KERNEL_DEBUG__
+		if(state.flag & PATH_RAY_CAMERA) {
+			debug_data.num_bvh_traversal_steps += isect.num_traversal_steps;
+			debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
+		}
+		debug_data.num_ray_bounces++;
+#endif
+
 #ifdef __LAMP_MIS__
 		if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
 			/* ray starting from previous non-transparent bounce */
@@ -553,7 +500,7 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 
 					/* direct light sampling */
 					kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
-						throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
+						throughput, &state, &L, all, &volume_ray, &volume_segment);
 
 					/* indirect sample. if we use distance sampling and take just
 					 * one sample for direct and indirect light, we could share
@@ -566,9 +513,6 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 						rphase, rscatter, &volume_segment, NULL, true);
 				}
 
-				if(result != VOLUME_PATH_SCATTERED)
-					throughput *= volume_segment.accum_transmittance;
-
 				/* free cached steps */
 				kernel_volume_decoupled_free(kg, &volume_segment);
 
@@ -578,6 +522,9 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 					else
 						break;
 				}
+				else {
+					throughput *= volume_segment.accum_transmittance;
+				}
 			}
 			else 
 #endif
@@ -717,460 +664,13 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 
 	kernel_write_light_passes(kg, buffer, &L, sample);
 
-	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
-}
-
-#ifdef __BRANCHED_PATH__
-
-/* branched path tracing: bounce off surface and integrate indirect light */
-ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
-	RNG *rng, ShaderData *sd, float3 throughput, float num_samples_adjust,
-	PathState *state, PathRadiance *L)
-{
-	for(int i = 0; i< sd->num_closure; i++) {
-		const ShaderClosure *sc = &sd->closure[i];
-
-		if(!CLOSURE_IS_BSDF(sc->type))
-			continue;
-		/* transparency is not handled here, but in outer loop */
-		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
-			continue;
-
-		int num_samples;
-
-		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
-			num_samples = kernel_data.integrator.diffuse_samples;
-		else if(CLOSURE_IS_BSDF_BSSRDF(sc->type))
-			num_samples = 1;
-		else if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
-			num_samples = kernel_data.integrator.glossy_samples;
-		else
-			num_samples = kernel_data.integrator.transmission_samples;
-
-		num_samples = ceil_to_int(num_samples_adjust*num_samples);
-
-		float num_samples_inv = num_samples_adjust/num_samples;
-		RNG bsdf_rng = cmj_hash(*rng, i);
-
-		for(int j = 0; j < num_samples; j++) {
-			PathState ps = *state;
-			float3 tp = throughput;
-			Ray bsdf_ray;
-
-			if(!kernel_branched_path_surface_bounce(kg, &bsdf_rng, sd, sc, j, num_samples, &tp, &ps, L, &bsdf_ray))
-				continue;
-
-			kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L);
-
-			/* for render passes, sum and reset indirect light pass variables
-			 * for the next samples */
-			path_radiance_sum_indirect(L);
-			path_radiance_reset_indirect(L);
-		}
-	}
-}
-
-#ifdef __SUBSURFACE__
-ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
-                                                        ShaderData *sd,
-                                                        PathRadiance *L,
-                                                        PathState *state,
-                                                        RNG *rng,
-                                                        Ray *ray,
-                                                        float3 throughput)
-{
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
-
-		if(!CLOSURE_IS_BSSRDF(sc->type))
-			continue;
-
-		/* set up random number generator */
-		uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
-		int num_samples = kernel_data.integrator.subsurface_samples;
-		float num_samples_inv = 1.0f/num_samples;
-		RNG bssrdf_rng = cmj_hash(*rng, i);
-
-		state->flag |= PATH_RAY_BSSRDF_ANCESTOR;
-
-		/* do subsurface scatter step with copy of shader data, this will
-		 * replace the BSSRDF with a diffuse BSDF closure */
-		for(int j = 0; j < num_samples; j++) {
-			ShaderData bssrdf_sd[BSSRDF_MAX_HITS];
-			float bssrdf_u, bssrdf_v;
-			path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-			int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
-#ifdef __VOLUME__
-			Ray volume_ray = *ray;
-#endif
-
-			/* compute lighting with the BSDF closure */
-			for(int hit = 0; hit < num_hits; hit++) {
-				PathState hit_state = *state;
-
-				path_state_branch(&hit_state, j, num_samples);
-
-#ifdef __VOLUME__
-				if(kernel_data.integrator.use_volumes) {
-					/* Setup ray from previous surface point to the new one. */
-					float3 P = ray_offset(bssrdf_sd[hit].P, -bssrdf_sd[hit].Ng);
-					volume_ray.D = normalize_len(P - volume_ray.P,
-					                             &volume_ray.t);
-
-					kernel_path_subsurface_update_volume_stack(
-					    kg,
-					    &volume_ray,
-					    hit_state.volume_stack);
-
-					/* Move volume ray forward. */
-					volume_ray.P = P;
-				}
-#endif
-
-#if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
-				/* direct light */
-				if(kernel_data.integrator.use_direct_light) {
-					bool all = kernel_data.integrator.sample_all_lights_direct;
-					kernel_branched_path_surface_connect_light(kg, rng,
-						&bssrdf_sd[hit], &hit_state, throughput, num_samples_inv, L, all);
-				}
-#endif
-
-				/* indirect light */
-				kernel_branched_path_surface_indirect_light(kg, rng,
-					&bssrdf_sd[hit], throughput, num_samples_inv,
-					&hit_state, L);
-			}
-		}
-
-		state->flag &= ~PATH_RAY_BSSRDF_ANCESTOR;
-	}
-}
-#endif
-
-ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
-{
-	/* initialize */
-	PathRadiance L;
-	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-	float L_transparent = 0.0f;
-
-	path_radiance_init(&L, kernel_data.film.use_light_pass);
-
-	PathState state;
-	path_state_init(kg, &state, rng, sample, &ray);
-
-	for(;;) {
-		/* intersect scene */
-		Intersection isect;
-		uint visibility = path_state_ray_visibility(kg, &state);
-
-#ifdef __HAIR__
-		float difl = 0.0f, extmax = 0.0f;
-		uint lcg_state = 0;
-
-		if(kernel_data.bvh.have_curves) {
-			if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {	
-				float3 pixdiff = ray.dD.dx + ray.dD.dy;
-				/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
-				difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
-			}
-
-			extmax = kernel_data.curve.maximum_width;
-			lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
-		}
-
-		bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax);
-#else
-		bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#endif
-
-#ifdef __VOLUME__
-		/* volume attenuation, emission, scatter */
-		if(state.volume_stack[0].shader != SHADER_NONE) {
-			Ray volume_ray = ray;
-			volume_ray.t = (hit)? isect.t: FLT_MAX;
-			
-			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
-
-#ifdef __VOLUME_DECOUPLED__
-			/* decoupled ray marching only supported on CPU */
-
-			/* cache steps along volume for repeated sampling */
-			VolumeSegment volume_segment;
-			ShaderData volume_sd;
-
-			shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
-			kernel_volume_decoupled_record(kg, &state,
-				&volume_ray, &volume_sd, &volume_segment, heterogeneous);
-
-			/* direct light sampling */
-			if(volume_segment.closure_flag & SD_SCATTER) {
-				volume_segment.sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
-
-				bool all = kernel_data.integrator.sample_all_lights_direct;
-
-				kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
-					throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
-
-				/* indirect light sampling */
-				int num_samples = kernel_data.integrator.volume_samples;
-				float num_samples_inv = 1.0f/num_samples;
-
-				for(int j = 0; j < num_samples; j++) {
-					/* workaround to fix correlation bug in T38710, can find better solution
-					 * in random number generator later, for now this is done here to not impact
-					 * performance of rendering without volumes */
-					RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
-
-					PathState ps = state;
-					Ray pray = ray;
-					float3 tp = throughput;
-
-					/* branch RNG state */
-					path_state_branch(&ps, j, num_samples);
-
-					/* scatter sample. if we use distance sampling and take just one
-					 * sample for direct and indirect light, we could share this
-					 * computation, but makes code a bit complex */
-					float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE);
-					float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE);
-
-					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
-						&ps, &pray, &volume_sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
-						
-					(void)result;
-					kernel_assert(result == VOLUME_PATH_SCATTERED);
-
-					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
-						kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
-
-						/* for render passes, sum and reset indirect light pass variables
-						 * for the next samples */
-						path_radiance_sum_indirect(&L);
-						path_radiance_reset_indirect(&L);
-					}
-				}
-			}
-
-			/* emission and transmittance */
-			if(volume_segment.closure_flag & SD_EMISSION)
-				path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
-			throughput *= volume_segment.accum_transmittance;
-
-			/* free cached steps */
-			kernel_volume_decoupled_free(kg, &volume_segment);
-#else
-			/* GPU: no decoupled ray marching, scatter probalistically */
-			int num_samples = kernel_data.integrator.volume_samples;
-			float num_samples_inv = 1.0f/num_samples;
-
-			/* todo: we should cache the shader evaluations from stepping
-			 * through the volume, for now we redo them multiple times */
-
-			for(int j = 0; j < num_samples; j++) {
-				PathState ps = state;
-				Ray pray = ray;
-				ShaderData volume_sd;
-				float3 tp = throughput * num_samples_inv;
-
-				/* branch RNG state */
-				path_state_branch(&ps, j, num_samples);
-
-				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &ps, &volume_sd, &volume_ray, &L, &tp, rng, heterogeneous);
-				
-#ifdef __VOLUME_SCATTER__
-				if(result == VOLUME_PATH_SCATTERED) {
-					/* todo: support equiangular, MIS and all light sampling.
-					 * alternatively get decoupled ray marching working on the GPU */
-					kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L);
-
-					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
-						kernel_path_indirect(kg, rng, pray, tp, num_samples, ps, &L);
-
-						/* for render passes, sum and reset indirect light pass variables
-						 * for the next samples */
-						path_radiance_sum_indirect(&L);
-						path_radiance_reset_indirect(&L);
-					}
-				}
-#endif
-			}
-
-			/* todo: avoid this calculation using decoupled ray marching */
-			kernel_volume_shadow(kg, &state, &volume_ray, &throughput);
-#endif
-		}
-#endif
-
-		if(!hit) {
-			/* eval background shader if nothing hit */
-			if(kernel_data.background.transparent) {
-				L_transparent += average(throughput);
-
-#ifdef __PASSES__
-				if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif
-					break;
-			}
-
-#ifdef __BACKGROUND__
-			/* sample background shader */
-			float3 L_background = indirect_background(kg, &state, &ray);
-			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
-#endif
-
-			break;
-		}
-
-		/* setup shading */
-		ShaderData sd;
-		shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce);
-		shader_eval_surface(kg, &sd, 0.0f, state.flag, SHADER_CONTEXT_MAIN);
-		shader_merge_closures(&sd);
-
-		/* holdout */
-#ifdef __HOLDOUT__
-		if(sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK)) {
-			if(kernel_data.background.transparent) {
-				float3 holdout_weight;
-				
-				if(sd.flag & SD_HOLDOUT_MASK)
-					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
-				else
-					holdout_weight = shader_holdout_eval(kg, &sd);
-
-				/* any throughput is ok, should all be identical here */
-				L_transparent += average(holdout_weight*throughput);
-			}
-
-			if(sd.flag & SD_HOLDOUT_MASK)
-				break;
-		}
-#endif
-
-		/* holdout mask objects do not write data passes */
-		kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
-
-#ifdef __EMISSION__
-		/* emission */
-		if(sd.flag & SD_EMISSION) {
-			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
-			path_radiance_accum_emission(&L, throughput, emission, state.bounce);
-		}
-#endif
-
-		/* transparency termination */
-		if(state.flag & PATH_RAY_TRANSPARENT) {
-			/* path termination. this is a strange place to put the termination, it's
-			 * mainly due to the mixed in MIS that we use. gives too many unneeded
-			 * shader evaluations, only need emission if we are going to terminate */
-			float probability = path_state_terminate_probability(kg, &state, throughput);
-
-			if(probability == 0.0f) {
-				break;
-			}
-			else if(probability != 1.0f) {
-				float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
-
-				if(terminate >= probability)
-					break;
-
-				throughput /= probability;
-			}
-		}
-
-#ifdef __AO__
-		/* ambient occlusion */
-		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
-			kernel_branched_path_ao(kg, &sd, &L, &state, rng, throughput);
-		}
-#endif
-
-#ifdef __SUBSURFACE__
-		/* bssrdf scatter to a different location on the same object */
-		if(sd.flag & SD_BSSRDF) {
-			kernel_branched_path_subsurface_scatter(kg, &sd, &L, &state,
-			                                        rng, &ray, throughput);
-		}
-#endif
-
-		if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
-			PathState hit_state = state;
-
-#ifdef __EMISSION__
-			/* direct light */
-			if(kernel_data.integrator.use_direct_light) {
-				bool all = kernel_data.integrator.sample_all_lights_direct;
-				kernel_branched_path_surface_connect_light(kg, rng,
-					&sd, &hit_state, throughput, 1.0f, &L, all);
-			}
-#endif
-
-			/* indirect light */
-			kernel_branched_path_surface_indirect_light(kg, rng,
-				&sd, throughput, 1.0f, &hit_state, &L);
-
-			/* continue in case of transparency */
-			throughput *= shader_bsdf_transparency(kg, &sd);
-
-			if(is_zero(throughput))
-				break;
-		}
-
-		path_state_next(kg, &state, LABEL_TRANSPARENT);
-		ray.P = ray_offset(sd.P, -sd.Ng);
-		ray.t -= sd.ray_length; /* clipping works through transparent */
-
-
-#ifdef __RAY_DIFFERENTIALS__
-		ray.dP = sd.dP;
-		ray.dD.dx = -sd.dI.dx;
-		ray.dD.dy = -sd.dI.dy;
-#endif
-
-#ifdef __VOLUME__
-		/* enter/exit volume */
-		kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
+#ifdef __KERNEL_DEBUG__
+	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
 #endif
-	}
-
-	float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
-
-	kernel_write_light_passes(kg, buffer, &L, sample);
 
 	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
 }
 
-#endif
-
-ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int x, int y, RNG *rng, Ray *ray)
-{
-	float filter_u;
-	float filter_v;
-
-	int num_samples = kernel_data.integrator.aa_samples;
-
-	path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v);
-
-	/* sample camera ray */
-
-	float lens_u = 0.0f, lens_v = 0.0f;
-
-	if(kernel_data.cam.aperturesize > 0.0f)
-		path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
-
-	float time = 0.0f;
-
-#ifdef __CAMERA_MOTION__
-	if(kernel_data.cam.shuttertime != -1.0f)
-		time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME);
-#endif
-
-	camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
-}
-
 ccl_device void kernel_path_trace(KernelGlobals *kg,
 	ccl_global float *buffer, ccl_global uint *rng_state,
 	int sample, int x, int y, int offset, int stride)
@@ -1202,38 +702,5 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
 	path_rng_end(kg, rng_state, rng);
 }
 
-#ifdef __BRANCHED_PATH__
-ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
-	ccl_global float *buffer, ccl_global uint *rng_state,
-	int sample, int x, int y, int offset, int stride)
-{
-	/* buffer offset */
-	int index = offset + x + y*stride;
-	int pass_stride = kernel_data.film.pass_stride;
-
-	rng_state += index;
-	buffer += index*pass_stride;
-
-	/* initialize random numbers and ray */
-	RNG rng;
-	Ray ray;
-
-	kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
-
-	/* integrate */
-	float4 L;
-
-	if(ray.t != 0.0f)
-		L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer);
-	else
-		L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
-	/* accumulate result in output buffer */
-	kernel_write_pass_float4(buffer, sample, L);
-
-	path_rng_end(kg, rng_state, rng);
-}
-#endif
-
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
new file mode 100644
index 00000000000..b6d64985f6a
--- /dev/null
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -0,0 +1,534 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __BRANCHED_PATH__
+/* Branched-path ambient occlusion: trace up to ao_samples occlusion rays from the shading point and accumulate every unblocked one into L, weighted by throughput/ao_samples. */
+ccl_device void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, float3 throughput)
+{
+	int num_samples = kernel_data.integrator.ao_samples;
+	float num_samples_inv = 1.0f/num_samples; /* per-sample weight, only read inside the loop below */
+	float ao_factor = kernel_data.background.ao_factor;
+	float3 ao_N;
+	float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
+	float3 ao_alpha = shader_bsdf_alpha(kg, sd);
+
+	for(int j = 0; j < num_samples; j++) {
+		float bsdf_u, bsdf_v;
+		path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+
+		float3 ao_D;
+		float ao_pdf;
+
+		sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
+		/* skip directions that point below the geometric normal or have zero pdf */
+		if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+			Ray light_ray;
+			float3 ao_shadow;
+
+			light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+			light_ray.D = ao_D;
+			light_ray.t = kernel_data.background.ao_distance; /* ray length is limited to the AO distance */
+#ifdef __OBJECT_MOTION__
+			light_ray.time = ccl_fetch(sd, time);
+#endif
+			light_ray.dP = ccl_fetch(sd, dP);
+			light_ray.dD = differential3_zero();
+			/* only rays that are not blocked contribute */
+			if(!shadow_blocked(kg, state, &light_ray, &ao_shadow))
+				path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+		}
+	}
+}
+
+/* bounce off surface and integrate indirect light: every BSDF closure except transparent
+ * ones takes a closure-dependent number of bounce samples, each traced by kernel_path_indirect() */
+ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
+	RNG *rng, ShaderData *sd, float3 throughput, float num_samples_adjust,
+	PathState *state, PathRadiance *L)
+{
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		const ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
+		/* only BSDF closures are sampled here */
+		if(!CLOSURE_IS_BSDF(sc->type))
+			continue;
+		/* transparency is not handled here, but in outer loop */
+		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
+			continue;
+		/* the base sample count depends on the closure type */
+		int num_samples;
+
+		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
+			num_samples = kernel_data.integrator.diffuse_samples;
+		else if(CLOSURE_IS_BSDF_BSSRDF(sc->type))
+			num_samples = 1;
+		else if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
+			num_samples = kernel_data.integrator.glossy_samples;
+		else
+			num_samples = kernel_data.integrator.transmission_samples;
+		/* scale the count by num_samples_adjust and convert back to an integer count */
+		num_samples = ceil_to_int(num_samples_adjust*num_samples);
+		/* per-sample weight: summed over num_samples samples it equals num_samples_adjust */
+		float num_samples_inv = num_samples_adjust/num_samples;
+		RNG bsdf_rng = cmj_hash(*rng, i); /* RNG derived from the closure index, used for the bounce only */
+
+		for(int j = 0; j < num_samples; j++) {
+			PathState ps = *state; /* every sample works on its own copy of state and throughput */
+			float3 tp = throughput;
+			Ray bsdf_ray;
+
+			if(!kernel_branched_path_surface_bounce(kg, &bsdf_rng, sd, sc, j, num_samples, &tp, &ps, L, &bsdf_ray))
+				continue;
+
+			kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L);
+
+			/* for render passes, sum and reset indirect light pass variables
+			 * for the next samples */
+			path_radiance_sum_indirect(L);
+			path_radiance_reset_indirect(L);
+		}
+	}
+}
+/* Branched-path subsurface scattering: each BSSRDF closure takes subsurface_samples scatter steps, and every resulting hit gets direct and indirect light with 1/subsurface_samples as sample adjust. */
+#ifdef __SUBSURFACE__
+ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
+                                                        ShaderData *sd,
+                                                        PathRadiance *L,
+                                                        PathState *state,
+                                                        RNG *rng,
+                                                        Ray *ray,
+                                                        float3 throughput)
+{
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
+		/* closures other than BSSRDF are ignored by this function */
+		if(!CLOSURE_IS_BSSRDF(sc->type))
+			continue;
+
+		/* set up random number generator */
+		uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
+		int num_samples = kernel_data.integrator.subsurface_samples;
+		float num_samples_inv = 1.0f/num_samples; /* passed below as the num_samples_adjust factor */
+		RNG bssrdf_rng = cmj_hash(*rng, i);
+
+		/* do subsurface scatter step with copy of shader data, this will
+		 * replace the BSSRDF with a diffuse BSDF closure */
+		for(int j = 0; j < num_samples; j++) {
+			ShaderData bssrdf_sd[BSSRDF_MAX_HITS];
+			float bssrdf_u, bssrdf_v;
+			path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+			int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
+#ifdef __VOLUME__
+			Ray volume_ray = *ray; /* local copy that is advanced from hit to hit below */
+			bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
+			                                ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
+#endif
+
+			/* compute lighting with the BSDF closure */
+			for(int hit = 0; hit < num_hits; hit++) {
+				PathState hit_state = *state;
+				/* branch RNG state */
+				path_state_branch(&hit_state, j, num_samples);
+
+#ifdef __VOLUME__
+				if(need_update_volume_stack) {
+					/* Setup ray from previous surface point to the new one. */
+					float3 P = ray_offset(bssrdf_sd[hit].P, -bssrdf_sd[hit].Ng);
+					volume_ray.D = normalize_len(P - volume_ray.P,
+					                             &volume_ray.t);
+
+					kernel_volume_stack_update_for_subsurface(
+					    kg,
+					    &volume_ray,
+					    hit_state.volume_stack);
+
+					/* Move volume ray forward. */
+					volume_ray.P = P;
+				}
+#endif
+
+#ifdef __EMISSION__
+				/* direct light */
+				if(kernel_data.integrator.use_direct_light) {
+					bool all = kernel_data.integrator.sample_all_lights_direct;
+					kernel_branched_path_surface_connect_light(kg, rng,
+						&bssrdf_sd[hit], &hit_state, throughput, num_samples_inv, L, all);
+				}
+#endif
+
+				/* indirect light */
+				kernel_branched_path_surface_indirect_light(kg, rng,
+					&bssrdf_sd[hit], throughput, num_samples_inv,
+					&hit_state, L);
+			}
+		}
+	}
+}
+#endif
+/* Branched path integrator for one camera ray: follows the ray through transparent hits, branching into direct and indirect light samples at each one; returns the summed radiance in xyz and 1 - L_transparent in w. */
+ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
+{
+	/* initialize */
+	PathRadiance L;
+	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+	float L_transparent = 0.0f;
+
+	path_radiance_init(&L, kernel_data.film.use_light_pass);
+
+	PathState state;
+	path_state_init(kg, &state, rng, sample, &ray);
+
+#ifdef __KERNEL_DEBUG__
+	DebugData debug_data;
+	debug_data_init(&debug_data);
+#endif
+
+	/* Main Loop
+	 * Here we only handle transparency intersections from the camera ray.
+	 * Indirect bounces are handled in kernel_branched_path_surface_indirect_light().
+	 */
+	for(;;) {
+		/* intersect scene */
+		Intersection isect;
+		uint visibility = path_state_ray_visibility(kg, &state);
+
+#ifdef __HAIR__
+		float difl = 0.0f, extmax = 0.0f;
+		uint lcg_state = 0;
+
+		if(kernel_data.bvh.have_curves) {
+			if(kernel_data.cam.resolution == 1) {
+				float3 pixdiff = ray.dD.dx + ray.dD.dy;
+				/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
+				difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
+			}
+
+			extmax = kernel_data.curve.maximum_width;
+			lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
+		}
+
+		bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax);
+#else
+		bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
+#endif
+
+#ifdef __KERNEL_DEBUG__
+		debug_data.num_bvh_traversal_steps += isect.num_traversal_steps;
+		debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
+		debug_data.num_ray_bounces++;
+#endif
+
+#ifdef __VOLUME__
+		/* volume attenuation, emission, scatter */
+		if(state.volume_stack[0].shader != SHADER_NONE) {
+			Ray volume_ray = ray;
+			volume_ray.t = (hit)? isect.t: FLT_MAX;
+			/* heterogeneous is forwarded to the volume integration calls below */
+			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
+
+#ifdef __VOLUME_DECOUPLED__
+			/* decoupled ray marching only supported on CPU */
+
+			/* cache steps along volume for repeated sampling */
+			VolumeSegment volume_segment;
+			ShaderData volume_sd;
+
+			shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
+			kernel_volume_decoupled_record(kg, &state,
+				&volume_ray, &volume_sd, &volume_segment, heterogeneous);
+
+			/* direct light sampling */
+			if(volume_segment.closure_flag & SD_SCATTER) {
+				volume_segment.sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
+
+				bool all = kernel_data.integrator.sample_all_lights_direct;
+
+				kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
+					throughput, &state, &L, all, &volume_ray, &volume_segment);
+
+				/* indirect light sampling */
+				int num_samples = kernel_data.integrator.volume_samples;
+				float num_samples_inv = 1.0f/num_samples;
+
+				for(int j = 0; j < num_samples; j++) {
+					/* workaround to fix correlation bug in T38710, can find better solution
+					 * in random number generator later, for now this is done here to not impact
+					 * performance of rendering without volumes */
+					RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
+
+					PathState ps = state;
+					Ray pray = ray;
+					float3 tp = throughput;
+
+					/* branch RNG state */
+					path_state_branch(&ps, j, num_samples);
+
+					/* scatter sample. if we use distance sampling and take just one
+					 * sample for direct and indirect light, we could share this
+					 * computation, but makes code a bit complex */
+					float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE);
+					float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE);
+
+					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+						&ps, &pray, &volume_sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
+					/* result is only read by the assert below */
+					(void)result;
+					kernel_assert(result == VOLUME_PATH_SCATTERED);
+
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
+						kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
+
+						/* for render passes, sum and reset indirect light pass variables
+						 * for the next samples */
+						path_radiance_sum_indirect(&L);
+						path_radiance_reset_indirect(&L);
+					}
+				}
+			}
+
+			/* emission and transmittance */
+			if(volume_segment.closure_flag & SD_EMISSION)
+				path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+			throughput *= volume_segment.accum_transmittance;
+
+			/* free cached steps */
+			kernel_volume_decoupled_free(kg, &volume_segment);
+#else
+			/* GPU: no decoupled ray marching, scatter probabilistically */
+			int num_samples = kernel_data.integrator.volume_samples;
+			float num_samples_inv = 1.0f/num_samples;
+
+			/* todo: we should cache the shader evaluations from stepping
+			 * through the volume, for now we redo them multiple times */
+
+			for(int j = 0; j < num_samples; j++) {
+				PathState ps = state;
+				Ray pray = ray;
+				ShaderData volume_sd;
+				float3 tp = throughput * num_samples_inv;
+
+				/* branch RNG state */
+				path_state_branch(&ps, j, num_samples);
+
+				VolumeIntegrateResult result = kernel_volume_integrate(
+					kg, &ps, &volume_sd, &volume_ray, &L, &tp, rng, heterogeneous);
+				/* scattered samples continue with direct and indirect light (needs __VOLUME_SCATTER__) */
+#ifdef __VOLUME_SCATTER__
+				if(result == VOLUME_PATH_SCATTERED) {
+					/* todo: support equiangular, MIS and all light sampling.
+					 * alternatively get decoupled ray marching working on the GPU */
+					kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L); /* NOTE(review): passes state, not the branched copy ps - confirm intended */
+
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
+						kernel_path_indirect(kg, rng, pray, tp, num_samples, ps, &L);
+
+						/* for render passes, sum and reset indirect light pass variables
+						 * for the next samples */
+						path_radiance_sum_indirect(&L);
+						path_radiance_reset_indirect(&L);
+					}
+				}
+#endif
+			}
+
+			/* todo: avoid this calculation using decoupled ray marching */
+			kernel_volume_shadow(kg, &state, &volume_ray, &throughput);
+#endif
+		}
+#endif
+
+		if(!hit) {
+			/* eval background shader if nothing hit */
+			if(kernel_data.background.transparent) {
+				L_transparent += average(throughput);
+
+#ifdef __PASSES__
+				if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
+#endif
+					break;
+			}
+
+#ifdef __BACKGROUND__
+			/* sample background shader */
+			float3 L_background = indirect_background(kg, &state, &ray);
+			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
+#endif
+
+			break;
+		}
+
+		/* setup shading */
+		ShaderData sd;
+		shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce);
+		shader_eval_surface(kg, &sd, 0.0f, state.flag, SHADER_CONTEXT_MAIN);
+		shader_merge_closures(&sd);
+
+		/* holdout */
+#ifdef __HOLDOUT__
+		if(sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK)) {
+			if(kernel_data.background.transparent) {
+				float3 holdout_weight;
+				/* holdout masks get a weight of one, other holdouts evaluate the shader */
+				if(sd.flag & SD_HOLDOUT_MASK)
+					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
+				else
+					holdout_weight = shader_holdout_eval(kg, &sd);
+
+				/* any throughput is ok, should all be identical here */
+				L_transparent += average(holdout_weight*throughput);
+			}
+
+			if(sd.flag & SD_HOLDOUT_MASK)
+				break;
+		}
+#endif
+
+		/* holdout mask objects do not write data passes */
+		kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
+
+#ifdef __EMISSION__
+		/* emission */
+		if(sd.flag & SD_EMISSION) {
+			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
+			path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+		}
+#endif
+
+		/* transparency termination */
+		if(state.flag & PATH_RAY_TRANSPARENT) {
+			/* path termination. this is a strange place to put the termination, it's
+			 * mainly due to the mixed in MIS that we use. gives too many unneeded
+			 * shader evaluations, only need emission if we are going to terminate */
+			float probability = path_state_terminate_probability(kg, &state, throughput);
+
+			if(probability == 0.0f) {
+				break;
+			}
+			else if(probability != 1.0f) {
+				float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
+
+				if(terminate >= probability)
+					break;
+
+				throughput /= probability;
+			}
+		}
+
+#ifdef __AO__
+		/* ambient occlusion */
+		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
+			kernel_branched_path_ao(kg, &sd, &L, &state, rng, throughput);
+		}
+#endif
+
+#ifdef __SUBSURFACE__
+		/* bssrdf scatter to a different location on the same object */
+		if(sd.flag & SD_BSSRDF) {
+			kernel_branched_path_subsurface_scatter(kg, &sd, &L, &state,
+			                                        rng, &ray, throughput);
+		}
+#endif
+
+		if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
+			PathState hit_state = state;
+
+#ifdef __EMISSION__
+			/* direct light */
+			if(kernel_data.integrator.use_direct_light) {
+				bool all = kernel_data.integrator.sample_all_lights_direct;
+				kernel_branched_path_surface_connect_light(kg, rng,
+					&sd, &hit_state, throughput, 1.0f, &L, all);
+			}
+#endif
+
+			/* indirect light */
+			kernel_branched_path_surface_indirect_light(kg, rng,
+				&sd, throughput, 1.0f, &hit_state, &L);
+
+			/* continue in case of transparency */
+			throughput *= shader_bsdf_transparency(kg, &sd);
+
+			if(is_zero(throughput))
+				break;
+		}
+
+		/* Update Path State: the continued camera ray is flagged and counted as a transparent bounce */
+		state.flag |= PATH_RAY_TRANSPARENT;
+		state.transparent_bounce++;
+
+		ray.P = ray_offset(sd.P, -sd.Ng);
+		ray.t -= sd.ray_length; /* clipping works through transparent */
+
+
+#ifdef __RAY_DIFFERENTIALS__
+		ray.dP = sd.dP;
+		ray.dD.dx = -sd.dI.dx;
+		ray.dD.dy = -sd.dI.dy;
+#endif
+
+#ifdef __VOLUME__
+		/* enter/exit volume */
+		kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
+#endif
+	}
+
+	float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
+
+	kernel_write_light_passes(kg, buffer, &L, sample);
+
+#ifdef __KERNEL_DEBUG__
+	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
+#endif
+
+	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
+}
+/* Render one branched-path sample for pixel (x, y): set up the RNG and camera ray, integrate, and write the result into this pixel's slot of buffer. */
+ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
+	ccl_global float *buffer, ccl_global uint *rng_state,
+	int sample, int x, int y, int offset, int stride)
+{
+	/* buffer offset */
+	int index = offset + x + y*stride;
+	int pass_stride = kernel_data.film.pass_stride;
+
+	rng_state += index; /* one rng_state entry per pixel, pass_stride buffer floats per pixel */
+	buffer += index*pass_stride;
+
+	/* initialize random numbers and ray */
+	RNG rng;
+	Ray ray;
+
+	kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
+
+	/* integrate; a ray with t == 0 is not traced and contributes all zeros */
+	float4 L;
+
+	if(ray.t != 0.0f)
+		L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer);
+	else
+		L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+	/* accumulate result in output buffer */
+	kernel_write_pass_float4(buffer, sample, L);
+
+	path_rng_end(kg, rng_state, rng);
+}
+
+#endif  /* __BRANCHED_PATH__ */
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h
new file mode 100644
index 00000000000..1912dfa16ed
--- /dev/null
+++ b/intern/cycles/kernel/kernel_path_common.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+/* Shared setup for the path tracing kernels: initialize rng for pixel (x, y) and this sample, then generate the camera ray into *ray. */
+ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg,
+                                               ccl_global uint *rng_state,
+                                               int sample,
+                                               int x, int y,
+                                               ccl_addr_space RNG *rng,
+                                               ccl_addr_space Ray *ray)
+{
+	float filter_u;
+	float filter_v;
+
+	int num_samples = kernel_data.integrator.aa_samples;
+	/* filter_u/filter_v are passed as outputs here and used by camera_sample() below */
+	path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v);
+
+	/* sample camera ray */
+	/* lens coordinates stay 0 unless the aperture size is positive */
+	float lens_u = 0.0f, lens_v = 0.0f;
+
+	if(kernel_data.cam.aperturesize > 0.0f)
+		path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
+
+	float time = 0.0f;
+	/* time stays 0 unless __CAMERA_MOTION__ is enabled and shuttertime != -1 */
+#ifdef __CAMERA_MOTION__
+	if(kernel_data.cam.shuttertime != -1.0f)
+		time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME);
+#endif
+
+	camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index f29168642a4..15efb2371de 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -11,12 +11,12 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG *rng, int sample, Ray *ray)
+ccl_device_inline void path_state_init(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space RNG *rng, int sample, ccl_addr_space Ray *ray)
 {
 	state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP;
 
@@ -51,7 +51,7 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG
 #endif
 }
 
-ccl_device_inline void path_state_next(KernelGlobals *kg, PathState *state, int label)
+ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathState *state, int label)
 {
 	/* ray through transparent keeps same flags from previous ray and is
 	 * not counted as a regular bounce, transparent has separate max */
@@ -106,7 +106,7 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, PathState *state, int
 			state->flag &= ~(PATH_RAY_GLOSSY|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP);
 		}
 		else if(label & LABEL_GLOSSY) {
-			state->flag |= PATH_RAY_GLOSSY|PATH_RAY_GLOSSY_ANCESTOR;
+			state->flag |= PATH_RAY_GLOSSY;
 			state->flag &= ~(PATH_RAY_DIFFUSE|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP);
 		}
 		else {
@@ -138,7 +138,7 @@ ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *s
 	return flag;
 }
 
-ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, PathState *state, const float3 throughput)
+ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_addr_space PathState *state, const float3 throughput)
 {
 	if(state->flag & PATH_RAY_TRANSPARENT) {
 		/* transparent rays treated separately */
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index 9553c2da0df..fe85a6b6e4b 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -24,7 +24,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 {
 #ifdef __EMISSION__
 	/* sample illumination from lights to find path contribution */
-	if(!(sd->flag & SD_BSDF_HAS_EVAL))
+	if(!(ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))
 		return;
 
 	Ray light_ray;
@@ -32,12 +32,15 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 	bool is_lamp;
 
 #ifdef __OBJECT_MOTION__
-	light_ray.time = sd->time;
+	light_ray.time = ccl_fetch(sd, time);
 #endif
 
 	if(sample_all_lights) {
 		/* lamp sampling */
 		for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
+			if(UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce)))
+			   continue;
+
 			int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
 			float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
 			RNG lamp_rng = cmj_hash(*rng, i);
@@ -50,7 +53,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
 
 				LightSample ls;
-				lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls);
+				lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls);
 
 				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 					/* trace shadow ray */
@@ -82,7 +85,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 					light_t = 0.5f*light_t;
 
 				LightSample ls;
-				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+				light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
 
 				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 					/* trace shadow ray */
@@ -103,7 +106,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
 
 		LightSample ls;
-		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+		light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
 
 		/* sample random light */
 		if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
@@ -146,15 +149,15 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 	path_state_next(kg, state, label);
 
 	/* setup ray */
-	ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
+	ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
 	ray->D = bsdf_omega_in;
 	ray->t = FLT_MAX;
 #ifdef __RAY_DIFFERENTIALS__
-	ray->dP = sd->dP;
+	ray->dP = ccl_fetch(sd, dP);
 	ray->dD = bsdf_domega_in;
 #endif
 #ifdef __OBJECT_MOTION__
-	ray->time = sd->time;
+	ray->time = ccl_fetch(sd, time);
 #endif
 
 #ifdef __VOLUME__
@@ -178,12 +181,13 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 
 #endif
 
+#ifndef __SPLIT_KERNEL__
 /* path tracing: connect path directly to position on a light and add it to L */
-ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L)
+ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_addr_space RNG *rng,
+	ShaderData *sd, float3 throughput, ccl_addr_space PathState *state, PathRadiance *L)
 {
 #ifdef __EMISSION__
-	if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)))
+	if(!(kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)))
 		return;
 
 	/* sample illumination from lights to find path contribution */
@@ -196,11 +200,11 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
 	bool is_lamp;
 
 #ifdef __OBJECT_MOTION__
-	light_ray.time = sd->time;
+	light_ray.time = ccl_fetch(sd, time);
 #endif
 
 	LightSample ls;
-	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+	light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
 
 	if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 		/* trace shadow ray */
@@ -213,13 +217,14 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
 	}
 #endif
 }
+#endif
 
 /* path tracing: bounce off or through surface to with new direction stored in ray */
-ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
+ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, ccl_addr_space RNG *rng,
+	ShaderData *sd, ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, PathRadiance *L, ccl_addr_space Ray *ray)
 {
 	/* no BSDF? we can stop here */
-	if(sd->flag & SD_BSDF) {
+	if(ccl_fetch(sd, flag) & SD_BSDF) {
 		/* sample BSDF */
 		float bsdf_pdf;
 		BsdfEval bsdf_eval;
@@ -251,16 +256,16 @@ ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 		path_state_next(kg, state, label);
 
 		/* setup ray */
-		ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
+		ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
 		ray->D = bsdf_omega_in;
 
 		if(state->bounce == 0)
-			ray->t -= sd->ray_length; /* clipping works through transparent */
+			ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
 		else
 			ray->t = FLT_MAX;
 
 #ifdef __RAY_DIFFERENTIALS__
-		ray->dP = sd->dP;
+		ray->dP = ccl_fetch(sd, dP);
 		ray->dD = bsdf_domega_in;
 #endif
 
@@ -272,16 +277,21 @@ ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 		return true;
 	}
 #ifdef __VOLUME__
-	else if(sd->flag & SD_HAS_ONLY_VOLUME) {
+	else if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) {
 		/* no surface shader but have a volume shader? act transparent */
 
 		/* update path state, count as transparent */
 		path_state_next(kg, state, LABEL_TRANSPARENT);
 
+		if(state->bounce == 0)
+			ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
+		else
+			ray->t = FLT_MAX;
+
 		/* setup ray position, direction stays unchanged */
-		ray->P = ray_offset(sd->P, -sd->Ng);
+		ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng));
 #ifdef __RAY_DIFFERENTIALS__
-		ray->dP = sd->dP;
+		ray->dP = ccl_fetch(sd, dP);
 #endif
 
 		/* enter/exit volume */
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index da2d5e6eca8..82dc0f97622 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -40,7 +40,7 @@ ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
 	light_ray.time = sd->time;
 #endif
 
-	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
 	if(ls.pdf == 0.0f)
 		return;
 	
@@ -56,7 +56,12 @@ ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
 #endif
 }
 
-ccl_device bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
+#ifdef __KERNEL_GPU__
+ccl_device_noinline
+#else
+ccl_device
+#endif
+bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
 	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
 {
 	/* sample phase function */
@@ -102,7 +107,7 @@ ccl_device bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
 
 ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
 	ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L,
-	float num_samples_adjust, bool sample_all_lights, Ray *ray, const VolumeSegment *segment)
+	bool sample_all_lights, Ray *ray, const VolumeSegment *segment)
 {
 #ifdef __EMISSION__
 	if(!kernel_data.integrator.use_direct_light)
@@ -119,8 +124,11 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 	if(sample_all_lights) {
 		/* lamp sampling */
 		for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
-			int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
-			float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
+			if(UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce)))
+				continue;
+
+			int num_samples = light_select_num_samples(kg, i);
+			float num_samples_inv = 1.0f/(num_samples*kernel_data.integrator.num_all_lights);
 			RNG lamp_rng = cmj_hash(*rng, i);
 
 			if(kernel_data.integrator.pdf_triangles != 0.0f)
@@ -166,8 +174,8 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 
 		/* mesh light sampling */
 		if(kernel_data.integrator.pdf_triangles != 0.0f) {
-			int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples);
-			float num_samples_inv = num_samples_adjust/num_samples;
+			int num_samples = kernel_data.integrator.mesh_light_samples;
+			float num_samples_inv = 1.0f/num_samples;
 
 			if(kernel_data.integrator.num_all_lights)
 				num_samples_inv *= 0.5f;
@@ -183,7 +191,7 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 					light_t = 0.5f*light_t;
 
 				LightSample ls;
-				light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls);
+				light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
 
 				float3 tp = throughput;
 
@@ -198,7 +206,7 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 				kernel_assert(result == VOLUME_PATH_SCATTERED);
 
 				/* todo: split up light_sample so we don't have to call it again with new position */
-				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
 
 				if(ls.pdf == 0.0f)
 					continue;
@@ -222,7 +230,7 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
 
 		LightSample ls;
-		light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls);
+		light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
 
 		float3 tp = throughput;
 
@@ -237,7 +245,7 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 		kernel_assert(result == VOLUME_PATH_SCATTERED);
 
 		/* todo: split up light_sample so we don't have to call it again with new position */
-		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
 
 		if(ls.pdf == 0.0f)
 			return;
diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h
index 6744471d659..62922df3286 100644
--- a/intern/cycles/kernel/kernel_projection.h
+++ b/intern/cycles/kernel/kernel_projection.h
@@ -55,18 +55,18 @@ ccl_device float3 spherical_to_direction(float theta, float phi)
 
 /* Equirectangular coordinates <-> Cartesian direction */
 
-ccl_device float2 direction_to_equirectangular(float3 dir)
+ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range)
 {
-	float u = -atan2f(dir.y, dir.x)/(M_2PI_F) + 0.5f;
-	float v = atan2f(dir.z, hypotf(dir.x, dir.y))/M_PI_F + 0.5f;
+	float u = (atan2f(dir.y, dir.x) - range.y) / range.x;
+	float v = (acosf(dir.z / len(dir)) - range.w) / range.z;
 
 	return make_float2(u, v);
 }
 
-ccl_device float3 equirectangular_to_direction(float u, float v)
+ccl_device float3 equirectangular_range_to_direction(float u, float v, float4 range)
 {
-	float phi = M_PI_F*(1.0f - 2.0f*u);
-	float theta = M_PI_F*(1.0f - v);
+	float phi = range.x*u + range.y;
+	float theta = range.z*v + range.w;
 
 	return make_float3(
 		sinf(theta)*cosf(phi),
@@ -74,6 +74,16 @@ ccl_device float3 equirectangular_to_direction(float u, float v)
 		cosf(theta));
 }
 
+ccl_device float2 direction_to_equirectangular(float3 dir) /* Wrapper: calls the _range variant with a fixed range. */
+{
+	return direction_to_equirectangular_range(dir, make_float4(-M_2PI_F, M_PI_F, -M_PI_F, M_PI_F)); /* range = (u scale, u offset, v scale, v offset) */
+}
+
+ccl_device float3 equirectangular_to_direction(float u, float v) /* Wrapper: calls the _range variant with a fixed range. */
+{
+	return equirectangular_range_to_direction(u, v, make_float4(-M_2PI_F, M_PI_F, -M_PI_F, M_PI_F)); /* range = (phi scale, phi offset, theta scale, theta offset) */
+}
+
 /* Fisheye <-> Cartesian direction */
 
 ccl_device float2 direction_to_fisheye(float3 dir, float fov)
@@ -153,6 +163,10 @@ ccl_device float3 mirrorball_to_direction(float u, float v)
 
 	dir.x = 2.0f*u - 1.0f;
 	dir.z = 2.0f*v - 1.0f;
+
+	if(dir.x*dir.x + dir.z*dir.z > 1.0f)
+		return make_float3(0.0f, 0.0f, 0.0f);
+
 	dir.y = -sqrtf(max(1.0f - dir.x*dir.x - dir.z*dir.z, 0.0f));
 
 	/* reflection */
@@ -180,7 +194,9 @@ ccl_device float3 panorama_to_direction(KernelGlobals *kg, float u, float v)
 {
 	switch(kernel_data.cam.panorama_type) {
 		case PANORAMA_EQUIRECTANGULAR:
-			return equirectangular_to_direction(u, v);
+			return equirectangular_range_to_direction(u, v, kernel_data.cam.equirectangular_range);
+		case PANORAMA_MIRRORBALL:
+			return mirrorball_to_direction(u, v);
 		case PANORAMA_FISHEYE_EQUIDISTANT:
 			return fisheye_to_direction(u, v, kernel_data.cam.fisheye_fov);
 		case PANORAMA_FISHEYE_EQUISOLID:
@@ -194,7 +210,9 @@ ccl_device float2 direction_to_panorama(KernelGlobals *kg, float3 dir)
 {
 	switch(kernel_data.cam.panorama_type) {
 		case PANORAMA_EQUIRECTANGULAR:
-			return direction_to_equirectangular(dir);
+			return direction_to_equirectangular_range(dir, kernel_data.cam.equirectangular_range);
+		case PANORAMA_MIRRORBALL:
+			return direction_to_mirrorball(dir);
 		case PANORAMA_FISHEYE_EQUIDISTANT:
 			return direction_to_fisheye(dir, kernel_data.cam.fisheye_fov);
 		case PANORAMA_FISHEYE_EQUISOLID:
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
new file mode 100644
index 00000000000..cf5614b8a86
--- /dev/null
+++ b/intern/cycles/kernel/kernel_queues.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_QUEUE_H__
+#define __KERNEL_QUEUE_H__
+
+/*
+ * Queue utility functions for split kernel
+ */
+
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+
+/*
+ * Store ray_index in the slot reserved by atomically bumping the queue's counter.
+ */
+ccl_device void enqueue_ray_index(
+        int ray_index,                /* Ray index to store. */
+        int queue_number,             /* Index of the destination queue. */
+        ccl_global int *queues,       /* Buffer holding all queues, queue_size slots each. */
+        int queue_size,               /* Number of slots per queue. */
+        ccl_global int *queue_index)  /* Array of size num_queues; per-queue counters, bumped atomically. */
+{
+	/* atomic_inc yields the counter value before the increment: this thread's slot. */
+	const int slot = atomic_inc(&queue_index[queue_number]);
+	queues[queue_number * queue_size + slot] = ray_index;
+}
+
+/*
+ * Fetch the ray index stored in this thread's slot of a queue.
+ * The slot is queues[queue_number * queuesize + thread_index].
+ * Returns whatever the slot holds: a ray index for threads that have
+ * work to do, or 'QUEUE_EMPTY_SLOT' when the slot carries no ray.
+ * With empty_queue set, a non-empty slot is reset to 'QUEUE_EMPTY_SLOT'.
+ */
+ccl_device int get_ray_index(
+        int thread_index,       /* Global thread index; selects the slot inside the queue. */
+        int queue_number,       /* Queue to read from. */
+        ccl_global int *queues, /* Buffer holding all queues. */
+        int queuesize,          /* Number of slots per queue. */
+        int empty_queue)        /* Non-zero: clear the slot right after reading it. */
+{
+	ccl_global int *slot = &queues[queue_number * queuesize + thread_index];
+	int ray_index = *slot;
+	if(empty_queue && ray_index != QUEUE_EMPTY_SLOT)
+		*slot = QUEUE_EMPTY_SLOT;
+	return ray_index;
+}
+
+/* The following functions implement the local-memory variant of the enqueue ray index function. */
+
+/* All threads should call this function, whatever their enqueue_flag: both barriers below are unconditional. */
+ccl_device void enqueue_ray_index_local(
+        int ray_index,                               /* Ray index to enqueue. */
+        int queue_number,                            /* Queue in which to enqueue ray index. */
+        char enqueue_flag,                           /* True for threads whose ray index has to be enqueued. */
+        int queuesize,                               /* Size of each queue. */
+        ccl_local unsigned int *local_queue_atomics,   /* To do local queue atomics; presumably zeroed by the caller - TODO confirm. */
+        ccl_global int *Queue_data,                  /* Queues. */
+        ccl_global int *Queue_index)                 /* To do global queue atomics. */
+{
+	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); /* Flattened 2D local id. */
+
+	/* Get local queue id; lqidx stays unset (and is never read) when enqueue_flag is false. */
+	unsigned int lqidx;
+	if(enqueue_flag) {
+		lqidx = atomic_inc(local_queue_atomics);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* Get global queue offset: the thread with lidx 0 replaces the local count by the global counter's previous value. */
+	if(lidx == 0) {
+		*local_queue_atomics = atomic_add(&Queue_index[queue_number], *local_queue_atomics);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* Get global queue index (queue base + global offset + local id) and enqueue ray. */
+	if(enqueue_flag) {
+		unsigned int my_gqidx = queue_number * queuesize + (*local_queue_atomics) + lqidx;
+		Queue_data[my_gqidx] = ray_index;
+	}
+}
+
+ccl_device unsigned int get_local_queue_index(
+        int queue_number, /* Queue in which to enqueue the ray. NOTE(review): used unchecked as an index, so -1 ("no queue") must not reach here - confirm callers. */
+        ccl_local unsigned int *local_queue_atomics) /* Per-queue counters, bumped atomically. */
+{
+	/* Hand back the pre-increment counter as-is; no signed temporary between atomic_inc and the unsigned return. */
+	return atomic_inc(&local_queue_atomics[queue_number]);
+}
+
+ccl_device unsigned int get_global_per_queue_offset(
+        int queue_number,                            /* Queue whose counters are combined. */
+        ccl_local unsigned int *local_queue_atomics, /* Per-queue counters; only read here. */
+        ccl_global int *global_queue_atomics)        /* Per-queue counters; advanced atomically. */
+{
+	/* Advance the global counter by the local count; atomic_add yields the value held before. */
+	const unsigned int local_count = local_queue_atomics[queue_number];
+	return atomic_add(&global_queue_atomics[queue_number], local_count);
+}
+
+ccl_device unsigned int get_global_queue_index(
+        int queue_number,                                /* Queue the index refers to. */
+        int queuesize,                                   /* Number of slots per queue. */
+        unsigned int lqidx,                              /* Local queue index (presumably from get_local_queue_index - confirm). */
+        ccl_local unsigned int *global_per_queue_offset) /* Per-queue offset added to the index. */
+{
+	/* Queue base + local index + per-queue offset; the sum is unsigned, so it is returned without a signed temporary. */
+	return queuesize * queue_number + lqidx + global_per_queue_offset[queue_number];
+}
+
+#endif // __KERNEL_QUEUE_H__
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index 236f74c0a82..631a2cb75de 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "kernel_jitter.h"
@@ -98,7 +98,7 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons
 	return index;
 }
 
-ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
+ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
 {
 #ifdef __CMJ__
 	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -132,7 +132,7 @@ ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int
 #endif
 }
 
-ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_inline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
 {
 #ifdef __CMJ__
 	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -149,7 +149,7 @@ ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int
 	}
 }
 
-ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
+ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, ccl_addr_space RNG *rng, int x, int y, float *fx, float *fy)
 {
 #ifdef __SOBOL_FULL_SCREEN__
 	uint px, py;
@@ -261,12 +261,12 @@ ccl_device uint lcg_init(uint seed)
  * For branches in the path we must be careful not to reuse the same number
  * in a sequence and offset accordingly. */
 
-ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
 {
 	return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension);
 }
 
-ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
 {
 	/* the rng_offset is not increased for transparent bounces. if we do then
 	 * fully transparent objects can become subtly visible by the different
@@ -279,23 +279,23 @@ ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *r
 	return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension);
 }
 
-ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension, float *fx, float *fy)
+ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy)
 {
 	path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy);
 }
 
-ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
 {
 	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension);
 }
 
-ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
 {
 	int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
 	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension);
 }
 
-ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
+ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
 {
 	path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy);
 }
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index db08c328d7e..6b560f5fdb2 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /*
@@ -37,13 +37,13 @@ CCL_NAMESPACE_BEGIN
 #ifdef __OBJECT_MOTION__
 ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
 {
-	if(sd->flag & SD_OBJECT_MOTION) {
-		sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time);
-		sd->ob_itfm = transform_quick_inverse(sd->ob_tfm);
+	if(ccl_fetch(sd, flag) & SD_OBJECT_MOTION) {
+		ccl_fetch(sd, ob_tfm) = object_fetch_transform_motion(kg, ccl_fetch(sd, object), time);
+		ccl_fetch(sd, ob_itfm) = transform_quick_inverse(ccl_fetch(sd, ob_tfm));
 	}
 	else {
-		sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
-		sd->ob_itfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+		ccl_fetch(sd, ob_tfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+		ccl_fetch(sd, ob_itfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
 	}
 }
 #endif
@@ -52,55 +52,55 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
 	const Intersection *isect, const Ray *ray, int bounce, int transparent_bounce)
 {
 #ifdef __INSTANCING__
-	sd->object = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
+	ccl_fetch(sd, object) = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
 #endif
 
-	sd->type = isect->type;
-	sd->flag = kernel_tex_fetch(__object_flag, sd->object);
+	ccl_fetch(sd, type) = isect->type;
+	ccl_fetch(sd, flag) = kernel_tex_fetch(__object_flag, ccl_fetch(sd, object));
 
 	/* matrices and time */
 #ifdef __OBJECT_MOTION__
 	shader_setup_object_transforms(kg, sd, ray->time);
-	sd->time = ray->time;
+	ccl_fetch(sd, time) = ray->time;
 #endif
 
-	sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
-	sd->ray_length = isect->t;
-	sd->ray_depth = bounce;
-	sd->transparent_depth = transparent_bounce;
+	ccl_fetch(sd, prim) = kernel_tex_fetch(__prim_index, isect->prim);
+	ccl_fetch(sd, ray_length) = isect->t;
+	ccl_fetch(sd, ray_depth) = bounce;
+	ccl_fetch(sd, transparent_depth) = transparent_bounce;
 
 #ifdef __UV__
-	sd->u = isect->u;
-	sd->v = isect->v;
+	ccl_fetch(sd, u) = isect->u;
+	ccl_fetch(sd, v) = isect->v;
 #endif
 
 #ifdef __HAIR__
-	if(sd->type & PRIMITIVE_ALL_CURVE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 		/* curve */
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
 
-		sd->shader = __float_as_int(curvedata.z);
-		sd->P = bvh_curve_refine(kg, sd, isect, ray);
+		ccl_fetch(sd, shader) = __float_as_int(curvedata.z);
+		ccl_fetch(sd, P) = bvh_curve_refine(kg, sd, isect, ray);
 	}
 	else
 #endif
-	if(sd->type & PRIMITIVE_TRIANGLE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
 		/* static triangle */
 		float3 Ng = triangle_normal(kg, sd);
-		sd->shader =  kernel_tex_fetch(__tri_shader, sd->prim);
+		ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
 
 		/* vectors */
-		sd->P = triangle_refine(kg, sd, isect, ray);
-		sd->Ng = Ng;
-		sd->N = Ng;
+		ccl_fetch(sd, P) = triangle_refine(kg, sd, isect, ray);
+		ccl_fetch(sd, Ng) = Ng;
+		ccl_fetch(sd, N) = Ng;
 		
 		/* smooth normal */
-		if(sd->shader & SHADER_SMOOTH_NORMAL)
-			sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL)
+			ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));
 
 #ifdef __DPDU__
 		/* dPdu/dPdv */
-		triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+		triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));
 #endif
 	}
 	else {
@@ -108,40 +108,40 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
 		motion_triangle_shader_setup(kg, sd, isect, ray, false);
 	}
 
-	sd->I = -ray->D;
+	ccl_fetch(sd, I) = -ray->D;
 
-	sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
+	ccl_fetch(sd, flag) |= kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2);
 
 #ifdef __INSTANCING__
 	if(isect->object != OBJECT_NONE) {
 		/* instance transform */
-		object_normal_transform(kg, sd, &sd->N);
-		object_normal_transform(kg, sd, &sd->Ng);
+		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
+		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng));
 #ifdef __DPDU__
-		object_dir_transform(kg, sd, &sd->dPdu);
-		object_dir_transform(kg, sd, &sd->dPdv);
+		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
+		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
 #endif
 	}
 #endif
 
 	/* backfacing test */
-	bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
+	bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);
 
 	if(backfacing) {
-		sd->flag |= SD_BACKFACING;
-		sd->Ng = -sd->Ng;
-		sd->N = -sd->N;
+		ccl_fetch(sd, flag) |= SD_BACKFACING;
+		ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
+		ccl_fetch(sd, N) = -ccl_fetch(sd, N);
 #ifdef __DPDU__
-		sd->dPdu = -sd->dPdu;
-		sd->dPdv = -sd->dPdv;
+		ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
+		ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
 #endif
 	}
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* differentials */
-	differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t);
-	differential_incoming(&sd->dI, ray->dD);
-	differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
+	differential_transfer(&ccl_fetch(sd, dP), ray->dP, ray->D, ray->dD, ccl_fetch(sd, Ng), isect->t);
+	differential_incoming(&ccl_fetch(sd, dI), ray->dD);
+	differential_dudv(&ccl_fetch(sd, du), &ccl_fetch(sd, dv), ccl_fetch(sd, dPdu), ccl_fetch(sd, dPdv), ccl_fetch(sd, dP), ccl_fetch(sd, Ng));
 #endif
 }
 
@@ -166,7 +166,7 @@ ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderDat
 	/* fetch triangle data */
 	if(sd->type == PRIMITIVE_TRIANGLE) {
 		float3 Ng = triangle_normal(kg, sd);
-		sd->shader =  kernel_tex_fetch(__tri_shader, sd->prim);
+		sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
 
 		/* static triangle */
 		sd->P = triangle_refine_subsurface(kg, sd, isect, ray);
@@ -230,105 +230,105 @@ ccl_device void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
 	int shader, int object, int prim, float u, float v, float t, float time, int bounce, int transparent_bounce)
 {
 	/* vectors */
-	sd->P = P;
-	sd->N = Ng;
-	sd->Ng = Ng;
-	sd->I = I;
-	sd->shader = shader;
-	sd->type = (prim == PRIM_NONE)? PRIMITIVE_NONE: PRIMITIVE_TRIANGLE;
+	ccl_fetch(sd, P) = P;
+	ccl_fetch(sd, N) = Ng;
+	ccl_fetch(sd, Ng) = Ng;
+	ccl_fetch(sd, I) = I;
+	ccl_fetch(sd, shader) = shader;
+	ccl_fetch(sd, type) = (prim == PRIM_NONE)? PRIMITIVE_NONE: PRIMITIVE_TRIANGLE;
 
 	/* primitive */
 #ifdef __INSTANCING__
-	sd->object = object;
+	ccl_fetch(sd, object) = object;
 #endif
 	/* currently no access to bvh prim index for strand sd->prim*/
-	sd->prim = prim;
+	ccl_fetch(sd, prim) = prim;
 #ifdef __UV__
-	sd->u = u;
-	sd->v = v;
+	ccl_fetch(sd, u) = u;
+	ccl_fetch(sd, v) = v;
 #endif
-	sd->ray_length = t;
-	sd->ray_depth = bounce;
-	sd->transparent_depth = transparent_bounce;
+	ccl_fetch(sd, ray_length) = t;
+	ccl_fetch(sd, ray_depth) = bounce;
+	ccl_fetch(sd, transparent_depth) = transparent_bounce;
 
 	/* detect instancing, for non-instanced the object index is -object-1 */
 #ifdef __INSTANCING__
 	bool instanced = false;
 
-	if(sd->prim != PRIM_NONE) {
-		if(sd->object >= 0)
+	if(ccl_fetch(sd, prim) != PRIM_NONE) {
+		if(ccl_fetch(sd, object) >= 0)
 			instanced = true;
 		else
 #endif
-			sd->object = ~sd->object;
+			ccl_fetch(sd, object) = ~ccl_fetch(sd, object);
 #ifdef __INSTANCING__
 	}
 #endif
 
-	sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
-	if(sd->object != OBJECT_NONE) {
-		sd->flag |= kernel_tex_fetch(__object_flag, sd->object);
+	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2);
+	if(ccl_fetch(sd, object) != OBJECT_NONE) {
+		ccl_fetch(sd, flag) |= kernel_tex_fetch(__object_flag, ccl_fetch(sd, object));
 
 #ifdef __OBJECT_MOTION__
 		shader_setup_object_transforms(kg, sd, time);
 	}
 
-	sd->time = time;
+	ccl_fetch(sd, time) = time;
 #else
 	}
 #endif
 
-	if(sd->type & PRIMITIVE_TRIANGLE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
 		/* smooth normal */
-		if(sd->shader & SHADER_SMOOTH_NORMAL) {
-			sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
+			ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));
 
 #ifdef __INSTANCING__
 			if(instanced)
-				object_normal_transform(kg, sd, &sd->N);
+				object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
 #endif
 		}
 
 		/* dPdu/dPdv */
 #ifdef __DPDU__
-		triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+		triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));
 
 #ifdef __INSTANCING__
 		if(instanced) {
-			object_dir_transform(kg, sd, &sd->dPdu);
-			object_dir_transform(kg, sd, &sd->dPdv);
+			object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
+			object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
 		}
 #endif
 #endif
 	}
 	else {
 #ifdef __DPDU__
-		sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
-		sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
+		ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
+		ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 	}
 
 	/* backfacing test */
-	if(sd->prim != PRIM_NONE) {
-		bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
+	if(ccl_fetch(sd, prim) != PRIM_NONE) {
+		bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);
 
 		if(backfacing) {
-			sd->flag |= SD_BACKFACING;
-			sd->Ng = -sd->Ng;
-			sd->N = -sd->N;
+			ccl_fetch(sd, flag) |= SD_BACKFACING;
+			ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
+			ccl_fetch(sd, N) = -ccl_fetch(sd, N);
 #ifdef __DPDU__
-			sd->dPdu = -sd->dPdu;
-			sd->dPdv = -sd->dPdv;
+			ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
+			ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
 #endif
 		}
 	}
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* no ray differentials here yet */
-	sd->dP = differential3_zero();
-	sd->dI = differential3_zero();
-	sd->du = differential_zero();
-	sd->dv = differential_zero();
+	ccl_fetch(sd, dP) = differential3_zero();
+	ccl_fetch(sd, dI) = differential3_zero();
+	ccl_fetch(sd, du) = differential_zero();
+	ccl_fetch(sd, dv) = differential_zero();
 #endif
 }
 
@@ -355,47 +355,46 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
 ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce, int transparent_bounce)
 {
 	/* vectors */
-	sd->P = ray->D;
-	sd->N = -ray->D;
-	sd->Ng = -ray->D;
-	sd->I = -ray->D;
-	sd->shader = kernel_data.background.surface_shader;
-	sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
+	ccl_fetch(sd, P) = ray->D;
+	ccl_fetch(sd, N) = -ray->D;
+	ccl_fetch(sd, Ng) = -ray->D;
+	ccl_fetch(sd, I) = -ray->D;
+	ccl_fetch(sd, shader) = kernel_data.background.surface_shader;
+	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2);
 #ifdef __OBJECT_MOTION__
-	sd->time = ray->time;
+	ccl_fetch(sd, time) = ray->time;
 #endif
-	sd->ray_length = 0.0f;
-	sd->ray_depth = bounce;
-	sd->transparent_depth = transparent_bounce;
+	ccl_fetch(sd, ray_length) = 0.0f;
+	ccl_fetch(sd, ray_depth) = bounce;
+	ccl_fetch(sd, transparent_depth) = transparent_bounce;
 
 #ifdef __INSTANCING__
-	sd->object = PRIM_NONE;
+	ccl_fetch(sd, object) = PRIM_NONE;
 #endif
-	sd->prim = PRIM_NONE;
+	ccl_fetch(sd, prim) = PRIM_NONE;
 #ifdef __UV__
-	sd->u = 0.0f;
-	sd->v = 0.0f;
+	ccl_fetch(sd, u) = 0.0f;
+	ccl_fetch(sd, v) = 0.0f;
 #endif
 
 #ifdef __DPDU__
 	/* dPdu/dPdv */
-	sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
-	sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
+	ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
+	ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* differentials */
-	sd->dP = ray->dD;
-	differential_incoming(&sd->dI, sd->dP);
-	sd->du.dx = 0.0f;
-	sd->du.dy = 0.0f;
-	sd->dv.dx = 0.0f;
-	sd->dv.dy = 0.0f;
+	ccl_fetch(sd, dP) = ray->dD;
+	differential_incoming(&ccl_fetch(sd, dI), ccl_fetch(sd, dP));
+	ccl_fetch(sd, du) = differential_zero();
+	ccl_fetch(sd, dv) = differential_zero();
 #endif
 }
 
 /* ShaderData setup from point inside volume */
 
+#ifdef __VOLUME__
 ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce, int transparent_bounce)
 {
 	/* vectors */
@@ -441,6 +440,7 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s
 	sd->ray_P = ray->P;
 	sd->ray_dP = ray->dP;
 }
+#endif
 
 /* Merging */
 
@@ -459,7 +459,7 @@ ccl_device void shader_merge_closures(ShaderData *sd)
 				continue;
 #endif
 
-			if(!(sci->type == scj->type && sci->data0 == scj->data0 && sci->data1 == scj->data1))
+			if(!(sci->type == scj->type && sci->data0 == scj->data0 && sci->data1 == scj->data1 && sci->data2 == scj->data2))
 				continue;
 
 			if(CLOSURE_IS_BSDF_OR_BSSRDF(sci->type)) {
@@ -480,6 +480,7 @@ ccl_device void shader_merge_closures(ShaderData *sd)
 			}
 
 			sd->num_closure--;
+			kernel_assert(sd->num_closure >= 0);
 			j--;
 		}
 	}
@@ -493,11 +494,11 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, const ShaderDa
 {
 	/* this is the veach one-sample model with balance heuristic, some pdf
 	 * factors drop out when using balance heuristic weighting */
-	for(int i = 0; i< sd->num_closure; i++) {
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
 		if(i == skip_bsdf)
 			continue;
 
-		const ShaderClosure *sc = &sd->closure[i];
+		const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSDF(sc->type)) {
 			float bsdf_pdf = 0.0f;
@@ -515,7 +516,7 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, const ShaderDa
 	*pdf = (sum_sample_weight > 0.0f)? sum_pdf/sum_sample_weight: 0.0f;
 }
 
-ccl_device void shader_bsdf_eval(KernelGlobals *kg, const ShaderData *sd,
+ccl_device void shader_bsdf_eval(KernelGlobals *kg, ShaderData *sd,
 	const float3 omega_in, BsdfEval *eval, float *pdf)
 {
 	bsdf_eval_init(eval, NBUILTIN_CLOSURES, make_float3(0.0f, 0.0f, 0.0f), kernel_data.film.use_light_pass);
@@ -529,22 +530,22 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
 {
 	int sampled = 0;
 
-	if(sd->num_closure > 1) {
+	if(ccl_fetch(sd, num_closure) > 1) {
 		/* pick a BSDF closure based on sample weights */
 		float sum = 0.0f;
 
-		for(sampled = 0; sampled < sd->num_closure; sampled++) {
-			const ShaderClosure *sc = &sd->closure[sampled];
+		for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
+			const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
 			
 			if(CLOSURE_IS_BSDF(sc->type))
 				sum += sc->sample_weight;
 		}
 
-		float r = sd->randb_closure*sum;
+		float r = ccl_fetch(sd, randb_closure)*sum;
 		sum = 0.0f;
 
-		for(sampled = 0; sampled < sd->num_closure; sampled++) {
-			const ShaderClosure *sc = &sd->closure[sampled];
+		for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
+			const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
 			
 			if(CLOSURE_IS_BSDF(sc->type)) {
 				sum += sc->sample_weight;
@@ -554,13 +555,14 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
 			}
 		}
 
-		if(sampled == sd->num_closure) {
+		if(sampled == ccl_fetch(sd, num_closure)) {
 			*pdf = 0.0f;
 			return LABEL_NONE;
 		}
 	}
 
-	const ShaderClosure *sc = &sd->closure[sampled];
+	const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
+
 	int label;
 	float3 eval;
 
@@ -570,7 +572,7 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
 	if(*pdf != 0.0f) {
 		bsdf_eval_init(bsdf_eval, sc->type, eval*sc->weight, kernel_data.film.use_light_pass);
 
-		if(sd->num_closure > 1) {
+		if(ccl_fetch(sd, num_closure) > 1) {
 			float sweight = sc->sample_weight;
 			_shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight);
 		}
@@ -597,8 +599,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, const ShaderData *s
 
 ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness)
 {
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSDF(sc->type))
 			bsdf_blur(kg, sc, roughness);
@@ -607,13 +609,13 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn
 
 ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
 {
-	if(sd->flag & SD_HAS_ONLY_VOLUME)
+	if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME)
 		return make_float3(1.0f, 1.0f, 1.0f);
 
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl
 			eval += sc->weight;
@@ -636,8 +638,8 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
 			eval += sc->weight;
@@ -650,8 +652,8 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
 			eval += sc->weight;
@@ -664,8 +666,8 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
 			eval += sc->weight;
@@ -678,10 +680,10 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
-		if(CLOSURE_IS_BSSRDF(sc->type))
+		if(CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type))
 			eval += sc->weight;
 	}
 
@@ -693,8 +695,8 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 	float3 N = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
 			eval += sc->weight*ao_factor;
@@ -702,12 +704,12 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
 		}
 		else if(CLOSURE_IS_AMBIENT_OCCLUSION(sc->type)) {
 			eval += sc->weight;
-			N += sd->N*average(sc->weight);
+			N += ccl_fetch(sd, N)*average(sc->weight);
 		}
 	}
 
 	if(is_zero(N))
-		N = sd->N;
+		N = ccl_fetch(sd, N);
 	else
 		N = normalize(N);
 
@@ -721,8 +723,8 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 	float3 N = make_float3(0.0f, 0.0f, 0.0f);
 	float texture_blur = 0.0f, weight_sum = 0.0f;
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSSRDF(sc->type)) {
 			float avg_weight = fabsf(average(sc->weight));
@@ -735,7 +737,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 	}
 
 	if(N_)
-		*N_ = (is_zero(N))? sd->N: normalize(N);
+		*N_ = (is_zero(N))? ccl_fetch(sd, N): normalize(N);
 
 	if(texture_blur_)
 		*texture_blur_ = texture_blur/weight_sum;
@@ -747,7 +749,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 
 ccl_device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *sc)
 {
-	return emissive_simple_eval(sd->Ng, sd->I);
+	return emissive_simple_eval(ccl_fetch(sd, Ng), ccl_fetch(sd, I));
 }
 
 ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
@@ -755,8 +757,8 @@ ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
 	float3 eval;
 	eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_EMISSION(sc->type))
 			eval += emissive_eval(kg, sd, sc)*sc->weight;
@@ -771,8 +773,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 weight = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_HOLDOUT(sc->type))
 			weight += sc->weight;
@@ -786,8 +788,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
 	float randb, int path_flag, ShaderContext ctx)
 {
-	sd->num_closure = 0;
-	sd->randb_closure = randb;
+	ccl_fetch(sd, num_closure) = 0;
+	ccl_fetch(sd, randb_closure) = randb;
 
 #ifdef __OSL__
 	if(kg->osl)
@@ -798,9 +800,11 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
 #ifdef __SVM__
 		svm_eval_nodes(kg, sd, SHADER_TYPE_SURFACE, path_flag);
 #else
-		sd->closure->weight = make_float3(0.8f, 0.8f, 0.8f);
-		sd->closure->N = sd->N;
-		sd->flag |= bsdf_diffuse_setup(&sd->closure);
+		ccl_fetch_array(sd, closure, 0)->weight = make_float3(0.8f, 0.8f, 0.8f);
+		ccl_fetch_array(sd, closure, 0)->N = ccl_fetch(sd, N);
+		ccl_fetch_array(sd, closure, 0)->data0 = 0.0f;
+		ccl_fetch_array(sd, closure, 0)->data1 = 0.0f;
+		ccl_fetch(sd, flag) |= bsdf_diffuse_setup(ccl_fetch_array(sd, closure, 0));
 #endif
 	}
 }
@@ -809,8 +813,8 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
 
 ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int path_flag, ShaderContext ctx)
 {
-	sd->num_closure = 0;
-	sd->randb_closure = 0.0f;
+	ccl_fetch(sd, num_closure) = 0;
+	ccl_fetch(sd, randb_closure) = 0.0f;
 
 #ifdef __OSL__
 	if(kg->osl) {
@@ -825,8 +829,8 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int
 
 		float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-		for(int i = 0; i< sd->num_closure; i++) {
-			const ShaderClosure *sc = &sd->closure[i];
+		for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+			const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 			if(CLOSURE_IS_BACKGROUND(sc->type))
 				eval += sc->weight;
@@ -846,7 +850,7 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int
 ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, const float3 omega_in, float *pdf,
 	int skip_phase, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight)
 {
-	for(int i = 0; i< sd->num_closure; i++) {
+	for(int i = 0; i < sd->num_closure; i++) {
 		if(i == skip_phase)
 			continue;
 
@@ -999,8 +1003,8 @@ ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd,
 
 ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx)
 {
-	sd->num_closure = 0;
-	sd->randb_closure = 0.0f;
+	ccl_fetch(sd, num_closure) = 0;
+	ccl_fetch(sd, randb_closure) = 0.0f;
 
 	/* this will modify sd->P */
 #ifdef __SVM__
diff --git a/intern/cycles/kernel/kernel_shaderdata_vars.h b/intern/cycles/kernel/kernel_shaderdata_vars.h
new file mode 100644
index 00000000000..b157b82e023
--- /dev/null
+++ b/intern/cycles/kernel/kernel_shaderdata_vars.h
@@ -0,0 +1,99 @@
+/*
+* Copyright 2011-2015 Blender Foundation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef SD_VAR
+#define SD_VAR(type, what)
+#endif
+#ifndef SD_CLOSURE_VAR
+#define SD_CLOSURE_VAR(type, what, max_closure)
+#endif
+
+/* position */
+SD_VAR(float3, P)
+/* smooth normal for shading */
+SD_VAR(float3, N)
+/* true geometric normal */
+SD_VAR(float3, Ng)
+/* view/incoming direction */
+SD_VAR(float3, I)
+/* shader id */
+SD_VAR(int, shader)
+/* booleans describing shader, see ShaderDataFlag */
+SD_VAR(int, flag)
+
+/* primitive id if there is one, ~0 otherwise */
+SD_VAR(int, prim)
+
+/* combined type and curve segment for hair */
+SD_VAR(int, type)
+
+/* parametric coordinates
+* - barycentric weights for triangles */
+SD_VAR(float, u)
+SD_VAR(float, v)
+/* object id if there is one, ~0 otherwise */
+SD_VAR(int, object)
+
+/* motion blur sample time */
+SD_VAR(float, time)
+
+/* length of the ray being shaded */
+SD_VAR(float, ray_length)
+
+/* ray bounce depth */
+SD_VAR(int, ray_depth)
+
+/* ray transparent depth */
+SD_VAR(int, transparent_depth)
+
+#ifdef __RAY_DIFFERENTIALS__
+/* differential of P. these are orthogonal to Ng, not N */
+SD_VAR(differential3, dP)
+/* differential of I */
+SD_VAR(differential3, dI)
+/* differential of u, v */
+SD_VAR(differential, du)
+SD_VAR(differential, dv)
+#endif
+#ifdef __DPDU__
+/* differential of P w.r.t. parametric coordinates. note that dPdu is
+* not readily suitable as a tangent for shading on triangles. */
+SD_VAR(float3, dPdu)
+SD_VAR(float3, dPdv)
+#endif
+
+#ifdef __OBJECT_MOTION__
+/* object <-> world space transformations, cached to avoid
+* re-interpolating them constantly for shading */
+SD_VAR(Transform, ob_tfm)
+SD_VAR(Transform, ob_itfm)
+#endif
+
+/* Closure data, we store a fixed array of closures */
+SD_CLOSURE_VAR(ShaderClosure, closure, MAX_CLOSURE)
+SD_VAR(int, num_closure)
+SD_VAR(float, randb_closure)
+
+/* ray start position, only set for backgrounds */
+SD_VAR(float3, ray_P)
+SD_VAR(differential3, ray_dP)
+
+#ifdef __OSL__
+SD_VAR(struct KernelGlobals *, osl_globals)
+#endif
+
+#undef SD_VAR
+#undef SD_CLOSURE_VAR
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index 61954282c28..2811a8348ca 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -39,19 +39,6 @@ CCL_NAMESPACE_BEGIN
  * This is CPU only because of qsort, and malloc or high stack space usage to
  * record all these intersections. */
 
-ccl_device_noinline int shadow_intersections_compare(const void *a, const void *b)
-{
-	const Intersection *isect_a = (const Intersection*)a;
-	const Intersection *isect_b = (const Intersection*)b;
-
-	if(isect_a->t < isect_b->t)
-		return -1;
-	else if(isect_a->t > isect_b->t)
-		return 1;
-	else
-		return 0;
-}
-
 #define STACK_MAX_HITS 64
 
 ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ray, float3 *shadow)
@@ -95,7 +82,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 			PathState ps = *state;
 #endif
 
-			qsort(hits, num_hits, sizeof(Intersection), shadow_intersections_compare);
+			qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
 
 			for(int hit = 0; hit < num_hits; hit++, isect++) {
 				/* adjust intersection distance for moving ray forward */
@@ -193,19 +180,36 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
  * potentially transparent, and only in that case start marching. this gives
  * one extra ray cast for the cases were we do want transparency. */
 
-ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ray, float3 *shadow)
+ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space Ray *ray_input, float3 *shadow
+#ifdef __SPLIT_KERNEL__
+                                      , ShaderData *sd_mem, Intersection *isect_mem
+#endif
+                                      )
 {
 	*shadow = make_float3(1.0f, 1.0f, 1.0f);
 
-	if(ray->t == 0.0f)
+	if(ray_input->t == 0.0f)
 		return false;
 
-	Intersection isect;
-	bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
+#ifdef __SPLIT_KERNEL__
+	Ray private_ray = *ray_input;
+	Ray *ray = &private_ray;
+#else
+	Ray *ray = ray_input;
+#endif
+
+#ifdef __SPLIT_KERNEL__
+	Intersection *isect = isect_mem;
+#else
+	Intersection isect_object;
+	Intersection *isect = &isect_object;
+#endif
+
+	bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);
 
 #ifdef __TRANSPARENT_SHADOWS__
 	if(blocked && kernel_data.integrator.transparent_shadows) {
-		if(shader_transparent_shadow(kg, &isect)) {
+		if(shader_transparent_shadow(kg, isect)) {
 			float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
 			float3 Pend = ray->P + ray->D*ray->t;
 			int bounce = state->transparent_bounce;
@@ -217,9 +221,8 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 				if(bounce >= kernel_data.integrator.transparent_max_bounce)
 					return true;
 
-				if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect, NULL, 0.0f, 0.0f))
+				if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f))
 				{
-
 #ifdef __VOLUME__
 					/* attenuation for last line segment towards light */
 					if(ps.volume_stack[0].shader != SHADER_NONE)
@@ -231,39 +234,44 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 					return false;
 				}
 
-				if(!shader_transparent_shadow(kg, &isect))
+				if(!shader_transparent_shadow(kg, isect))
 					return true;
 
 #ifdef __VOLUME__
 				/* attenuation between last surface and next surface */
 				if(ps.volume_stack[0].shader != SHADER_NONE) {
 					Ray segment_ray = *ray;
-					segment_ray.t = isect.t;
+					segment_ray.t = isect->t;
 					kernel_volume_shadow(kg, &ps, &segment_ray, &throughput);
 				}
 #endif
 
 				/* setup shader data at surface */
-				ShaderData sd;
-				shader_setup_from_ray(kg, &sd, &isect, ray, state->bounce+1, bounce);
+#ifdef __SPLIT_KERNEL__
+				ShaderData *sd = sd_mem;
+#else
+				ShaderData sd_object;
+				ShaderData *sd = &sd_object;
+#endif
+				shader_setup_from_ray(kg, sd, isect, ray, state->bounce+1, bounce);
 
 				/* attenuation from transparent surface */
-				if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
-					shader_eval_surface(kg, &sd, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
-					throughput *= shader_bsdf_transparency(kg, &sd);
+				if(!(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME)) {
+					shader_eval_surface(kg, sd, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
+					throughput *= shader_bsdf_transparency(kg, sd);
 				}
 
 				if(is_zero(throughput))
 					return true;
 
 				/* move ray forward */
-				ray->P = ray_offset(sd.P, -sd.Ng);
+				ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng));
 				if(ray->t != FLT_MAX)
 					ray->D = normalize_len(Pend - ray->P, &ray->t);
 
 #ifdef __VOLUME__
 				/* exit/enter volume */
-				kernel_volume_stack_enter_exit(kg, &sd, ps.volume_stack);
+				kernel_volume_stack_enter_exit(kg, sd, ps.volume_stack);
 #endif
 
 				bounce++;
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index fb927e81f22..2da060c32a2 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index ef46b2f707f..f545a056cc8 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef KERNEL_TEX
@@ -24,6 +24,7 @@
 
 /* bvh */
 KERNEL_TEX(float4, texture_float4, __bvh_nodes)
+KERNEL_TEX(float4, texture_float4, __bvh_leaf_nodes)
 KERNEL_TEX(float4, texture_float4, __tri_woop)
 KERNEL_TEX(uint, texture_uint, __prim_type)
 KERNEL_TEX(uint, texture_uint, __prim_visibility)
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 0ec34dae87a..60973a71d20 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __KERNEL_TYPES_H__
@@ -24,12 +24,19 @@
 #define __KERNEL_CPU__
 #endif
 
+/* TODO(sergey): This is only to make it possible to include this header
+ * from outside of the kernel, but this could be done somewhat cleaner?
+ */
+#ifndef ccl_addr_space
+#define ccl_addr_space
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 /* constants */
 #define OBJECT_SIZE 		11
 #define OBJECT_VECTOR_SIZE	6
-#define LIGHT_SIZE			4
+#define LIGHT_SIZE			5
 #define FILTER_TABLE_SIZE	256
 #define RAMP_TABLE_SIZE		256
 #define PARTICLE_SIZE 		5
@@ -38,12 +45,6 @@ CCL_NAMESPACE_BEGIN
 #define BSSRDF_MIN_RADIUS			1e-8f
 #define BSSRDF_MAX_HITS				4
 
-#define BB_DRAPPER				800.0f
-#define BB_MAX_TABLE_RANGE		12000.0f
-#define BB_TABLE_XPOWER			1.5f
-#define BB_TABLE_YPOWER			5.0f
-#define BB_TABLE_SPACING		2.0f
-
 #define BECKMANN_TABLE_SIZE		256
 
 #define TEX_NUM_FLOAT_IMAGES	5
@@ -57,6 +58,9 @@ CCL_NAMESPACE_BEGIN
 
 /* device capabilities */
 #ifdef __KERNEL_CPU__
+#ifdef __KERNEL_SSE2__
+#  define __QBVH__
+#endif
 #define __KERNEL_SHADING__
 #define __KERNEL_ADV_SHADING__
 #define __BRANCHED_PATH__
@@ -69,6 +73,7 @@ CCL_NAMESPACE_BEGIN
 #define __VOLUME_DECOUPLED__
 #define __VOLUME_SCATTER__
 #define __SHADOW_RECORD_ALL__
+#define __VOLUME_RECORD_ALL__
 #define __CAMERA_RAY_NODES__
 #endif
 
@@ -80,7 +85,7 @@ CCL_NAMESPACE_BEGIN
 #define __VOLUME_SCATTER__
 
 /* Experimental on GPU */
-#ifdef __KERNEL_CUDA_EXPERIMENTAL__
+#ifdef __KERNEL_EXPERIMENTAL__
 #define __SUBSURFACE__
 #define __CMJ__
 #endif
@@ -92,38 +97,51 @@ CCL_NAMESPACE_BEGIN
 /* keep __KERNEL_ADV_SHADING__ in sync with opencl_kernel_use_advanced_shading! */
 
 #ifdef __KERNEL_OPENCL_NVIDIA__
-#define __KERNEL_SHADING__
-#define __KERNEL_ADV_SHADING__
+#  define __KERNEL_SHADING__
+#  define __KERNEL_ADV_SHADING__
+#  ifdef __KERNEL_EXPERIMENTAL__
+#    define __CMJ__
+#  endif
 #endif
 
 #ifdef __KERNEL_OPENCL_APPLE__
-#define __KERNEL_SHADING__
-//#define __KERNEL_ADV_SHADING__
+#  define __KERNEL_SHADING__
+#  define __KERNEL_ADV_SHADING__
+/* TODO(sergey): Currently experimental section is ignored here,
+ * this is because megakernel in device_opencl does not support
+ * custom cflags depending on the scene features.
+ */
+#  ifdef __KERNEL_EXPERIMENTAL__
+#    define __CMJ__
+#  endif
 #endif
 
 #ifdef __KERNEL_OPENCL_AMD__
-#define __CL_USE_NATIVE__
-#define __KERNEL_SHADING__
-//__KERNEL_ADV_SHADING__
-#define __MULTI_CLOSURE__
-#define __TRANSPARENT_SHADOWS__
-#define __PASSES__
-#define __BACKGROUND_MIS__
-#define __LAMP_MIS__
-#define __AO__
-//#define __CAMERA_MOTION__
-//#define __OBJECT_MOTION__
-//#define __HAIR__
-//end __KERNEL_ADV_SHADING__
+#  define __CL_USE_NATIVE__
+#  define __KERNEL_SHADING__
+#  define __MULTI_CLOSURE__
+#  define __PASSES__
+#  define __BACKGROUND_MIS__
+#  define __LAMP_MIS__
+#  define __AO__
+#  define __CAMERA_MOTION__
+#  define __OBJECT_MOTION__
+#  define __HAIR__
+#  ifdef __KERNEL_EXPERIMENTAL__
+#    define __TRANSPARENT_SHADOWS__
+#  endif
 #endif
 
 #ifdef __KERNEL_OPENCL_INTEL_CPU__
-#define __CL_USE_NATIVE__
-#define __KERNEL_SHADING__
-#define __KERNEL_ADV_SHADING__
+#  define __CL_USE_NATIVE__
+#  define __KERNEL_SHADING__
+#  define __KERNEL_ADV_SHADING__
+#  ifdef __KERNEL_EXPERIMENTAL__
+#    define __CMJ__
+#  endif
 #endif
 
-#endif
+#endif // __KERNEL_OPENCL__
 
 /* kernel features */
 #define __SOBOL__
@@ -158,6 +176,21 @@ CCL_NAMESPACE_BEGIN
 #define __HAIR__
 #endif
 
+#ifdef WITH_CYCLES_DEBUG
+#  define __KERNEL_DEBUG__
+#endif
+
+/* Scene-based selective features compilation. */
+#ifdef __NO_CAMERA_MOTION__
+#  undef __CAMERA_MOTION__
+#endif
+#ifdef __NO_OBJECT_MOTION__
+#  undef __OBJECT_MOTION__
+#endif
+#ifdef __NO_HAIR__
+#  undef __HAIR__
+#endif
+
 /* Random Numbers */
 
 typedef uint RNG;
@@ -263,9 +296,7 @@ enum PathRayFlag {
 
 	PATH_RAY_MIS_SKIP = 2048,
 	PATH_RAY_DIFFUSE_ANCESTOR = 4096,
-	PATH_RAY_GLOSSY_ANCESTOR = 8192,
-	PATH_RAY_BSSRDF_ANCESTOR = 16384,
-	PATH_RAY_SINGLE_PASS_DONE = 32768,
+	PATH_RAY_SINGLE_PASS_DONE = 8192,
 
 	/* we need layer member flags to be the 20 upper bits */
 	PATH_RAY_LAYER_SHIFT = (32-20)
@@ -288,39 +319,44 @@ typedef enum ClosureLabel {
 
 typedef enum PassType {
 	PASS_NONE = 0,
-	PASS_COMBINED = 1,
-	PASS_DEPTH = 2,
-	PASS_NORMAL = 4,
-	PASS_UV = 8,
-	PASS_OBJECT_ID = 16,
-	PASS_MATERIAL_ID = 32,
-	PASS_DIFFUSE_COLOR = 64,
-	PASS_GLOSSY_COLOR = 128,
-	PASS_TRANSMISSION_COLOR = 256,
-	PASS_DIFFUSE_INDIRECT = 512,
-	PASS_GLOSSY_INDIRECT = 1024,
-	PASS_TRANSMISSION_INDIRECT = 2048,
-	PASS_DIFFUSE_DIRECT = 4096,
-	PASS_GLOSSY_DIRECT = 8192,
-	PASS_TRANSMISSION_DIRECT = 16384,
-	PASS_EMISSION = 32768,
-	PASS_BACKGROUND = 65536,
-	PASS_AO = 131072,
-	PASS_SHADOW = 262144,
-	PASS_MOTION = 524288,
-	PASS_MOTION_WEIGHT = 1048576,
-	PASS_MIST = 2097152,
-	PASS_SUBSURFACE_DIRECT = 4194304,
-	PASS_SUBSURFACE_INDIRECT = 8388608,
-	PASS_SUBSURFACE_COLOR = 16777216,
-	PASS_LIGHT = 33554432, /* no real pass, used to force use_light_pass */
+	PASS_COMBINED = (1 << 0),
+	PASS_DEPTH = (1 << 1),
+	PASS_NORMAL = (1 << 2),
+	PASS_UV = (1 << 3),
+	PASS_OBJECT_ID = (1 << 4),
+	PASS_MATERIAL_ID = (1 << 5),
+	PASS_DIFFUSE_COLOR = (1 << 6),
+	PASS_GLOSSY_COLOR = (1 << 7),
+	PASS_TRANSMISSION_COLOR = (1 << 8),
+	PASS_DIFFUSE_INDIRECT = (1 << 9),
+	PASS_GLOSSY_INDIRECT = (1 << 10),
+	PASS_TRANSMISSION_INDIRECT = (1 << 11),
+	PASS_DIFFUSE_DIRECT = (1 << 12),
+	PASS_GLOSSY_DIRECT = (1 << 13),
+	PASS_TRANSMISSION_DIRECT = (1 << 14),
+	PASS_EMISSION = (1 << 15),
+	PASS_BACKGROUND = (1 << 16),
+	PASS_AO = (1 << 17),
+	PASS_SHADOW = (1 << 18),
+	PASS_MOTION = (1 << 19),
+	PASS_MOTION_WEIGHT = (1 << 20),
+	PASS_MIST = (1 << 21),
+	PASS_SUBSURFACE_DIRECT = (1 << 22),
+	PASS_SUBSURFACE_INDIRECT = (1 << 23),
+	PASS_SUBSURFACE_COLOR = (1 << 24),
+	PASS_LIGHT = (1 << 25), /* no real pass, used to force use_light_pass */
+#ifdef __KERNEL_DEBUG__
+	PASS_BVH_TRAVERSAL_STEPS = (1 << 26),
+	PASS_BVH_TRAVERSED_INSTANCES = (1 << 27),
+	PASS_RAY_BOUNCES = (1 << 28),
+#endif
 } PassType;
 
 #define PASS_ALL (~0)
 
 #ifdef __PASSES__
 
-typedef struct PathRadiance {
+typedef ccl_addr_space struct PathRadiance {
 	int use_light_pass;
 
 	float3 emission;
@@ -372,7 +408,7 @@ typedef struct BsdfEval {
 
 #else
 
-typedef float3 PathRadiance;
+typedef ccl_addr_space float3 PathRadiance;
 typedef float3 BsdfEval;
 
 #endif
@@ -417,6 +453,7 @@ enum CameraType {
 
 enum PanoramaType {
 	PANORAMA_EQUIRECTANGULAR,
+	PANORAMA_MIRRORBALL,
 	PANORAMA_FISHEYE_EQUIDISTANT,
 	PANORAMA_FISHEYE_EQUISOLID
 };
@@ -436,10 +473,26 @@ typedef struct differential {
 /* Ray */
 
 typedef struct Ray {
+/* TODO(sergey): This is only needed because the current AMD
+ * compiler has a hard time building the kernel with this
+ * reshuffle. At the same time, the reshuffle will cause
+ * less optimal CPU code in certain places.
+ *
+ * We'll get rid of this nasty exception once the AMD compiler
+ * is fixed.
+ */
+#ifndef __KERNEL_OPENCL_AMD__
 	float3 P;		/* origin */
 	float3 D;		/* direction */
+
+	float t;		/* length of the ray */
+	float time;		/* time (for motion blur) */
+#else
 	float t;		/* length of the ray */
 	float time;		/* time (for motion blur) */
+	float3 P;		/* origin */
+	float3 D;		/* direction */
+#endif
 
 #ifdef __RAY_DIFFERENTIALS__
 	differential3 dP;
@@ -449,11 +502,16 @@ typedef struct Ray {
 
 /* Intersection */
 
-typedef struct Intersection {
+typedef ccl_addr_space struct Intersection {
 	float t, u, v;
 	int prim;
 	int object;
 	int type;
+
+#ifdef __KERNEL_DEBUG__
+	int num_traversal_steps;
+	int num_traversed_instances;
+#endif
 } Intersection;
 
 /* Primitives */
@@ -468,7 +526,12 @@ typedef enum PrimitiveType {
 	PRIMITIVE_ALL_TRIANGLE = (PRIMITIVE_TRIANGLE|PRIMITIVE_MOTION_TRIANGLE),
 	PRIMITIVE_ALL_CURVE = (PRIMITIVE_CURVE|PRIMITIVE_MOTION_CURVE),
 	PRIMITIVE_ALL_MOTION = (PRIMITIVE_MOTION_TRIANGLE|PRIMITIVE_MOTION_CURVE),
-	PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE|PRIMITIVE_ALL_CURVE)
+	PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE|PRIMITIVE_ALL_CURVE),
+
+	/* Total number of different primitives.
+	 * NOTE: This is an actual value, not a bitflag.
+	 */
+	PRIMITIVE_NUM_TOTAL = 4,
 } PrimitiveType;
 
 #define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << 16) | type)
@@ -516,6 +579,7 @@ typedef enum AttributeStandard {
 	ATTR_STD_VOLUME_FLAME,
 	ATTR_STD_VOLUME_HEAT,
 	ATTR_STD_VOLUME_VELOCITY,
+	ATTR_STD_POINTINESS,
 	ATTR_STD_NUM,
 
 	ATTR_STD_NOT_FOUND = ~0
@@ -524,39 +588,34 @@ typedef enum AttributeStandard {
 /* Closure data */
 
 #ifdef __MULTI_CLOSURE__
-#define MAX_CLOSURE 64
+#  ifndef __MAX_CLOSURE__
+#     define MAX_CLOSURE 64
+#  else
+#    define MAX_CLOSURE __MAX_CLOSURE__
+#  endif
 #else
 #define MAX_CLOSURE 1
 #endif
 
-/* TODO(sergey): This is rather nasty bug happening in here, which
- * could be simply a compilers bug for which we can't find a generic
- * platform independent workaround. Also even if it's a compiler
- * issue, it's not so simple to upgrade the compiler in the release
- * environment for linux and doing it so closer to the release is
- * rather a risky business.
- *
- * For this release it's probably safer to stick with such a rather
- * dirty solution, and look for a cleaner fix during the next release
- * cycle.
+/* This struct is to be 16 bytes aligned, we also keep some extra precautions:
+ * - All the float3 members are in the beginning of the struct, so compiler
+ *   does not put own padding trying to align this members.
+ * - We make sure OSL pointer is also 16 bytes aligned.
  */
-typedef struct ShaderClosure {
-	ClosureType type;
+typedef ccl_addr_space struct ShaderClosure {
 	float3 weight;
-#ifndef __APPLE__
+	float3 N;
+	float3 T;
+
+	ClosureType type;
 	float sample_weight;
-#endif
 	float data0;
 	float data1;
 	float data2;
+	int pad1, pad2, pad3;
 
-	float3 N;
-	float3 T;
-#ifdef __APPLE__
-	float sample_weight;
-#endif
 #ifdef __OSL__
-	void *prim;
+	void *prim, *pad4;
 #endif
 } ShaderClosure;
 
@@ -581,119 +640,70 @@ typedef enum ShaderContext {
 
 enum ShaderDataFlag {
 	/* runtime flags */
-	SD_BACKFACING = 1,		/* backside of surface? */
-	SD_EMISSION = 2,		/* have emissive closure? */
-	SD_BSDF = 4,			/* have bsdf closure? */
-	SD_BSDF_HAS_EVAL = 8,	/* have non-singular bsdf closure? */
-	SD_PHASE_HAS_EVAL = 8,	/* have non-singular phase closure? */
-	SD_BSDF_GLOSSY = 16,	/* have glossy bsdf */
-	SD_BSSRDF = 32,			/* have bssrdf */
-	SD_HOLDOUT = 64,		/* have holdout closure? */
-	SD_ABSORPTION = 128,	/* have volume absorption closure? */
-	SD_SCATTER = 256,		/* have volume phase closure? */
-	SD_AO = 512,			/* have ao closure? */
-	SD_TRANSPARENT = 1024,	/* have transparent closure? */
-
-	SD_CLOSURE_FLAGS = (SD_EMISSION|SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY|
-	                    SD_BSSRDF|SD_HOLDOUT|SD_ABSORPTION|SD_SCATTER|SD_AO),
+	SD_BACKFACING     = (1 << 0),   /* backside of surface? */
+	SD_EMISSION       = (1 << 1),   /* have emissive closure? */
+	SD_BSDF           = (1 << 2),   /* have bsdf closure? */
+	SD_BSDF_HAS_EVAL  = (1 << 3),   /* have non-singular bsdf closure? */
+	SD_BSSRDF         = (1 << 4),   /* have bssrdf */
+	SD_HOLDOUT        = (1 << 5),   /* have holdout closure? */
+	SD_ABSORPTION     = (1 << 6),   /* have volume absorption closure? */
+	SD_SCATTER        = (1 << 7),   /* have volume phase closure? */
+	SD_AO             = (1 << 8),   /* have ao closure? */
+	SD_TRANSPARENT    = (1 << 9),  /* have transparent closure? */
+
+	SD_CLOSURE_FLAGS = (SD_EMISSION|SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF|
+	                    SD_HOLDOUT|SD_ABSORPTION|SD_SCATTER|SD_AO),
 
 	/* shader flags */
-	SD_USE_MIS = 2048,					/* direct light sample */
-	SD_HAS_TRANSPARENT_SHADOW = 4096,	/* has transparent shadow */
-	SD_HAS_VOLUME = 8192,				/* has volume shader */
-	SD_HAS_ONLY_VOLUME = 16384,			/* has only volume shader, no surface */
-	SD_HETEROGENEOUS_VOLUME = 32768,	/* has heterogeneous volume */
-	SD_HAS_BSSRDF_BUMP = 65536,			/* bssrdf normal uses bump */
-	SD_VOLUME_EQUIANGULAR = 131072,		/* use equiangular sampling */
-	SD_VOLUME_MIS = 262144,				/* use multiple importance sampling */
+	SD_USE_MIS                = (1 << 10),  /* direct light sample */
+	SD_HAS_TRANSPARENT_SHADOW = (1 << 11),  /* has transparent shadow */
+	SD_HAS_VOLUME             = (1 << 12),  /* has volume shader */
+	SD_HAS_ONLY_VOLUME        = (1 << 13),  /* has only volume shader, no surface */
+	SD_HETEROGENEOUS_VOLUME   = (1 << 14),  /* has heterogeneous volume */
+	SD_HAS_BSSRDF_BUMP        = (1 << 15),  /* bssrdf normal uses bump */
+	SD_VOLUME_EQUIANGULAR     = (1 << 16),  /* use equiangular sampling */
+	SD_VOLUME_MIS             = (1 << 17),  /* use multiple importance sampling */
+	SD_VOLUME_CUBIC           = (1 << 18),  /* use cubic interpolation for voxels */
+	SD_HAS_BUMP               = (1 << 19),  /* has data connected to the displacement input */
 
 	SD_SHADER_FLAGS = (SD_USE_MIS|SD_HAS_TRANSPARENT_SHADOW|SD_HAS_VOLUME|
 	                   SD_HAS_ONLY_VOLUME|SD_HETEROGENEOUS_VOLUME|
-					   SD_HAS_BSSRDF_BUMP|SD_VOLUME_EQUIANGULAR|SD_VOLUME_MIS),
+	                   SD_HAS_BSSRDF_BUMP|SD_VOLUME_EQUIANGULAR|SD_VOLUME_MIS|
+	                   SD_VOLUME_CUBIC|SD_HAS_BUMP),
 
 	/* object flags */
-	SD_HOLDOUT_MASK = 524288,			/* holdout for camera rays */
-	SD_OBJECT_MOTION = 1048576,			/* has object motion blur */
-	SD_TRANSFORM_APPLIED = 2097152,		/* vertices have transform applied */
-	SD_NEGATIVE_SCALE_APPLIED = 4194304,	/* vertices have negative scale applied */
-
-	SD_OBJECT_FLAGS = (SD_HOLDOUT_MASK|SD_OBJECT_MOTION|SD_TRANSFORM_APPLIED)
+	SD_HOLDOUT_MASK             = (1 << 20),  /* holdout for camera rays */
+	SD_OBJECT_MOTION            = (1 << 21),  /* has object motion blur */
+	SD_TRANSFORM_APPLIED        = (1 << 22),  /* vertices have transform applied */
+	SD_NEGATIVE_SCALE_APPLIED   = (1 << 23),  /* vertices have negative scale applied */
+	SD_OBJECT_HAS_VOLUME        = (1 << 24),  /* object has a volume shader */
+	SD_OBJECT_INTERSECTS_VOLUME = (1 << 25),  /* object intersects AABB of an object with volume shader */
+	SD_OBJECT_HAS_VERTEX_MOTION = (1 << 26),  /* has position for motion vertices */
+
+	SD_OBJECT_FLAGS = (SD_HOLDOUT_MASK|SD_OBJECT_MOTION|SD_TRANSFORM_APPLIED|
+	                   SD_NEGATIVE_SCALE_APPLIED|SD_OBJECT_HAS_VOLUME|
+	                   SD_OBJECT_INTERSECTS_VOLUME)
 };
 
 struct KernelGlobals;
 
-typedef struct ShaderData {
-	/* position */
-	float3 P;
-	/* smooth normal for shading */
-	float3 N;
-	/* true geometric normal */
-	float3 Ng;
-	/* view/incoming direction */
-	float3 I;
-	/* shader id */
-	int shader;
-	/* booleans describing shader, see ShaderDataFlag */
-	int flag;
-
-	/* primitive id if there is one, ~0 otherwise */
-	int prim;
-
-	/* combined type and curve segment for hair */
-	int type;
-
-	/* parametric coordinates
-	 * - barycentric weights for triangles */
-	float u, v;
-	/* object id if there is one, ~0 otherwise */
-	int object;
-
-	/* motion blur sample time */
-	float time;
-	
-	/* length of the ray being shaded */
-	float ray_length;
-	
-	/* ray bounce depth */
-	int ray_depth;
-
-	/* ray transparent depth */
-	int transparent_depth;
-
-#ifdef __RAY_DIFFERENTIALS__
-	/* differential of P. these are orthogonal to Ng, not N */
-	differential3 dP;
-	/* differential of I */
-	differential3 dI;
-	/* differential of u, v */
-	differential du;
-	differential dv;
-#endif
-#ifdef __DPDU__
-	/* differential of P w.r.t. parametric coordinates. note that dPdu is
-	 * not readily suitable as a tangent for shading on triangles. */
-	float3 dPdu, dPdv;
-#endif
-
-#ifdef __OBJECT_MOTION__
-	/* object <-> world space transformations, cached to avoid
-	 * re-interpolating them constantly for shading */
-	Transform ob_tfm;
-	Transform ob_itfm;
+#ifdef __SPLIT_KERNEL__
+#define SD_VAR(type, what) ccl_global type *what;
+#define SD_CLOSURE_VAR(type, what, max_closure) type *what;
+#define TIDX (get_global_id(1) * get_global_size(0) + get_global_id(0))
+#define ccl_fetch(s, t) (s->t[TIDX])
+#define ccl_fetch_array(s, t, index) (&s->t[TIDX * MAX_CLOSURE + index])
+#else
+#define SD_VAR(type, what) type what;
+#define SD_CLOSURE_VAR(type, what, max_closure) type what[max_closure];
+#define ccl_fetch(s, t) (s->t)
+#define ccl_fetch_array(s, t, index) (&s->t[index])
 #endif
 
-	/* Closure data, we store a fixed array of closures */
-	ShaderClosure closure[MAX_CLOSURE];
-	int num_closure;
-	float randb_closure;
+typedef ccl_addr_space struct ShaderData {
 
-	/* ray start position, only set for backgrounds */
-	float3 ray_P;
-	differential3 ray_dP;
+#include "kernel_shaderdata_vars.h"
 
-#ifdef __OSL__
-	struct KernelGlobals *osl_globals;
-#endif
 } ShaderData;
 
 /* Path State */
@@ -711,7 +721,6 @@ typedef struct PathState {
 
 	/* random number generator state */
 	int rng_offset;    		/* dimension offset */
-	int rng_offset_bsdf;  	/* dimension offset for picking bsdf */
 	int sample;        		/* path sample number */
 	int num_samples;		/* total number of times this path will be sampled */
 
@@ -751,6 +760,7 @@ typedef struct KernelCamera {
 	int panorama_type;
 	float fisheye_fov;
 	float fisheye_lens;
+	float4 equirectangular_range;
 
 	/* matrices */
 	Transform cameratoworld;
@@ -768,7 +778,7 @@ typedef struct KernelCamera {
 
 	/* motion blur */
 	float shuttertime;
-	int have_motion;
+	int have_motion, have_perspective_motion;
 
 	/* clipping */
 	float nearclip;
@@ -789,7 +799,7 @@ typedef struct KernelCamera {
 	int shader;
 
 	float focal_length;
-	float pad[3];
+	float pad[2];
 
 	/* more matrices */
 	Transform screentoworld;
@@ -803,6 +813,11 @@ typedef struct KernelCamera {
 	Transform worldtocamera;
 
 	MotionTransform motion;
+
+	/* Denotes changes in the projective matrix, namely in rastertocamera.
+	 * Used for camera zoom motion blur.
+	 */
+	PerspectiveMotionTransform perspective_motion;
 } KernelCamera;
 
 typedef struct KernelFilm {
@@ -850,6 +865,13 @@ typedef struct KernelFilm {
 	float mist_start;
 	float mist_inv_depth;
 	float mist_falloff;
+
+#ifdef __KERNEL_DEBUG__
+	int pass_bvh_traversal_steps;
+	int pass_bvh_traversed_instances;
+	int pass_ray_bounces;
+	int pass_pad3;
+#endif
 } KernelFilm;
 
 typedef struct KernelBackground {
@@ -876,6 +898,11 @@ typedef struct KernelIntegrator {
 	float inv_pdf_lights;
 	int pdf_background_res;
 
+	/* light portals */
+	float portal_pdf;
+	int num_portals;
+	int portal_offset;
+
 	/* bounces */
 	int min_bounce;
 	int max_bounce;
@@ -928,6 +955,8 @@ typedef struct KernelIntegrator {
 	int volume_max_steps;
 	float volume_step_size;
 	int volume_samples;
+
+	int pad;
 } KernelIntegrator;
 
 typedef struct KernelBVH {
@@ -937,8 +966,8 @@ typedef struct KernelBVH {
 	int have_motion;
 	int have_curves;
 	int have_instancing;
-
-	int pad1, pad2, pad3;
+	int use_qbvh;
+	int pad1, pad2;
 } KernelBVH;
 
 typedef enum CurveFlag {
@@ -961,9 +990,8 @@ typedef struct KernelCurves {
 } KernelCurves;
 
 typedef struct KernelTables {
-	int blackbody_offset;
 	int beckmann_offset;
-	int pad1, pad2;
+	int pad1, pad2, pad3;
 } KernelTables;
 
 typedef struct KernelData {
@@ -983,6 +1011,68 @@ typedef struct CameraData {
 	int shader;
 } CameraData;
 
+#ifdef __KERNEL_DEBUG__
+typedef ccl_addr_space struct DebugData {
+	// Total number of BVH node traversal steps and primitive intersections
+	// for the camera rays.
+	int num_bvh_traversal_steps;
+	int num_bvh_traversed_instances;
+	int num_ray_bounces;
+} DebugData;
+#endif
+
+/* Declarations required for split kernel */
+
+/* Macro for queues */
+/* Value marking queue's empty slot */
+#define QUEUE_EMPTY_SLOT -1
+
+/*
+* Queue 1 - Active rays
+* Queue 2 - Background queue
+* Queue 3 - Shadow ray cast kernel - AO
+* Queue 4 - Shadow ray cast kernel - direct lighting
+*/
+#define NUM_QUEUES 4
+
+/* Queue names */
+enum QueueNumber {
+	QUEUE_ACTIVE_AND_REGENERATED_RAYS = 0,     /* All active rays and regenerated rays are enqueued here. */
+	QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS = 1,  /* All
+	                                            * 1. Background-hit rays,
+	                                            * 2. Rays that have exited path-iteration but need to update the output buffer,
+	                                            * 3. Rays to be regenerated
+	                                            * are enqueued here.
+	                                            */
+	QUEUE_SHADOW_RAY_CAST_AO_RAYS = 2,         /* All rays for which a shadow ray should be cast to determine radiance
+	                                            * contribution for AO are enqueued here.
+	                                            */
+	QUEUE_SHADOW_RAY_CAST_DL_RAYS = 3,         /* All rays for which a shadow ray should be cast to determine radiance
+	                                            * contribution for direct lighting are enqueued here.
+	                                            */
+};
+
+/* We use RAY_STATE_MASK to get ray_state (enums 0 to 6) */
+#define RAY_STATE_MASK 0x007
+#define RAY_FLAG_MASK 0x0F8
+enum RayState {
+	RAY_ACTIVE = 0,             // Denotes ray is actively involved in path-iteration
+	RAY_INACTIVE = 1,           // Denotes ray has completed processing all samples and is inactive
+	RAY_UPDATE_BUFFER = 2,      // Denotes ray has exited path-iteration and needs to update output buffer
+	RAY_HIT_BACKGROUND = 3,     // Denotes ray has hit background
+	RAY_TO_REGENERATE = 4,      // Denotes ray has to be regenerated
+	RAY_REGENERATED = 5,        // Denotes ray has been regenerated
+	RAY_SKIP_DL = 6,            // Denotes ray should skip direct lighting
+	RAY_SHADOW_RAY_CAST_AO = 16, // Flag: ray has to execute shadow blocked function in AO part
+	RAY_SHADOW_RAY_CAST_DL = 32 // Flag: ray has to execute shadow blocked function in direct lighting part
+};
+
+#define ASSIGN_RAY_STATE(ray_state, ray_index, state) ((ray_state)[(ray_index)] = (((ray_state)[(ray_index)] & RAY_FLAG_MASK) | (state)))  /* keep flag bits, replace the rest with state */
+#define IS_STATE(ray_state, ray_index, state) (((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state))  /* compare the state bits only */
+#define ADD_RAY_FLAG(ray_state, ray_index, flag) ((ray_state)[(ray_index)] = ((ray_state)[(ray_index)] | (flag)))  /* set the given flag bit(s) */
+#define REMOVE_RAY_FLAG(ray_state, ray_index, flag) ((ray_state)[(ray_index)] = ((ray_state)[(ray_index)] & (~(flag))))  /* clear the given flag bit(s) */
+#define IS_FLAG(ray_state, ray_index, flag) ((ray_state)[(ray_index)] & (flag))  /* non-zero if any of the given flag bit(s) is set */
+
 CCL_NAMESPACE_END
 
 #endif /*  __KERNEL_TYPES_H__ */
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index 1273869ca28..0a74a9deba9 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -374,7 +374,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 			/* distance sampling */
 			sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf);
 
-			/* modifiy pdf for hit/miss decision */
+			/* modify pdf for hit/miss decision */
 			if(probalistic_scatter)
 				pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(sigma_t, t);
 
@@ -422,7 +422,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 
 /* heterogeneous volume distance sampling: integrate stepping through the
  * volume until we reach the end, get absorbed entirely, or run out of
- * iterations. this does probalistically scatter or get transmitted through
+ * iterations. this does probabilistically scatter or get transmitted through
  * for path tracing where we don't want to branch. */
 ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
 	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, RNG *rng)
@@ -578,10 +578,11 @@ ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals
 /* Decoupled Volume Sampling
  *
  * VolumeSegment is list of coefficients and transmittance stored at all steps
- * through a volume. This can then latter be used for decoupled sampling as in:
+ * through a volume. This can then later be used for decoupled sampling as in:
  * "Importance Sampling Techniques for Path Tracing in Participating Media"
  *
- * On the GPU this is only supported for homogeneous volumes (1 step), due to
+ * On the GPU this is only supported (but currently not enabled)
+ * for homogeneous volumes (1 step), due to
  * no support for malloc/free and too much stack usage with a fix size array. */
 
 typedef struct VolumeStep {
@@ -595,6 +596,7 @@ typedef struct VolumeStep {
 } VolumeStep;
 
 typedef struct VolumeSegment {
+	VolumeStep stack_step;      /* stack storage for homogeneous step, to avoid malloc */
 	VolumeStep *steps;			/* recorded steps */
 	int numsteps;				/* number of steps */
 	int closure_flag;			/* accumulated closure flags from all steps */
@@ -608,7 +610,7 @@ typedef struct VolumeSegment {
 /* record volume steps to the end of the volume.
  *
  * it would be nice if we could only record up to the point that we need to scatter,
- * but the entire segment is needed to do always scattering, rather than probalistically
+ * but the entire segment is needed to do always scattering, rather than probabilistically
  * hitting or missing the volume. if we don't know the transmittance at the end of the
  * volume we can't generate stratified distance samples up to that transmittance */
 ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state,
@@ -621,17 +623,22 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 	float step_size, random_jitter_offset;
 
 	if(heterogeneous) {
-		max_steps = kernel_data.integrator.volume_max_steps;
+		const int global_max_steps = kernel_data.integrator.volume_max_steps;
 		step_size = kernel_data.integrator.volume_step_size;
-		random_jitter_offset = lcg_step_float(&state->rng_congruential) * step_size;
-
 		/* compute exact steps in advance for malloc */
 		max_steps = max((int)ceilf(ray->t/step_size), 1);
+		if(max_steps > global_max_steps) {
+			max_steps = global_max_steps;
+			step_size = ray->t / (float)max_steps;
+		}
+		segment->steps = (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps);
+		random_jitter_offset = lcg_step_float(&state->rng_congruential) * step_size;
 	}
 	else {
 		max_steps = 1;
 		step_size = ray->t;
 		random_jitter_offset = 0.0f;
+		segment->steps = &segment->stack_step;
 	}
 	
 	/* init accumulation variables */
@@ -640,10 +647,9 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 	float3 cdf_distance = make_float3(0.0f, 0.0f, 0.0f);
 	float t = 0.0f;
 
-	segment->closure_flag = 0;
 	segment->numsteps = 0;
-
-	segment->steps = (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps);
+	segment->closure_flag = 0;
+	bool is_last_step_empty = false;
 
 	VolumeStep *step = segment->steps;
 
@@ -685,12 +691,24 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 			step->closure_flag = closure_flag;
 
 			segment->closure_flag |= closure_flag;
+
+			is_last_step_empty = false;
+			segment->numsteps++;
 		}
 		else {
-			/* store empty step (todo: skip consecutive empty steps) */
-			step->sigma_t = make_float3(0.0f, 0.0f, 0.0f);
-			step->sigma_s = make_float3(0.0f, 0.0f, 0.0f);
-			step->closure_flag = 0;
+			if(is_last_step_empty) {
+				/* consecutive empty step, merge */
+				step--;
+			}
+			else {
+				/* store empty step */
+				step->sigma_t = make_float3(0.0f, 0.0f, 0.0f);
+				step->sigma_s = make_float3(0.0f, 0.0f, 0.0f);
+				step->closure_flag = 0;
+
+				segment->numsteps++;
+				is_last_step_empty = true;
+			}
 		}
 
 		step->accum_transmittance = accum_transmittance;
@@ -698,8 +716,6 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 		step->t = new_t;
 		step->shade_t = t + random_jitter_offset;
 
-		segment->numsteps++;
-
 		/* stop if at the end of the volume */
 		t = new_t;
 		if(t == ray->t)
@@ -729,16 +745,13 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 
 ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *segment)
 {
-	free(segment->steps);
+	if(segment->steps != &segment->stack_step)
+		free(segment->steps);
 }
 
 /* scattering for homogeneous and heterogeneous volumes, using decoupled ray
- * marching. unlike the non-decoupled functions, these do not do probalistic
- * scattering, they always scatter if there is any non-zero scattering
- * coefficient.
+ * marching. this function does not do emission or modify throughput. 
  *
- * these also do not do emission or modify throughput. 
- * 
  * function is expected to return VOLUME_PATH_SCATTERED when probalistic_scatter is false */
 ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 	KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd,
@@ -753,7 +766,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 	sd->randb_closure = rphase*3.0f - channel;
 	float xi = rscatter;
 
-	/* probalistic scattering decision based on transmittance */
+	/* probabilistic scattering decision based on transmittance */
 	if(probalistic_scatter) {
 		float sample_transmittance = kernel_volume_channel_get(segment->accum_transmittance, channel);
 
@@ -833,7 +846,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 		float3 distance_pdf;
 		sample_t = prev_t + kernel_volume_distance_sample(step_t, step->sigma_t, channel, xi, &transmittance, &distance_pdf);
 
-		/* modifiy pdf for hit/miss decision */
+		/* modify pdf for hit/miss decision */
 		if(probalistic_scatter)
 			distance_pdf *= make_float3(1.0f, 1.0f, 1.0f) - segment->accum_transmittance;
 
@@ -929,7 +942,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 /* decide if we need to use decoupled or not */
 ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneous, bool direct, int sampling_method)
 {
-	/* decoupled ray marching for heterogenous volumes not supported on the GPU,
+	/* decoupled ray marching for heterogeneous volumes not supported on the GPU,
 	 * which also means equiangular and multiple importance sampling is not
 	 * support for that case */
 #ifdef __KERNEL_GPU__
@@ -958,7 +971,7 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
                                          Ray *ray,
                                          VolumeStack *stack)
 {
-	/* NULL ray happens in the baker, does it need proper initializetion of
+	/* NULL ray happens in the baker, does it need proper initialization of
 	 * camera in volume?
 	 */
 	if(!kernel_data.cam.is_inside_volume || ray == NULL) {
@@ -976,25 +989,26 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
 		return;
 	}
 
-	const float3 Pend = ray->P + ray->D*ray->t;
 	Ray volume_ray = *ray;
-	int stack_index = 0, enclosed_index = 0;
-	int enclosed_volumes[VOLUME_STACK_SIZE];
+	volume_ray.t = FLT_MAX;
 
-	while(stack_index < VOLUME_STACK_SIZE - 1 &&
-	      enclosed_index < VOLUME_STACK_SIZE - 1)
-	{
-		Intersection isect;
-		bool hit = scene_intersect(kg, &volume_ray, PATH_RAY_ALL_VISIBILITY,
-		                           &isect,
-		                           NULL, 0.0f, 0.0f);
-		if(!hit) {
-			break;
-		}
+	int stack_index = 0, enclosed_index = 0;
 
-		ShaderData sd;
-		shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0);
-		if(sd.flag & SD_HAS_VOLUME) {
+#ifdef __VOLUME_RECORD_ALL__
+	Intersection hits[2*VOLUME_STACK_SIZE];
+	uint num_hits = scene_intersect_volume_all(kg,
+	                                           &volume_ray,
+	                                           hits,
+	                                           2*VOLUME_STACK_SIZE);
+	if(num_hits > 0) {
+		int enclosed_volumes[VOLUME_STACK_SIZE];
+		Intersection *isect = hits;
+
+		qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
+
+		for(uint hit = 0; hit < num_hits; ++hit, ++isect) {
+			ShaderData sd;
+			shader_setup_from_ray(kg, &sd, isect, &volume_ray, 0, 0);
 			if(sd.flag & SD_BACKFACING) {
 				/* If ray exited the volume and never entered to that volume
 				 * it means that camera is inside such a volume.
@@ -1014,24 +1028,56 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
 			}
 			else {
 				/* If ray from camera enters the volume, this volume shouldn't
-				 * be added to the stak on exit.
+				 * be added to the stack on exit.
 				 */
 				enclosed_volumes[enclosed_index++] = sd.object;
 			}
 		}
+	}
+#else
+	int enclosed_volumes[VOLUME_STACK_SIZE];
+	int step = 0;
 
-		/* Move ray forward. */
-		volume_ray.P = ray_offset(sd.P, -sd.Ng);
-		if(volume_ray.t != FLT_MAX) {
-			volume_ray.D = normalize_len(Pend - volume_ray.P, &volume_ray.t);
-			/* TODO(sergey): Find a faster way detecting that ray_offset moved
-			 * us pass through the end point.
+	while(stack_index < VOLUME_STACK_SIZE - 1 &&
+	      enclosed_index < VOLUME_STACK_SIZE - 1 &&
+	      step < 2 * VOLUME_STACK_SIZE)
+	{
+		Intersection isect;
+		if(!scene_intersect_volume(kg, &volume_ray, &isect)) {
+			break;
+		}
+
+		ShaderData sd;
+		shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0);
+		if(sd.flag & SD_BACKFACING) {
+			/* If ray exited the volume and never entered to that volume
+			 * it means that camera is inside such a volume.
 			 */
-			if(dot(ray->D, volume_ray.D) < 0.0f) {
-				break;
+			bool is_enclosed = false;
+			for(int i = 0; i < enclosed_index; ++i) {
+				if(enclosed_volumes[i] == sd.object) {
+					is_enclosed = true;
+					break;
+				}
 			}
+			if(is_enclosed == false) {
+				stack[stack_index].object = sd.object;
+				stack[stack_index].shader = sd.shader;
+				++stack_index;
+			}
+		}
+		else {
+			/* If ray from camera enters the volume, this volume shouldn't
+			 * be added to the stack on exit.
+			 */
+			enclosed_volumes[enclosed_index++] = sd.object;
 		}
+
+		/* Move ray forward. */
+		volume_ray.P = ray_offset(sd.P, -sd.Ng);
+		++step;
 	}
+#endif
 	/* stack_index of 0 means quick checks outside of the kernel gave false
 	 * positive, nothing to worry about, just we've wasted quite a few of
 	 * ticks just to come into conclusion that camera is in the air.
@@ -1094,4 +1140,49 @@ ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd
 	}
 }
 
+#ifdef __SUBSURFACE__
+ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg,
+                                                          Ray *ray,
+                                                          VolumeStack *stack)
+{
+	kernel_assert(kernel_data.integrator.use_volumes);
+
+	Ray volume_ray = *ray;
+
+#ifdef __VOLUME_RECORD_ALL__
+	Intersection hits[2*VOLUME_STACK_SIZE];
+	uint num_hits = scene_intersect_volume_all(kg,
+	                                           &volume_ray,
+	                                           hits,
+	                                           2*VOLUME_STACK_SIZE);
+	if(num_hits > 0) {
+		Intersection *isect = hits;
+
+		qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
+
+		for(uint hit = 0; hit < num_hits; ++hit, ++isect) {
+			ShaderData sd;
+			shader_setup_from_ray(kg, &sd, isect, &volume_ray, 0, 0);
+			kernel_volume_stack_enter_exit(kg, &sd, stack);
+		}
+	}
+#else
+	Intersection isect;
+	int step = 0;
+	while(step < 2 * VOLUME_STACK_SIZE &&
+	      scene_intersect_volume(kg, &volume_ray, &isect))
+	{
+		ShaderData sd;
+		shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0);
+		kernel_volume_stack_enter_exit(kg, &sd, stack);
+
+		/* Move ray forward. */
+		volume_ray.P = ray_offset(sd.P, -sd.Ng);
+		volume_ray.t -= sd.ray_length;
+		++step;
+	}
+#endif
+}
+#endif
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
new file mode 100644
index 00000000000..9b83d972e97
--- /dev/null
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_WORK_STEALING_H__
+#define __KERNEL_WORK_STEALING_H__
+
+/*
+ * Utility functions for work stealing
+ */
+
+#ifdef __WORK_STEALING__
+
+#ifdef __KERNEL_OPENCL__
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#endif
+
+uint get_group_id_with_ray_index(uint ray_index,
+                                 uint tile_dim_x,
+                                 uint tile_dim_y,
+                                 uint parallel_samples,
+                                 int dim)
+{
+	if(dim == 0) {
+		uint x_span = ray_index % (tile_dim_x * parallel_samples);
+		return x_span / get_local_size(0);
+	}
+	else /*if(dim == 1)*/ {
+		kernel_assert(dim == 1);
+		uint y_span = ray_index / (tile_dim_x * parallel_samples);
+		return y_span / get_local_size(1);
+	}
+}
+
+uint get_total_work(uint tile_dim_x,
+                    uint tile_dim_y,
+                    uint grp_idx,
+                    uint grp_idy,
+                    uint num_samples)
+{
+	uint threads_within_tile_border_x =
+		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
+		                                     : get_local_size(0);
+	uint threads_within_tile_border_y =
+		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
+		                                     : get_local_size(1);
+	/* A zero remainder means the tile dimension is a multiple of the local size, so the last group is full. */
+	threads_within_tile_border_x =
+		(threads_within_tile_border_x == 0) ? get_local_size(0)
+		                                    : threads_within_tile_border_x;
+	threads_within_tile_border_y =
+		(threads_within_tile_border_y == 0) ? get_local_size(1)
+		                                    : threads_within_tile_border_y;
+	/* Total work for the group: threads inside the tile border times the number of samples. */
+	return threads_within_tile_border_x *
+	       threads_within_tile_border_y *
+	       num_samples;
+}
+
+/* Claims the next work item from the work pool entry of the group that ray_index maps to, storing it in *my_work. */
+/* Returns 1 if the claimed work is valid (below the group's total work), 0 if no work is left. */
+int get_next_work(ccl_global uint *work_pool,
+                  ccl_private uint *my_work,
+                  uint tile_dim_x,
+                  uint tile_dim_y,
+                  uint num_samples,
+                  uint parallel_samples,
+                  uint ray_index)
+{
+	uint grp_idx = get_group_id_with_ray_index(ray_index,
+	                                           tile_dim_x,
+	                                           tile_dim_y,
+	                                           parallel_samples,
+	                                           0);
+	uint grp_idy = get_group_id_with_ray_index(ray_index,
+	                                           tile_dim_x,
+	                                           tile_dim_y,
+	                                           parallel_samples,
+	                                           1);
+	uint total_work = get_total_work(tile_dim_x,
+	                                 tile_dim_y,
+	                                 grp_idx,
+	                                 grp_idy,
+	                                 num_samples);
+	uint group_index = grp_idy * get_num_groups(0) + grp_idx;  /* one pool counter per work-group */
+	*my_work = atomic_inc(&work_pool[group_index]);  /* atomic_inc returns the value before the increment */
+	return (*my_work < total_work) ? 1 : 0;
+}
+
+/* This function assumes that the passed my_work is valid. */
+/* Decode sample number w.r.t. assigned my_work. */
+uint get_my_sample(uint my_work,
+                   uint tile_dim_x,
+                   uint tile_dim_y,
+                   uint parallel_samples,
+                   uint ray_index)
+{
+	uint grp_idx = get_group_id_with_ray_index(ray_index,
+	                                           tile_dim_x,
+	                                           tile_dim_y,
+	                                           parallel_samples,
+	                                           0);
+	uint grp_idy = get_group_id_with_ray_index(ray_index,
+	                                           tile_dim_x,
+	                                           tile_dim_y,
+	                                           parallel_samples,
+	                                           1);
+	uint threads_within_tile_border_x =
+		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
+		                                     : get_local_size(0);
+	uint threads_within_tile_border_y =
+		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
+		                                     : get_local_size(1);
+
+	threads_within_tile_border_x =
+		(threads_within_tile_border_x == 0) ? get_local_size(0)
+		                                    : threads_within_tile_border_x;
+	threads_within_tile_border_y =
+		(threads_within_tile_border_y == 0) ? get_local_size(1)
+		                                    : threads_within_tile_border_y;
+
+	return my_work /
+	       (threads_within_tile_border_x * threads_within_tile_border_y);
+}
+
+/* Decode pixel and tile position w.r.t. assigned my_work. */
+void get_pixel_tile_position(ccl_private uint *pixel_x,
+                             ccl_private uint *pixel_y,
+                             ccl_private uint *tile_x,
+                             ccl_private uint *tile_y,
+                             uint my_work,
+                             uint tile_dim_x,
+                             uint tile_dim_y,
+                             uint tile_offset_x,
+                             uint tile_offset_y,
+                             uint parallel_samples,
+                             uint ray_index)
+{
+	uint grp_idx = get_group_id_with_ray_index(ray_index,
+	                                           tile_dim_x,
+	                                           tile_dim_y,
+	                                           parallel_samples,
+	                                           0);
+	uint grp_idy = get_group_id_with_ray_index(ray_index,
+	                                           tile_dim_x,
+	                                           tile_dim_y,
+	                                           parallel_samples,
+	                                           1);
+	uint threads_within_tile_border_x =
+		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
+		                                     : get_local_size(0);
+	uint threads_within_tile_border_y =
+		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
+		                                     : get_local_size(1);
+
+	threads_within_tile_border_x =
+		(threads_within_tile_border_x == 0) ? get_local_size(0)
+		                                    : threads_within_tile_border_x;
+	threads_within_tile_border_y =
+		(threads_within_tile_border_y == 0) ? get_local_size(1)
+		                                    : threads_within_tile_border_y;
+
+	uint total_associated_pixels =
+		threads_within_tile_border_x * threads_within_tile_border_y;
+	uint work_group_pixel_index = my_work % total_associated_pixels;
+	uint work_group_pixel_x =
+		work_group_pixel_index % threads_within_tile_border_x;
+	uint work_group_pixel_y =
+		work_group_pixel_index / threads_within_tile_border_x;
+
+	*pixel_x =
+		tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x;
+	*pixel_y =
+		tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y;
+	*tile_x = *pixel_x - tile_offset_x;
+	*tile_y = *pixel_y - tile_offset_y;
+}
+
+#endif  /* __WORK_STEALING__ */
+
+#endif  /* __KERNEL_WORK_STEALING_H__ */
diff --git a/intern/cycles/kernel/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp
index fa2113fbb46..2c8d3503c1a 100644
--- a/intern/cycles/kernel/kernel.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp
@@ -11,18 +11,19 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* CPU kernel entry points */
 
-#include "kernel.h"
 #include "kernel_compat_cpu.h"
+#include "kernel.h"
 #include "kernel_math.h"
 #include "kernel_types.h"
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
+#include "kernel_path_branched.h"
 #include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
@@ -37,7 +38,14 @@ void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t s
 		assert(0);
 }
 
-void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t width, size_t height, size_t depth, InterpolationType interpolation)
+void kernel_tex_copy(KernelGlobals *kg,
+                     const char *name,
+                     device_ptr mem,
+                     size_t width,
+                     size_t height,
+                     size_t depth,
+                     InterpolationType interpolation,
+                     ExtensionType extension)
 {
 	if(0) {
 	}
@@ -55,7 +63,7 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
 		int id = atoi(name + strlen("__tex_image_float_"));
 		int array_index = id;
 
-		if (array_index >= 0 && array_index < MAX_FLOAT_IMAGES) {
+		if(array_index >= 0 && array_index < MAX_FLOAT_IMAGES) {
 			tex = &kg->texture_float_images[array_index];
 		}
 
@@ -63,6 +71,7 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
 			tex->data = (float4*)mem;
 			tex->dimensions_set(width, height, depth);
 			tex->interpolation = interpolation;
+			tex->extension = extension;
 		}
 	}
 	else if(strstr(name, "__tex_image")) {
@@ -70,7 +79,7 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
 		int id = atoi(name + strlen("__tex_image_"));
 		int array_index = id - MAX_FLOAT_IMAGES;
 
-		if (array_index >= 0 && array_index < MAX_BYTE_IMAGES) {
+		if(array_index >= 0 && array_index < MAX_BYTE_IMAGES) {
 			tex = &kg->texture_byte_images[array_index];
 		}
 
@@ -78,6 +87,7 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
 			tex->data = (uchar4*)mem;
 			tex->dimensions_set(width, height, depth);
 			tex->interpolation = interpolation;
+			tex->extension = extension;
 		}
 	}
 	else
diff --git a/intern/cycles/kernel/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
index e7ff21a6f09..df77bedc729 100644
--- a/intern/cycles/kernel/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* Optimized CPU kernel entry points. This file is compiled with AVX
@@ -31,13 +31,14 @@
  
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
 
-#include "kernel.h"
 #include "kernel_compat_cpu.h"
+#include "kernel.h"
 #include "kernel_math.h"
 #include "kernel_types.h"
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
+#include "kernel_path_branched.h"
 #include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
index cb1662bbfbe..b3192369794 100644
--- a/intern/cycles/kernel/kernel_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* Optimized CPU kernel entry points. This file is compiled with AVX2
@@ -32,13 +32,14 @@
  
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
 
-#include "kernel.h"
 #include "kernel_compat_cpu.h"
+#include "kernel.h"
 #include "kernel_math.h"
 #include "kernel_types.h"
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
+#include "kernel_path_branched.h"
 #include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
index 740998e8c92..f9c5134e442 100644
--- a/intern/cycles/kernel/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* Optimized CPU kernel entry points. This file is compiled with SSE2
@@ -27,13 +27,14 @@
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 
-#include "kernel.h"
 #include "kernel_compat_cpu.h"
+#include "kernel.h"
 #include "kernel_math.h"
 #include "kernel_types.h"
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
+#include "kernel_path_branched.h"
 #include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
index da73a3a1c97..2dbe4b81821 100644
--- a/intern/cycles/kernel/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
@@ -29,13 +29,14 @@
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 
-#include "kernel.h"
 #include "kernel_compat_cpu.h"
+#include "kernel.h"
 #include "kernel_math.h"
 #include "kernel_types.h"
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
+#include "kernel_path_branched.h"
 #include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
index 5704f60e138..5c57ad01181 100644
--- a/intern/cycles/kernel/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
@@ -30,13 +30,14 @@
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
 
-#include "kernel.h"
 #include "kernel_compat_cpu.h"
+#include "kernel.h"
 #include "kernel_math.h"
 #include "kernel_types.h"
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
+#include "kernel_path_branched.h"
 #include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 9ed4592f604..bcd55b8c676 100644
--- a/intern/cycles/kernel/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -11,18 +11,19 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* CUDA kernel entry points */
 
-#include "kernel_compat_cuda.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_film.h"
-#include "kernel_path.h"
-#include "kernel_bake.h"
+#include "../../kernel_compat_cuda.h"
+#include "../../kernel_math.h"
+#include "../../kernel_types.h"
+#include "../../kernel_globals.h"
+#include "../../kernel_film.h"
+#include "../../kernel_path.h"
+#include "../../kernel_path_branched.h"
+#include "../../kernel_bake.h"
 
 /* device data taken from CUDA occupancy calculator */
 
@@ -52,6 +53,18 @@
 #define CUDA_KERNEL_MAX_REGISTERS 63
 #define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
 
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+#define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#define CUDA_BLOCK_MAX_THREADS 1024
+#define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#define CUDA_THREADS_BLOCK_WIDTH 16
+#define CUDA_KERNEL_MAX_REGISTERS 63
+#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
 /* 5.0 and 5.2 */
 #elif __CUDA_ARCH__ == 500 || __CUDA_ARCH__ == 520
 #define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
diff --git a/intern/cycles/kernel/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index 4f20ef9ca15..57db6fd9098 100644
--- a/intern/cycles/kernel/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -11,19 +11,22 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* OpenCL kernel entry points - unfinished */
 
-#include "kernel_compat_opencl.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
+#include "../../kernel_compat_opencl.h"
+#include "../../kernel_math.h"
+#include "../../kernel_types.h"
+#include "../../kernel_globals.h"
 
-#include "kernel_film.h"
-#include "kernel_path.h"
-#include "kernel_bake.h"
+#include "../../kernel_film.h"
+#include "../../kernel_path.h"
+#include "../../kernel_path_branched.h"
+#include "../../kernel_bake.h"
+
+#ifdef __COMPILE_ONLY_MEGAKERNEL__
 
 __kernel void kernel_ocl_path_trace(
 	ccl_constant KernelData *data,
@@ -32,7 +35,7 @@ __kernel void kernel_ocl_path_trace(
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
 	int sample,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -43,7 +46,7 @@ __kernel void kernel_ocl_path_trace(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
 	int x = sx + get_global_id(0);
 	int y = sy + get_global_id(1);
@@ -52,17 +55,18 @@ __kernel void kernel_ocl_path_trace(
 		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
 }
 
-__kernel void kernel_ocl_convert_to_byte(
+#else // __COMPILE_ONLY_MEGAKERNEL__
+
+__kernel void kernel_ocl_shader(
 	ccl_constant KernelData *data,
-	ccl_global uchar4 *rgba,
-	ccl_global float *buffer,
+	ccl_global uint4 *input,
+	ccl_global float4 *output,
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
-	float sample_scale,
-	int sx, int sy, int sw, int sh, int offset, int stride)
+	int type, int sx, int sw, int offset, int sample)
 {
 	KernelGlobals kglobals, *kg = &kglobals;
 
@@ -70,26 +74,24 @@ __kernel void kernel_ocl_convert_to_byte(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
 	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
 
-	if(x < sx + sw && y < sy + sh)
-		kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+	if(x < sx + sw)
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
 }
 
-__kernel void kernel_ocl_convert_to_half_float(
+__kernel void kernel_ocl_bake(
 	ccl_constant KernelData *data,
-	ccl_global uchar4 *rgba,
-	ccl_global float *buffer,
+	ccl_global uint4 *input,
+	ccl_global float4 *output,
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
-	float sample_scale,
-	int sx, int sy, int sw, int sh, int offset, int stride)
+	int type, int sx, int sw, int offset, int sample)
 {
 	KernelGlobals kglobals, *kg = &kglobals;
 
@@ -97,25 +99,30 @@ __kernel void kernel_ocl_convert_to_half_float(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
 	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
 
-	if(x < sx + sw && y < sy + sh)
-		kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+	if(x < sx + sw) {
+#ifdef __NO_BAKING__
+		output[x] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+#else
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample);
+#endif
+	}
 }
 
-__kernel void kernel_ocl_shader(
+__kernel void kernel_ocl_convert_to_byte(
 	ccl_constant KernelData *data,
-	ccl_global uint4 *input,
-	ccl_global float4 *output,
+	ccl_global uchar4 *rgba,
+	ccl_global float *buffer,
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
-	int type, int sx, int sw, int offset, int sample)
+	float sample_scale,
+	int sx, int sy, int sw, int sh, int offset, int stride)
 {
 	KernelGlobals kglobals, *kg = &kglobals;
 
@@ -123,24 +130,26 @@ __kernel void kernel_ocl_shader(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
 	int x = sx + get_global_id(0);
+	int y = sy + get_global_id(1);
 
-	if(x < sx + sw)
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
+	if(x < sx + sw && y < sy + sh)
+		kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
-__kernel void kernel_ocl_bake(
+__kernel void kernel_ocl_convert_to_half_float(
 	ccl_constant KernelData *data,
-	ccl_global uint4 *input,
-	ccl_global float4 *output,
+	ccl_global uchar4 *rgba,
+	ccl_global float *buffer,
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
-	int type, int sx, int sw, int offset, int sample)
+	float sample_scale,
+	int sx, int sy, int sw, int sh, int offset, int stride)
 {
 	KernelGlobals kglobals, *kg = &kglobals;
 
@@ -148,11 +157,13 @@ __kernel void kernel_ocl_bake(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
 	int x = sx + get_global_id(0);
+	int y = sy + get_global_id(1);
 
-	if(x < sx + sw)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample);
+	if(x < sx + sw && y < sy + sh)
+		kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
+#endif // __COMPILE_ONLY_MEGAKERNEL__
+\ No newline at end of file
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
new file mode 100644
index 00000000000..eff77b89a0a
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
@@ -0,0 +1,128 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_background_buffer_update.h"
+
+__kernel void kernel_ocl_path_trace_background_buffer_update(  /* Wrapper: kernel_background_buffer_update() plus queue bookkeeping. */
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_state,
+        ccl_global uint *rng_coop,             /* Required for buffer update */
+        ccl_global float3 *throughput_coop,    /* Required for background hit processing */
+        PathRadiance *PathRadiance_coop,       /* Required for background hit processing and buffer update */
+        ccl_global Ray *Ray_coop,              /* Required for background hit processing */
+        ccl_global PathState *PathState_coop,  /* Required for background hit processing */
+        ccl_global float *L_transparent_coop,  /* Required for background hit processing and buffer update */
+        ccl_global char *ray_state,            /* Stores information on the current state of a ray */
+        int sw, int sh, int sx, int sy, int stride,
+        int rng_state_offset_x,
+        int rng_state_offset_y,
+        int rng_state_stride,
+        ccl_global unsigned int *work_array,   /* Denotes work of each ray */
+        ccl_global int *Queue_data,            /* Queues memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize,                         /* Size (capacity) of each queue */
+        int end_sample,
+        int start_sample,
+#ifdef __WORK_STEALING__
+        ccl_global unsigned int *work_pool_wgs,
+        unsigned int num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,             /* NOTE(review): no explicit address-space qualifier (same for PathRadiance_coop) - confirm the type supplies it */
+#endif
+        int parallel_samples)                  /* Number of samples to be processed in parallel */
+{
+	ccl_local unsigned int local_queue_atomics;  /* Work-group-local counter, passed to enqueue_ray_index_local() below. */
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {  /* Only the first work-item of the group zeroes it. */
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);  /* Every work-item waits here, after the zeroing above. */
+
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);  /* Linear index from the 2D global id. */
+	if(ray_index == 0) {
+		/* Only global thread 0 resets the element count; this kernel empties the queue. */
+		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+	}
+	char enqueue_flag = 0;  /* Stays 0 unless kernel_background_buffer_update() returns non-zero. */
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          1);  /* NOTE(review): meaning of the trailing 1 is defined by get_ray_index() - confirm. */
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {  /* Non-GPU build: skip the work but do not return early (see comment above). */
+#endif
+		enqueue_flag =
+			kernel_background_buffer_update(globals,
+			                                data,
+			                                shader_data,
+			                                per_sample_output_buffers,
+			                                rng_state,
+			                                rng_coop,
+			                                throughput_coop,
+			                                PathRadiance_coop,
+			                                Ray_coop,
+			                                PathState_coop,
+			                                L_transparent_coop,
+			                                ray_state,
+			                                sw, sh, sx, sy, stride,
+			                                rng_state_offset_x,
+			                                rng_state_offset_y,
+			                                rng_state_stride,
+			                                work_array,
+			                                end_sample,
+			                                start_sample,
+#ifdef __WORK_STEALING__
+			                                work_pool_wgs,
+			                                num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+			                                debugdata_coop,
+#endif
+			                                parallel_samples,
+			                                ray_index);
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	 * these rays will be made active during the next SceneIntersect kernel.
+	 */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
new file mode 100644
index 00000000000..c3277676029
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
@@ -0,0 +1,241 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_data_init.h"
+
+__kernel void kernel_ocl_path_trace_data_init(  /* Thin wrapper: forwards every argument to kernel_data_init(). */
+        ccl_global char *globals,
+        ccl_global char *shader_data_sd,                  /* Arguments related to ShaderData */
+        ccl_global char *shader_data_sd_DL_shadow,        /* Arguments related to ShaderData (the "_DL_shadow" counterpart) */
+
+        ccl_global float3 *P_sd,                          /* From here on each array comes as a "_sd" / "_sd_DL_shadow" pair. */
+        ccl_global float3 *P_sd_DL_shadow,
+
+        ccl_global float3 *N_sd,
+        ccl_global float3 *N_sd_DL_shadow,
+
+        ccl_global float3 *Ng_sd,
+        ccl_global float3 *Ng_sd_DL_shadow,
+
+        ccl_global float3 *I_sd,
+        ccl_global float3 *I_sd_DL_shadow,
+
+        ccl_global int *shader_sd,
+        ccl_global int *shader_sd_DL_shadow,
+
+        ccl_global int *flag_sd,
+        ccl_global int *flag_sd_DL_shadow,
+
+        ccl_global int *prim_sd,
+        ccl_global int *prim_sd_DL_shadow,
+
+        ccl_global int *type_sd,
+        ccl_global int *type_sd_DL_shadow,
+
+        ccl_global float *u_sd,
+        ccl_global float *u_sd_DL_shadow,
+
+        ccl_global float *v_sd,
+        ccl_global float *v_sd_DL_shadow,
+
+        ccl_global int *object_sd,
+        ccl_global int *object_sd_DL_shadow,
+
+        ccl_global float *time_sd,
+        ccl_global float *time_sd_DL_shadow,
+
+        ccl_global float *ray_length_sd,
+        ccl_global float *ray_length_sd_DL_shadow,
+
+        ccl_global int *ray_depth_sd,
+        ccl_global int *ray_depth_sd_DL_shadow,
+
+        ccl_global int *transparent_depth_sd,
+        ccl_global int *transparent_depth_sd_DL_shadow,
+
+        /* Ray differentials. */
+        ccl_global differential3 *dP_sd,
+        ccl_global differential3 *dP_sd_DL_shadow,
+
+        ccl_global differential3 *dI_sd,
+        ccl_global differential3 *dI_sd_DL_shadow,
+
+        ccl_global differential *du_sd,
+        ccl_global differential *du_sd_DL_shadow,
+
+        ccl_global differential *dv_sd,
+        ccl_global differential *dv_sd_DL_shadow,
+
+        /* Dp/Du */
+        ccl_global float3 *dPdu_sd,
+        ccl_global float3 *dPdu_sd_DL_shadow,
+
+        ccl_global float3 *dPdv_sd,
+        ccl_global float3 *dPdv_sd_DL_shadow,
+
+        /* Object motion. */
+        ccl_global Transform *ob_tfm_sd,
+        ccl_global Transform *ob_tfm_sd_DL_shadow,
+
+        ccl_global Transform *ob_itfm_sd,
+        ccl_global Transform *ob_itfm_sd_DL_shadow,
+
+        ShaderClosure *closure_sd,                   /* NOTE(review): no explicit address-space qualifier, unlike neighbouring pointers - confirm the type supplies it */
+        ShaderClosure *closure_sd_DL_shadow,
+
+        ccl_global int *num_closure_sd,
+        ccl_global int *num_closure_sd_DL_shadow,
+
+        ccl_global float *randb_closure_sd,
+        ccl_global float *randb_closure_sd_DL_shadow,
+
+        ccl_global float3 *ray_P_sd,
+        ccl_global float3 *ray_P_sd_DL_shadow,
+
+        ccl_global differential3 *ray_dP_sd,
+        ccl_global differential3 *ray_dP_sd_DL_shadow,
+
+        ccl_constant KernelData *data,
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_state,
+        ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
+        ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
+        ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
+        PathRadiance *PathRadiance_coop,             /* PathRadiance array to store PathRadiance values for all rays */
+        ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
+        ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
+        ccl_global char *ray_state,                  /* Stores information on current state of a ray */
+
+#define KERNEL_TEX(type, ttype, name)                                   \
+        ccl_global type *name,
+#include "../../kernel_textures.h"
+
+        int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
+        int rng_state_offset_x,
+        int rng_state_offset_y,
+        int rng_state_stride,
+        ccl_global int *Queue_data,                  /* Memory for queues */
+        ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
+        int queuesize,                               /* size (capacity) of the queue */
+        ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
+        ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
+#ifdef __WORK_STEALING__
+        ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
+        unsigned int num_samples,                    /* Total number of samples per pixel */
+#endif
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples)                        /* Number of samples to be processed in parallel */
+{
+	kernel_data_init(globals,  /* Arguments are forwarded unchanged, in declaration order. */
+	                 shader_data_sd,
+	                 shader_data_sd_DL_shadow,
+	                 P_sd,
+	                 P_sd_DL_shadow,
+	                 N_sd,
+	                 N_sd_DL_shadow,
+	                 Ng_sd,
+	                 Ng_sd_DL_shadow,
+	                 I_sd,
+	                 I_sd_DL_shadow,
+	                 shader_sd,
+	                 shader_sd_DL_shadow,
+	                 flag_sd,
+	                 flag_sd_DL_shadow,
+	                 prim_sd,
+	                 prim_sd_DL_shadow,
+	                 type_sd,
+	                 type_sd_DL_shadow,
+	                 u_sd,
+	                 u_sd_DL_shadow,
+	                 v_sd,
+	                 v_sd_DL_shadow,
+	                 object_sd,
+	                 object_sd_DL_shadow,
+	                 time_sd,
+	                 time_sd_DL_shadow,
+	                 ray_length_sd,
+	                 ray_length_sd_DL_shadow,
+	                 ray_depth_sd,
+	                 ray_depth_sd_DL_shadow,
+	                 transparent_depth_sd,
+	                 transparent_depth_sd_DL_shadow,
+
+	                 /* Ray differentials. */
+	                 dP_sd,
+	                 dP_sd_DL_shadow,
+	                 dI_sd,
+	                 dI_sd_DL_shadow,
+	                 du_sd,
+	                 du_sd_DL_shadow,
+	                 dv_sd,
+	                 dv_sd_DL_shadow,
+
+	                 /* Dp/Du */
+	                 dPdu_sd,
+	                 dPdu_sd_DL_shadow,
+	                 dPdv_sd,
+	                 dPdv_sd_DL_shadow,
+
+	                 /* Object motion. */
+	                 ob_tfm_sd,
+	                 ob_tfm_sd_DL_shadow,
+	                 ob_itfm_sd,
+	                 ob_itfm_sd_DL_shadow,
+
+	                 closure_sd,
+	                 closure_sd_DL_shadow,
+	                 num_closure_sd,
+	                 num_closure_sd_DL_shadow,
+	                 randb_closure_sd,
+	                 randb_closure_sd_DL_shadow,
+	                 ray_P_sd,
+	                 ray_P_sd_DL_shadow,
+	                 ray_dP_sd,
+	                 ray_dP_sd_DL_shadow,
+	                 data,
+	                 per_sample_output_buffers,
+	                 rng_state,
+	                 rng_coop,
+	                 throughput_coop,
+	                 L_transparent_coop,
+	                 PathRadiance_coop,
+	                 Ray_coop,
+	                 PathState_coop,
+	                 ray_state,
+
+#define KERNEL_TEX(type, ttype, name) name,
+#include "../../kernel_textures.h"
+
+	                 start_sample, sx, sy, sw, sh, offset, stride,
+	                 rng_state_offset_x,
+	                 rng_state_offset_y,
+	                 rng_state_stride,
+	                 Queue_data,
+	                 Queue_index,
+	                 queuesize,
+	                 use_queues_flag,
+	                 work_array,
+#ifdef __WORK_STEALING__
+	                 work_pool_wgs,
+	                 num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+	                 debugdata_coop,
+#endif
+	                 parallel_samples);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
new file mode 100644
index 00000000000..6ec75013b3a
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_direct_lighting.h"
+
+__kernel void kernel_ocl_path_trace_direct_lighting(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,           /* Required for direct lighting */
+        ccl_global char *shader_DL,             /* Required for direct lighting */
+        ccl_global uint *rng_coop,              /* Required for direct lighting */
+        ccl_global PathState *PathState_coop,   /* Required for direct lighting */
+        ccl_global int *ISLamp_coop,            /* Required for direct lighting */
+        ccl_global Ray *LightRay_coop,          /* Required for direct lighting */
+        ccl_global BsdfEval *BSDFEval_coop,     /* Required for direct lighting */
+        ccl_global char *ray_state,             /* Denotes the state of each ray */
+        ccl_global int *Queue_data,             /* Queue memory */
+        ccl_global int *Queue_index,            /* Tracks the number of elements in each queue */
+        int queuesize)                          /* Size (capacity) of each queue */
+{  /* Entry point: fetch a ray index from the queue, run kernel_direct_lighting(), enqueue the result. */
+	ccl_local unsigned int local_queue_atomics;  /* Passed to enqueue_ray_index_local() below. */
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;  /* Zeroed by local thread (0, 0) of each work-group. */
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	char enqueue_flag = 0;  /* Stays 0 for threads that never call kernel_direct_lighting(). */
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);  /* Flattened 2D global id. */
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif  /* __COMPUTE_DEVICE_GPU__ */
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {  /* Non-GPU build: skip the work but keep the thread running. */
+#endif
+		enqueue_flag = kernel_direct_lighting(globals,
+		                                      data,
+		                                      shader_data,
+		                                      shader_DL,
+		                                      rng_coop,
+		                                      PathState_coop,
+		                                      ISLamp_coop,
+		                                      LightRay_coop,
+		                                      BSDFEval_coop,
+		                                      ray_state,
+		                                      ray_index);
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+#ifdef __EMISSION__
+	/* Enqueue RAY_SHADOW_RAY_CAST_DL rays (only compiled when __EMISSION__ is defined). */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
+#endif  /* __EMISSION__ */
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
new file mode 100644
index 00000000000..ae5f5cd1b3b
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+
+__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Required throughout the kernel except probabilistic path termination and AO */
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_coop,             /* Required for "kernel_write_data_passes" and AO */
+        ccl_global float3 *throughput_coop,    /* Required for handling holdout material and AO */
+        ccl_global float *L_transparent_coop,  /* Required for handling holdout material */
+        PathRadiance *PathRadiance_coop,       /* Required for "kernel_write_data_passes" and indirect primitive emission */
+        ccl_global PathState *PathState_coop,  /* Required throughout the kernel and AO */
+        Intersection *Intersection_coop,       /* Required for indirect primitive emission */
+        ccl_global float3 *AOAlpha_coop,       /* Required for AO */
+        ccl_global float3 *AOBSDF_coop,        /* Required for AO */
+        ccl_global Ray *AOLightRay_coop,       /* Required for AO */
+        int sw, int sh, int sx, int sy, int stride,
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        ccl_global unsigned int *work_array,   /* Denotes the work that each ray belongs to */
+        ccl_global int *Queue_data,            /* Queue memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize,                         /* Size (capacity) of each queue */
+#ifdef __WORK_STEALING__
+        unsigned int start_sample,
+#endif
+        int parallel_samples)                  /* Number of samples to be processed in parallel */
+{  /* Entry point: fetch a ray index, run the holdout/emission/AO step, then enqueue into up to two queues. */
+	ccl_local unsigned int local_queue_atomics_bg;  /* Used for the QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS enqueue. */
+	ccl_local unsigned int local_queue_atomics_ao;  /* Used for the QUEUE_SHADOW_RAY_CAST_AO_RAYS enqueue. */
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics_bg = 0;
+		local_queue_atomics_ao = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	char enqueue_flag = 0;  /* Written through a pointer by the call below; stays 0 if the call is skipped. */
+	char enqueue_flag_AO_SHADOW_RAY_CAST = 0;  /* Likewise; consumed by the __AO__ enqueue. */
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);  /* Flattened 2D global id. */
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif  /* __COMPUTE_DEVICE_GPU__ */
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {  /* Non-GPU build: skip the work but keep the thread running. */
+#endif
+		kernel_holdout_emission_blurring_pathtermination_ao(
+		        globals,
+		        data,
+		        shader_data,
+		        per_sample_output_buffers,
+		        rng_coop,
+		        throughput_coop,
+		        L_transparent_coop,
+		        PathRadiance_coop,
+		        PathState_coop,
+		        Intersection_coop,
+		        AOAlpha_coop,
+		        AOBSDF_coop,
+		        AOLightRay_coop,
+		        sw, sh, sx, sy, stride,
+		        ray_state,
+		        work_array,
+#ifdef __WORK_STEALING__
+		        start_sample,
+#endif
+		        parallel_samples,
+		        ray_index,
+		        &enqueue_flag,
+		        &enqueue_flag_AO_SHADOW_RAY_CAST);
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_UPDATE_BUFFER rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics_bg,
+	                        Queue_data,
+	                        Queue_index);
+
+#ifdef __AO__
+	/* Enqueue to-shadow-ray-cast rays (only compiled when __AO__ is defined). */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SHADOW_RAY_CAST_AO_RAYS,
+	                        enqueue_flag_AO_SHADOW_RAY_CAST,
+	                        queuesize,
+	                        &local_queue_atomics_ao,
+	                        Queue_data,
+	                        Queue_index);
+#endif  /* __AO__ */
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
new file mode 100644
index 00000000000..1bc7808d834
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_lamp_emission.h"
+
+__kernel void kernel_ocl_path_trace_lamp_emission(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Forwarded to kernel_lamp_emission() */
+        ccl_global float3 *throughput_coop,    /* Forwarded to kernel_lamp_emission() */
+        PathRadiance *PathRadiance_coop,       /* Forwarded to kernel_lamp_emission() */
+        ccl_global Ray *Ray_coop,              /* Forwarded to kernel_lamp_emission() */
+        ccl_global PathState *PathState_coop,  /* Forwarded to kernel_lamp_emission() */
+        Intersection *Intersection_coop,       /* Forwarded to kernel_lamp_emission() */
+        ccl_global char *ray_state,            /* State of each ray */
+        int sw, int sh,
+        ccl_global int *Queue_data,            /* Backing memory of the queues */
+        ccl_global int *Queue_index,           /* Number of elements held by each queue */
+        int queuesize,                         /* Capacity of each queue */
+        ccl_global char *use_queues_flag,      /* Non-zero: the ray index is fetched from
+                                                * the queue; zero: it is derived from the
+                                                * 2D global id */
+        int parallel_samples)                  /* Number of samples processed in parallel */
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+
+	/* This kernel empties the queue, so one thread resets its element count. */
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+	}
+	/* Read use_queues_flag once per work-group and share it through local memory. */
+	ccl_local char local_use_queues_flag;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_use_queues_flag = use_queues_flag[0];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index;
+	if(!local_use_queues_flag) {
+		/* Direct mapping of (x, y) onto a (sw * parallel_samples) by sh grid. */
+		const int row_width = sw * parallel_samples;
+		if(x >= row_width || y >= sh) {
+			return;
+		}
+		ray_index = x + y * row_width;
+	} else {
+		const int linear_id = get_global_id(1) * get_global_size(0) + get_global_id(0);
+		ray_index = get_ray_index(linear_id,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          Queue_data, queuesize, 1);
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	}
+
+	kernel_lamp_emission(
+	        globals,
+	        data,
+	        shader_data,
+	        throughput_coop,
+	        PathRadiance_coop,
+	        Ray_coop,
+	        PathState_coop,
+	        Intersection_coop,
+	        ray_state,
+	        sw, sh,
+	        use_queues_flag,
+	        parallel_samples,
+	        ray_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
new file mode 100644
index 00000000000..dcf4db40411
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
@@ -0,0 +1,115 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_next_iteration_setup.h"
+
+__kernel void kernel_ocl_path_trace_next_iteration_setup(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,         /* Required for setting up ray for next iteration */
+        ccl_global uint *rng_coop,            /* Required for setting up ray for next iteration */
+        ccl_global float3 *throughput_coop,   /* Required for setting up ray for next iteration */
+        PathRadiance *PathRadiance_coop,      /* Required for setting up ray for next iteration */
+        ccl_global Ray *Ray_coop,             /* Required for setting up ray for next iteration */
+        ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
+        ccl_global Ray *LightRay_dl_coop,     /* Required for radiance update - direct lighting */
+        ccl_global int *ISLamp_coop,          /* Required for radiance update - direct lighting */
+        ccl_global BsdfEval *BSDFEval_coop,   /* Required for radiance update - direct lighting */
+        ccl_global Ray *LightRay_ao_coop,     /* Required for radiance update - AO */
+        ccl_global float3 *AOBSDF_coop,       /* Required for radiance update - AO */
+        ccl_global float3 *AOAlpha_coop,      /* Required for radiance update - AO */
+        ccl_global char *ray_state,           /* Denotes the state of each ray */
+        ccl_global int *Queue_data,           /* Queue memory */
+        ccl_global int *Queue_index,          /* Tracks the number of elements in each queue */
+        int queuesize,                        /* Size (capacity) of each queue */
+        ccl_global char *use_queues_flag)     /* Flag to decide if scene_intersect kernel should
+                                               * use queues to fetch ray index */
+{  /* Entry point: reset shadow queues, run kernel_next_iteration_setup(), enqueue flagged rays. */
+	ccl_local unsigned int local_queue_atomics;  /* Passed to enqueue_ray_index_local() below. */
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		/* If we are here, then it means that scene-intersect kernel
+		 * has already been executed at least once. From the next time,
+		 * scene-intersect kernel may operate on queues to fetch ray index.
+		 */
+		use_queues_flag[0] = 1;
+
+		/* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and
+		 * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the
+		 * previous kernel.
+		 */
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+	}
+
+	char enqueue_flag = 0;  /* Stays 0 for threads that never call kernel_next_iteration_setup(). */
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);  /* Flattened 2D global id. */
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif  /* __COMPUTE_DEVICE_GPU__ */
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {  /* Non-GPU build: skip the work but keep the thread running. */
+#endif
+		enqueue_flag = kernel_next_iteration_setup(globals,
+		                                           data,
+		                                           shader_data,
+		                                           rng_coop,
+		                                           throughput_coop,
+		                                           PathRadiance_coop,
+		                                           Ray_coop,
+		                                           PathState_coop,
+		                                           LightRay_dl_coop,
+		                                           ISLamp_coop,
+		                                           BSDFEval_coop,
+		                                           LightRay_ao_coop,
+		                                           AOBSDF_coop,
+		                                           AOAlpha_coop,
+		                                           ray_state,
+		                                           use_queues_flag,
+		                                           ray_index);
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_UPDATE_BUFFER rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
new file mode 100644
index 00000000000..3156dc255fb
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../../kernel_compat_opencl.h"
+#include "../../kernel_math.h"
+#include "../../kernel_types.h"
+#include "../../kernel_globals.h"
+#include "../../kernel_queues.h"
+
+/*
+ * The kernel "kernel_queue_enqueue" enqueues rays of
+ * different ray state into their appropriate Queues;
+ * 1. Rays that have been determined to hit the background from the
+ * "kernel_scene_intersect" kernel
+ * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+ * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * The input and output of the kernel is as follows,
+ *
+ * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                           |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                           |
+ * queuesize -------------------------------------------|                           |
+ *
+ * Note on Queues :
+ * State of queues during the first time this kernel is called :
+ * At entry,
+ * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays.
+ *
+ * State of queue during other times this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
+ */
+__kernel void kernel_ocl_path_trace_queue_enqueue(
+        ccl_global int *Queue_data,   /* Queue memory */
+        ccl_global int *Queue_index,  /* Tracks the number of elements in each queue */
+        ccl_global char *ray_state,   /* Denotes the state of each ray */
+        int queuesize)                /* Size (capacity) of each queue */
+{
+	/* One counter per destination queue: we have only 2 cases (Hit/Not-Hit). NOTE(review): indexed by queue id below, so presumably both ids are < 2 - confirm. */
+	ccl_local unsigned int local_queue_atomics[2];
+
+	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);  /* Flattened local id. */
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);  /* Flattened global id, used directly as the ray index. */
+
+	if(lidx < 2 ) {  /* The first two work-items of the work-group zero one counter each. */
+		local_queue_atomics[lidx] = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int queue_number = -1;  /* -1: this ray is not enqueued by this kernel. */
+
+	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+		queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+	}
+	else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	}
+
+	unsigned int my_lqidx;  /* Only assigned, and only read, when queue_number != -1. */
+	if(queue_number != -1) {
+		my_lqidx = get_local_queue_index(queue_number, local_queue_atomics);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(lidx == 0) {  /* One work-item overwrites both local counters with the values returned by get_global_per_queue_offset(). */
+		local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] =
+		        get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                                    local_queue_atomics,
+		                                    Queue_index);
+		local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] =
+		        get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+		                                    local_queue_atomics,
+		                                    Queue_index);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	unsigned int my_gqidx;  /* Only assigned, and only read, when queue_number != -1. */
+	if(queue_number != -1) {
+		my_gqidx = get_global_queue_index(queue_number,
+		                                  queuesize,
+		                                  my_lqidx,
+		                                  local_queue_atomics);
+		Queue_data[my_gqidx] = ray_index;  /* Store this ray in the slot computed for it. */
+	}
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
new file mode 100644
index 00000000000..e5fad7bce50
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_scene_intersect.h"
+
+__kernel void kernel_ocl_path_trace_scene_intersect(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global uint *rng_coop,
+        ccl_global Ray *Ray_coop,              /* Forwarded to kernel_scene_intersect() */
+        ccl_global PathState *PathState_coop,  /* Forwarded to kernel_scene_intersect() */
+        Intersection *Intersection_coop,       /* Forwarded to kernel_scene_intersect() */
+        ccl_global char *ray_state,            /* State of each ray */
+        int sw, int sh,
+        ccl_global int *Queue_data,            /* Backing memory of the queues */
+        ccl_global int *Queue_index,           /* Number of elements held by each queue */
+        int queuesize,                         /* Capacity of each queue */
+        ccl_global char *use_queues_flag,      /* Non-zero: the ray index is fetched from the
+                                                * queue; zero: derived from the 2D global id */
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples)                  /* Number of samples processed in parallel */
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+
+	/* Read use_queues_flag once per work-group and share it through local memory. */
+	ccl_local char local_use_queues_flag;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_use_queues_flag = use_queues_flag[0];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index;
+	if(!local_use_queues_flag) {
+		/* Direct mapping of (x, y) onto a (sw * parallel_samples) by sh grid. */
+		const int row_width = sw * parallel_samples;
+		if(x >= row_width || y >= sh) {
+			return;
+		}
+		ray_index = x + y * row_width;
+	} else {
+		/* Queue mapping: the flattened global id selects the queue slot to read. */
+		const int linear_id = get_global_id(1) * get_global_size(0) + get_global_id(0);
+		ray_index = get_ray_index(linear_id,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          Queue_data, queuesize, 0);
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	}
+
+	kernel_scene_intersect(
+	        globals,
+	        data,
+	        rng_coop,
+	        Ray_coop,
+	        PathState_coop,
+	        Intersection_coop,
+	        ray_state,
+	        sw, sh,
+	        use_queues_flag,
+#ifdef __KERNEL_DEBUG__
+	        debugdata_coop,
+#endif
+	        parallel_samples,
+	        ray_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
new file mode 100644
index 00000000000..b9f616e6bdf
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_shader_eval.h"
+
+/* OpenCL entry point for the shader evaluation stage.
+ *
+ * Each work-item fetches a ray index from QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ * enqueues it into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS when the ray state is
+ * RAY_TO_REGENERATE, and then calls kernel_shader_eval() for that ray.
+ */
+__kernel void kernel_ocl_path_trace_shader_eval(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Output ShaderData structure to be filled */
+        ccl_global uint *rng_coop,             /* Required for rbsdf calculation */
+        ccl_global Ray *Ray_coop,              /* Required for setting up shader from ray */
+        ccl_global PathState *PathState_coop,  /* Required for all functions in this kernel */
+        Intersection *Intersection_coop,       /* Required for setting up shader from ray */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        ccl_global int *Queue_data,            /* queue memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize)                         /* Size (capacity) of each queue */
+{
+	/* Enqueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif  /* __COMPUTE_DEVICE_GPU__ */
+
+	/* Work-items without a ray (possible on non-GPU builds only) must not be
+	 * used to look up ray_state, so their flag stays 0; they still take part
+	 * in the enqueue call, as in the other split kernels.
+	 */
+	char enqueue_flag = 0;
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+		enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
+	}
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
+
+	/* Continue on with shader evaluation, only for work-items that own a ray. */
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+		kernel_shader_eval(globals,
+		                   data,
+		                   shader_data,
+		                   rng_coop,
+		                   Ray_coop,
+		                   PathState_coop,
+		                   Intersection_coop,
+		                   ray_state,
+		                   ray_index);
+	}
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
new file mode 100644
index 00000000000..03886c0a030
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_shadow_blocked.h"
+
+__kernel void kernel_ocl_path_trace_shadow_blocked(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_shadow,        /* Required for shadow blocked */
+        ccl_global PathState *PathState_coop,  /* Required for shadow blocked */
+        ccl_global Ray *LightRay_dl_coop,      /* Required for direct lighting's shadow blocked */
+        ccl_global Ray *LightRay_ao_coop,      /* Required for AO's shadow blocked */
+        Intersection *Intersection_coop_AO,    /* Forwarded to kernel_shadow_blocked() */
+        Intersection *Intersection_coop_DL,    /* Forwarded to kernel_shadow_blocked() */
+        ccl_global char *ray_state,            /* Forwarded to kernel_shadow_blocked() */
+        ccl_global int *Queue_data,            /* Queue memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize,                         /* Size (capacity) of each queue */
+        int total_num_rays)                    /* Forwarded to kernel_shadow_blocked() */
+{
+#if 0
+	/* We will make the Queue_index entries '0' in the next kernel. */
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		/* We empty this queue here */
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+	}
+#endif
+	/* Flattened index of this work-item inside its work-group: y * local_size_x + x. */
+	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+
+	ccl_local unsigned int ao_queue_length;
+	ccl_local unsigned int dl_queue_length;
+	if(lidx == 0) {  /* Exactly one work-item per group loads both queue lengths. */
+		ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
+		dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);  /* Make the lengths visible to the whole work-group. */
+
+	/* flag determining if the current ray is to process shadow ray for AO or DL */
+	char shadow_blocked_type = -1;
+
+	int ray_index = QUEUE_EMPTY_SLOT;
+	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	if(thread_index < ao_queue_length + dl_queue_length) {
+		if(thread_index < ao_queue_length) {  /* First ao_queue_length threads take AO rays. */
+			ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1);
+			shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO;
+		} else {  /* Remaining threads index into the DL queue. */
+			ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1);
+			shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL;
+		}
+	}
+
+	if(ray_index == QUEUE_EMPTY_SLOT)  /* Nothing queued for this thread. */
+		return;
+
+	kernel_shadow_blocked(globals,
+	                      data,
+	                      shader_shadow,
+	                      PathState_coop,
+	                      LightRay_dl_coop,
+	                      LightRay_ao_coop,
+	                      Intersection_coop_AO,
+	                      Intersection_coop_DL,
+	                      ray_state,
+	                      total_num_rays,
+	                      shadow_blocked_type,
+	                      ray_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
new file mode 100644
index 00000000000..88a1ed830af
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_sum_all_radiance.h"
+
+__kernel void kernel_ocl_path_trace_sum_all_radiance(
+        ccl_constant KernelData *data,               /* To get pass_stride to offset into buffer */
+        ccl_global float *buffer,                    /* RenderTile output buffer */
+        ccl_global float *per_sample_output_buffer,  /* Radiance contributed by all samples */
+        int parallel_samples,
+        int sw, int sh, int stride,
+        int buffer_offset_x, int buffer_offset_y,
+        int buffer_stride,
+        int start_sample)
+{
+	/* OpenCL entry point: hands every argument, unchanged, to the shared implementation. */
+	kernel_sum_all_radiance(data,
+	                        buffer,
+	                        per_sample_output_buffer,
+	                        parallel_samples,
+	                        sw, sh, stride,
+	                        buffer_offset_x, buffer_offset_y,
+	                        buffer_stride,
+	                        start_sample);
+}
diff --git a/intern/cycles/kernel/osl/SConscript b/intern/cycles/kernel/osl/SConscript
index 4685bb7753e..74ba5e1020c 100644
--- a/intern/cycles/kernel/osl/SConscript
+++ b/intern/cycles/kernel/osl/SConscript
@@ -38,11 +38,38 @@ incs.append(env['BF_OIIO_INC'])
 incs.append(env['BF_BOOST_INC'])
 incs.append(env['BF_OSL_INC'])
 incs.append(env['BF_OPENEXR_INC'].split())
+incs.append('#/intern/atomic')
 
 defs.append('CCL_NAMESPACE_BEGIN=namespace ccl {')
 defs.append('CCL_NAMESPACE_END=}')
 defs.append('WITH_OSL')
 
+if env['WITH_UNORDERED_MAP_SUPPORT']:  # Select the define matching the detected header/namespace pair.
+    if env['UNORDERED_MAP_HEADER'] == 'unordered_map':
+        if env['UNORDERED_MAP_NAMESPACE'] == 'std':
+            defs.append('CYCLES_STD_UNORDERED_MAP')
+        elif env['UNORDERED_MAP_NAMESPACE'] == 'std::tr1':
+            defs.append('CYCLES_STD_UNORDERED_MAP_IN_TR1_NAMESPACE')
+    elif env['UNORDERED_MAP_NAMESPACE'] == 'std::tr1':  # Header is not 'unordered_map' but namespace is tr1.
+        defs.append('CYCLES_TR1_UNORDERED_MAP')
+else:  # No unordered_map support reported by the environment.
+    print("-- Replacing unordered_map/set with map/set (warning: slower!)")
+    defs.append('CYCLES_NO_UNORDERED_MAP')
+
+if env['WITH_BF_CYCLES_DEBUG']:  # Map the build option onto the preprocessor define.
+    defs.append('WITH_CYCLES_DEBUG')
+
+if env['WITH_BF_CYCLES_LOGGING']:  # Logging adds defines plus the glog/gflags include paths.
+    defs.append('WITH_CYCLES_LOGGING')
+    defs.append('GOOGLE_GLOG_DLL_DECL=')  # Defined with an empty value.
+    defs.append('CYCLES_GFLAGS_NAMESPACE=gflags')
+    if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', 'win64-mingw'):  # These platforms use glog's src/windows directory.
+        incs.append('#extern/libmv/third_party/glog/src/windows')
+        incs.append('#extern/libmv/third_party/gflags')
+    else:  # Every other platform uses the plain glog src directory.
+        incs.append('#extern/libmv/third_party/glog/src')
+        incs.append('#extern/libmv/third_party/gflags')
+
 if env['OURPLATFORM'] in ('win32-vc', 'win64-vc'):
     cxxflags.append('-DBOOST_NO_RTTI -DBOOST_NO_TYPEID /fp:fast'.split())
     incs.append(env['BF_PTHREADS_INC'])
diff --git a/intern/cycles/kernel/osl/background.cpp b/intern/cycles/kernel/osl/background.cpp
index 2facced0914..4d70bc80006 100644
--- a/intern/cycles/kernel/osl/background.cpp
+++ b/intern/cycles/kernel/osl/background.cpp
@@ -77,7 +77,7 @@ public:
 ClosureParam *closure_background_params()
 {
 	static ClosureParam params[] = {
-	    CLOSURE_STRING_KEYPARAM("label"),
+	    CLOSURE_STRING_KEYPARAM(GenericBackgroundClosure, label, "label"),
 	    CLOSURE_FINISH_PARAM(GenericBackgroundClosure)
 	};
 	return params;
@@ -98,7 +98,7 @@ CCLOSURE_PREPARE(closure_holdout_prepare, HoldoutClosure)
 ClosureParam *closure_ambient_occlusion_params()
 {
 	static ClosureParam params[] = {
-	    CLOSURE_STRING_KEYPARAM("label"),
+	    CLOSURE_STRING_KEYPARAM(AmbientOcclusionClosure, label, "label"),
 	    CLOSURE_FINISH_PARAM(AmbientOcclusionClosure)
 	};
 	return params;
diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
index 8f9c2efd470..b3c71e4a706 100644
--- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
@@ -34,6 +34,7 @@
 
 #include <OSL/genclosure.h>
 
+#include "kernel_compat_cpu.h"
 #include "osl_closures.h"
 
 #include "kernel_types.h"
@@ -92,7 +93,7 @@ ClosureParam *closure_bsdf_diffuse_ramp_params()
 	static ClosureParam params[] = {
 		CLOSURE_FLOAT3_PARAM(DiffuseRampClosure, sc.N),
 		CLOSURE_COLOR_ARRAY_PARAM(DiffuseRampClosure, colors, 8),
-		CLOSURE_STRING_KEYPARAM("label"),
+		CLOSURE_STRING_KEYPARAM(DiffuseRampClosure, label, "label"),
 	    CLOSURE_FINISH_PARAM(DiffuseRampClosure)
 	};
 	return params;
diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
index c5851747b54..99f510d31ed 100644
--- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
@@ -34,6 +34,7 @@
 
 #include <OSL/genclosure.h>
 
+#include "kernel_compat_cpu.h"
 #include "osl_closures.h"
 
 #include "kernel_types.h"
@@ -92,7 +93,7 @@ ClosureParam *closure_bsdf_phong_ramp_params()
 		CLOSURE_FLOAT3_PARAM(PhongRampClosure, sc.N),
 		CLOSURE_FLOAT_PARAM(PhongRampClosure, sc.data0),
 		CLOSURE_COLOR_ARRAY_PARAM(PhongRampClosure, colors, 8),
-		CLOSURE_STRING_KEYPARAM("label"),
+		CLOSURE_STRING_KEYPARAM(PhongRampClosure, label, "label"),
 	    CLOSURE_FINISH_PARAM(PhongRampClosure)
 	};
 	return params;
diff --git a/intern/cycles/kernel/osl/emissive.cpp b/intern/cycles/kernel/osl/emissive.cpp
index 02935542c56..9a95fa57a81 100644
--- a/intern/cycles/kernel/osl/emissive.cpp
+++ b/intern/cycles/kernel/osl/emissive.cpp
@@ -77,7 +77,7 @@ public:
 ClosureParam *closure_emission_params()
 {
 	static ClosureParam params[] = {
-	    CLOSURE_STRING_KEYPARAM("label"),
+	    CLOSURE_STRING_KEYPARAM(GenericEmissiveClosure, label, "label"),
 	    CLOSURE_FINISH_PARAM(GenericEmissiveClosure)
 	};
 	return params;
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index 84ef85e089d..bc395922077 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -34,6 +34,7 @@
 
 #include <OSL/genclosure.h>
 
+#include "kernel_compat_cpu.h"
 #include "osl_bssrdf.h"
 #include "osl_closures.h"
 
@@ -68,7 +69,7 @@ ClosureParam *closure_bssrdf_cubic_params()
 		CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, radius),
 		CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.data1),
 		CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.T.x),
-	    CLOSURE_STRING_KEYPARAM("label"),
+	    CLOSURE_STRING_KEYPARAM(CubicBSSRDFClosure, label, "label"),
 	    CLOSURE_FINISH_PARAM(CubicBSSRDFClosure)
 	};
 	return params;
@@ -96,7 +97,7 @@ ClosureParam *closure_bssrdf_gaussian_params()
 		CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, sc.N),
 		CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, radius),
 		CLOSURE_FLOAT_PARAM(GaussianBSSRDFClosure, sc.data1),
-	    CLOSURE_STRING_KEYPARAM("label"),
+	    CLOSURE_STRING_KEYPARAM(GaussianBSSRDFClosure, label, "label"),
 	    CLOSURE_FINISH_PARAM(GaussianBSSRDFClosure)
 	};
 	return params;
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index d7789edcfff..461ce8f7598 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -54,7 +54,6 @@
 #include "closure/bsdf_refraction.h"
 #include "closure/bsdf_transparent.h"
 #include "closure/bsdf_ashikhmin_shirley.h"
-#include "closure/bsdf_westin.h"
 #include "closure/bsdf_toon.h"
 #include "closure/bsdf_hair.h"
 #include "closure/volume.h"
@@ -87,16 +86,6 @@ BSDF_CLOSURE_CLASS_BEGIN(Refraction, refraction, refraction, LABEL_SINGULAR)
 	CLOSURE_FLOAT_PARAM(RefractionClosure, sc.data0),
 BSDF_CLOSURE_CLASS_END(Refraction, refraction)
 
-BSDF_CLOSURE_CLASS_BEGIN(WestinBackscatter, westin_backscatter, westin_backscatter, LABEL_GLOSSY)
-	CLOSURE_FLOAT3_PARAM(WestinBackscatterClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(WestinBackscatterClosure, sc.data0),
-BSDF_CLOSURE_CLASS_END(WestinBackscatter, westin_backscatter)
-
-BSDF_CLOSURE_CLASS_BEGIN(WestinSheen, westin_sheen, westin_sheen, LABEL_DIFFUSE)
-	CLOSURE_FLOAT3_PARAM(WestinSheenClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(WestinSheenClosure, sc.data0),
-BSDF_CLOSURE_CLASS_END(WestinSheen, westin_sheen)
-
 BSDF_CLOSURE_CLASS_BEGIN(Transparent, transparent, transparent, LABEL_SINGULAR)
 BSDF_CLOSURE_CLASS_END(Transparent, transparent)
 
@@ -164,26 +153,16 @@ BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, hair_reflection, LABEL
 	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.N),
 	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data0),
 	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data1),
-#ifdef __HAIR__
 	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.T),
 	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data2),
-#else
-	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data1),
-#endif
 BSDF_CLOSURE_CLASS_END(HairReflection, hair_reflection)
 
 BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, hair_transmission, LABEL_GLOSSY)
 	CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, sc.N),
 	CLOSURE_FLOAT_PARAM(HairTransmissionClosure, sc.data0),
 	CLOSURE_FLOAT_PARAM(HairTransmissionClosure, sc.data1),
-#ifdef __HAIR__
 	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.T),
 	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data2),
-#else
-	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data1),
-#endif
 BSDF_CLOSURE_CLASS_END(HairTransmission, hair_transmission)
 
 VOLUME_CLOSURE_CLASS_BEGIN(VolumeHenyeyGreenstein, henyey_greenstein, LABEL_VOLUME_SCATTER)
@@ -200,11 +179,7 @@ static void register_closure(OSL::ShadingSystem *ss, const char *name, int id, O
 	/* optimization: it's possible to not use a prepare function at all and
 	 * only initialize the actual class when accessing the closure component
 	 * data, but then we need to map the id to the class somehow */
-#ifdef CLOSURE_PREPARE
-	ss->register_closure(name, id, params, prepare, NULL, NULL);
-#else
-	ss->register_closure(name, id, params, prepare, NULL);
-#endif
+	ss->register_closure(name, id, params, prepare, NULL, 16);
 }
 
 void OSLShader::register_closures(OSLShadingSystem *ss_)
@@ -244,10 +219,6 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
 		bsdf_diffuse_toon_params(), bsdf_diffuse_toon_prepare);
 	register_closure(ss, "glossy_toon", id++,
 		bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare);
-	register_closure(ss, "westin_backscatter", id++,
-		bsdf_westin_backscatter_params(), bsdf_westin_backscatter_prepare);
-	register_closure(ss, "westin_sheen", id++,
-		bsdf_westin_sheen_params(), bsdf_westin_sheen_prepare);
 
 	register_closure(ss, "emission", id++,
 		closure_emission_params(), closure_emission_prepare);
diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h
index 58d215295dc..97bd1b1ac92 100644
--- a/intern/cycles/kernel/osl/osl_closures.h
+++ b/intern/cycles/kernel/osl/osl_closures.h
@@ -48,8 +48,6 @@ OSL::ClosureParam *closure_holdout_params();
 OSL::ClosureParam *closure_ambient_occlusion_params();
 OSL::ClosureParam *closure_bsdf_diffuse_ramp_params();
 OSL::ClosureParam *closure_bsdf_phong_ramp_params();
-OSL::ClosureParam *closure_westin_backscatter_params();
-OSL::ClosureParam *closure_westin_sheen_params();
 OSL::ClosureParam *closure_bssrdf_cubic_params();
 OSL::ClosureParam *closure_bssrdf_gaussian_params();
 OSL::ClosureParam *closure_henyey_greenstein_volume_params();
@@ -60,8 +58,6 @@ void closure_holdout_prepare(OSL::RendererServices *, int id, void *data);
 void closure_ambient_occlusion_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bsdf_diffuse_ramp_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bsdf_phong_ramp_prepare(OSL::RendererServices *, int id, void *data);
-void closure_westin_backscatter_prepare(OSL::RendererServices *, int id, void *data);
-void closure_westin_sheen_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bssrdf_cubic_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bssrdf_gaussian_prepare(OSL::RendererServices *, int id, void *data);
 void closure_henyey_greenstein_volume_prepare(OSL::RendererServices *, int id, void *data);
@@ -82,6 +78,11 @@ void name(RendererServices *, int id, void *data) \
 #define TO_COLOR3(v) OSL::Color3(v.x, v.y, v.z)
 #define TO_FLOAT3(v) make_float3(v[0], v[1], v[2])
 
+#if OSL_LIBRARY_VERSION_CODE < 10700
+#  undef CLOSURE_STRING_KEYPARAM
+#  define CLOSURE_STRING_KEYPARAM(st, fld, key) { TypeDesc::TypeString, 0, key, 0 }
+#endif
+
 /* Closure */
 
 class CClosurePrimitive {
@@ -101,6 +102,10 @@ public:
 	virtual void setup() {}
 
 	Category category;
+
+#if OSL_LIBRARY_VERSION_CODE >= 10700
+	OSL::ustring label;
+#endif
 };
 
 /* BSDF */
@@ -151,14 +156,14 @@ public: \
 \
 	float3 eval_reflect(const float3 &omega_out, const float3 &omega_in, float& pdf) const \
 	{ \
-		pdf = 0; \
-		return make_float3(0, 0, 0); \
+		pdf = 0.0f; \
+		return make_float3(0.0f, 0.0f, 0.0f); \
 	} \
 \
 	float3 eval_transmit(const float3 &omega_out, const float3 &omega_in, float& pdf) const \
 	{ \
-		pdf = 0; \
-		return make_float3(0, 0, 0); \
+		pdf = 0.0f; \
+		return make_float3(0.0f, 0.0f, 0.0f); \
 	} \
 \
 	int sample(const float3 &Ng, \
@@ -179,7 +184,7 @@ static ClosureParam *bsdf_##lower##_params() \
 /* parameters */
 
 #define BSDF_CLOSURE_CLASS_END(Upper, lower) \
-		CLOSURE_STRING_KEYPARAM("label"), \
+		CLOSURE_STRING_KEYPARAM(Upper##Closure, label, "label"), \
 	    CLOSURE_FINISH_PARAM(Upper##Closure) \
 	}; \
 	return params; \
@@ -227,7 +232,7 @@ static ClosureParam *volume_##lower##_params() \
 /* parameters */
 
 #define VOLUME_CLOSURE_CLASS_END(Upper, lower) \
-		CLOSURE_STRING_KEYPARAM("label"), \
+		CLOSURE_STRING_KEYPARAM(Upper##Closure, label, "label"), \
 	    CLOSURE_FINISH_PARAM(Upper##Closure) \
 	}; \
 	return params; \
diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h
index 5a658d8244a..e349ac676b0 100644
--- a/intern/cycles/kernel/osl/osl_globals.h
+++ b/intern/cycles/kernel/osl/osl_globals.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __OSL_GLOBALS_H__
@@ -20,7 +20,6 @@
 #ifdef WITH_OSL
 
 #include <OSL/oslexec.h>
-#include <cmath>
 
 #include "util_map.h"
 #include "util_param.h"
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 1475e5a0a62..3c1955a1e1e 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -11,9 +11,18 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
+/* TODO(sergey): There is a bit of headers dependency hell going on
+ * here, so for now we just put here. In the future it might be better
+ * to have dedicated file for such tweaks.
+ */
+#if defined(__GNUC__) && defined(NDEBUG)
+#  pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#  pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+
 #include <string.h>
 
 #include "mesh.h"
@@ -130,12 +139,12 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
-	if (xform) {
+	if(xform) {
 		const ShaderData *sd = (const ShaderData *)xform;
 		KernelGlobals *kg = sd->osl_globals;
 		int object = sd->object;
 
-		if (object != OBJECT_NONE) {
+		if(object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
 			Transform tfm;
 
@@ -160,12 +169,12 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
-	if (xform) {
+	if(xform) {
 		const ShaderData *sd = (const ShaderData *)xform;
 		KernelGlobals *kg = sd->osl_globals;
 		int object = sd->object;
 
-		if (object != OBJECT_NONE) {
+		if(object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
 			Transform itfm;
 
@@ -190,27 +199,27 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 {
 	KernelGlobals *kg = kernel_globals;
 
-	if (from == u_ndc) {
+	if(from == u_ndc) {
 		Transform tfm = transform_transpose(transform_quick_inverse(kernel_data.cam.worldtondc));
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_raster) {
+	else if(from == u_raster) {
 		Transform tfm = transform_transpose(kernel_data.cam.rastertoworld);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_screen) {
+	else if(from == u_screen) {
 		Transform tfm = transform_transpose(kernel_data.cam.screentoworld);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_camera) {
+	else if(from == u_camera) {
 		Transform tfm = transform_transpose(kernel_data.cam.cameratoworld);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_world) {
+	else if(from == u_world) {
 		result.makeIdentity();
 		return true;
 	}
@@ -222,27 +231,27 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 {
 	KernelGlobals *kg = kernel_globals;
 
-	if (to == u_ndc) {
+	if(to == u_ndc) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtondc);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_raster) {
+	else if(to == u_raster) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtoraster);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_screen) {
+	else if(to == u_screen) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtoscreen);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_camera) {
+	else if(to == u_camera) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtocamera);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_world) {
+	else if(to == u_world) {
 		result.makeIdentity();
 		return true;
 	}
@@ -254,11 +263,11 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
-	if (xform) {
+	if(xform) {
 		const ShaderData *sd = (const ShaderData *)xform;
 		int object = sd->object;
 
-		if (object != OBJECT_NONE) {
+		if(object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
 			Transform tfm = sd->ob_tfm;
 #else
@@ -279,11 +288,11 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
-	if (xform) {
+	if(xform) {
 		const ShaderData *sd = (const ShaderData *)xform;
 		int object = sd->object;
 
-		if (object != OBJECT_NONE) {
+		if(object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
 			Transform tfm = sd->ob_itfm;
 #else
@@ -304,22 +313,22 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 {
 	KernelGlobals *kg = kernel_globals;
 
-	if (from == u_ndc) {
+	if(from == u_ndc) {
 		Transform tfm = transform_transpose(transform_quick_inverse(kernel_data.cam.worldtondc));
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_raster) {
+	else if(from == u_raster) {
 		Transform tfm = transform_transpose(kernel_data.cam.rastertoworld);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_screen) {
+	else if(from == u_screen) {
 		Transform tfm = transform_transpose(kernel_data.cam.screentoworld);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_camera) {
+	else if(from == u_camera) {
 		Transform tfm = transform_transpose(kernel_data.cam.cameratoworld);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
@@ -332,22 +341,22 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 {
 	KernelGlobals *kg = kernel_globals;
 	
-	if (to == u_ndc) {
+	if(to == u_ndc) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtondc);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_raster) {
+	else if(to == u_raster) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtoraster);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_screen) {
+	else if(to == u_screen) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtoscreen);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_camera) {
+	else if(to == u_camera) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtocamera);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
@@ -365,8 +374,8 @@ bool OSLRenderServices::get_array_attribute(OSL::ShaderGlobals *sg, bool derivat
 
 static bool set_attribute_float3(float3 f[3], TypeDesc type, bool derivatives, void *val)
 {
-	if (type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
-	    type == TypeDesc::TypeNormal || type == TypeDesc::TypeColor)
+	if(type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
+	   type == TypeDesc::TypeNormal || type == TypeDesc::TypeColor)
 	{
 		float *fval = (float *)val;
 
@@ -374,7 +383,7 @@ static bool set_attribute_float3(float3 f[3], TypeDesc type, bool derivatives, v
 		fval[1] = f[0].y;
 		fval[2] = f[0].z;
 
-		if (derivatives) {
+		if(derivatives) {
 			fval[3] = f[1].x;
 			fval[4] = f[1].y;
 			fval[5] = f[1].z;
@@ -390,7 +399,7 @@ static bool set_attribute_float3(float3 f[3], TypeDesc type, bool derivatives, v
 		float *fval = (float *)val;
 		fval[0] = average(f[0]);
 
-		if (derivatives) {
+		if(derivatives) {
 			fval[1] = average(f[1]);
 			fval[2] = average(f[2]);
 		}
@@ -414,15 +423,15 @@ static bool set_attribute_float3(float3 f, TypeDesc type, bool derivatives, void
 
 static bool set_attribute_float(float f[3], TypeDesc type, bool derivatives, void *val)
 {
-	if (type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
-	    type == TypeDesc::TypeNormal || type == TypeDesc::TypeColor)
+	if(type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
+	   type == TypeDesc::TypeNormal || type == TypeDesc::TypeColor)
 	{
 		float *fval = (float *)val;
 		fval[0] = f[0];
 		fval[1] = f[1];
 		fval[2] = f[2];
 
-		if (derivatives) {
+		if(derivatives) {
 			fval[3] = f[1];
 			fval[4] = f[1];
 			fval[5] = f[1];
@@ -438,7 +447,7 @@ static bool set_attribute_float(float f[3], TypeDesc type, bool derivatives, voi
 		float *fval = (float *)val;
 		fval[0] = f[0];
 
-		if (derivatives) {
+		if(derivatives) {
 			fval[1] = f[1];
 			fval[2] = f[2];
 		}
@@ -466,7 +475,7 @@ static bool set_attribute_int(int i, TypeDesc type, bool derivatives, void *val)
 		int *ival = (int *)val;
 		ival[0] = i;
 
-		if (derivatives) {
+		if(derivatives) {
 			ival[1] = 0;
 			ival[2] = 0;
 		}
@@ -483,7 +492,7 @@ static bool set_attribute_string(ustring str, TypeDesc type, bool derivatives, v
 		ustring *sval = (ustring *)val;
 		sval[0] = str;
 
-		if (derivatives) {
+		if(derivatives) {
 			sval[1] = OSLRenderServices::u_empty;
 			sval[2] = OSLRenderServices::u_empty;
 		}
@@ -513,7 +522,7 @@ static bool set_attribute_float3_3(float3 P[3], TypeDesc type, bool derivatives,
 
 		if(type.arraylen > 3)
 			memset(fval + 3*3, 0, sizeof(float)*3*(type.arraylen - 3));
-		if (derivatives)
+		if(derivatives)
 			memset(fval + type.arraylen*3, 0, sizeof(float)*2*3*type.arraylen);
 
 		return true;
@@ -536,15 +545,15 @@ static bool set_attribute_matrix(const Transform& tfm, TypeDesc type, void *val)
 static bool get_mesh_element_attribute(KernelGlobals *kg, const ShaderData *sd, const OSLGlobals::Attribute& attr,
                                const TypeDesc& type, bool derivatives, void *val)
 {
-	if (attr.type == TypeDesc::TypePoint || attr.type == TypeDesc::TypeVector ||
-	    attr.type == TypeDesc::TypeNormal || attr.type == TypeDesc::TypeColor)
+	if(attr.type == TypeDesc::TypePoint || attr.type == TypeDesc::TypeVector ||
+	   attr.type == TypeDesc::TypeNormal || attr.type == TypeDesc::TypeColor)
 	{
 		float3 fval[3];
 		fval[0] = primitive_attribute_float3(kg, sd, attr.elem, attr.offset,
 		                                     (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
 		return set_attribute_float3(fval, type, derivatives, val);
 	}
-	else if (attr.type == TypeDesc::TypeFloat) {
+	else if(attr.type == TypeDesc::TypeFloat) {
 		float fval[3];
 		fval[0] = primitive_attribute_float(kg, sd, attr.elem, attr.offset,
 		                                    (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
@@ -558,7 +567,7 @@ static bool get_mesh_element_attribute(KernelGlobals *kg, const ShaderData *sd,
 static bool get_mesh_attribute(KernelGlobals *kg, const ShaderData *sd, const OSLGlobals::Attribute& attr,
                                const TypeDesc& type, bool derivatives, void *val)
 {
-	if (attr.type == TypeDesc::TypeMatrix) {
+	if(attr.type == TypeDesc::TypeMatrix) {
 		Transform tfm = primitive_attribute_matrix(kg, sd, attr.offset);
 		return set_attribute_matrix(tfm, type, val);
 	}
@@ -572,7 +581,7 @@ static void get_object_attribute(const OSLGlobals::Attribute& attr, bool derivat
 	size_t datasize = attr.value.datasize();
 
 	memcpy(val, attr.value.data(), datasize);
-	if (derivatives)
+	if(derivatives)
 		memset((char *)val + datasize, 0, datasize * 2);
 }
 
@@ -582,80 +591,80 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD
 	/* todo: turn this into hash table? */
 
 	/* Object Attributes */
-	if (name == u_object_location) {
+	if(name == u_object_location) {
 		float3 f = object_location(kg, sd);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
-	else if (name == u_object_index) {
+	else if(name == u_object_index) {
 		float f = object_pass_id(kg, sd->object);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_geom_dupli_generated) {
+	else if(name == u_geom_dupli_generated) {
 		float3 f = object_dupli_generated(kg, sd->object);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
-	else if (name == u_geom_dupli_uv) {
+	else if(name == u_geom_dupli_uv) {
 		float3 f = object_dupli_uv(kg, sd->object);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
-	else if (name == u_material_index) {
+	else if(name == u_material_index) {
 		float f = shader_pass_id(kg, sd);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_object_random) {
+	else if(name == u_object_random) {
 		float f = object_random_number(kg, sd->object);
 		return set_attribute_float(f, type, derivatives, val);
 	}
 
 	/* Particle Attributes */
-	else if (name == u_particle_index) {
+	else if(name == u_particle_index) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float f = particle_index(kg, particle_id);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_particle_age) {
+	else if(name == u_particle_age) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float f = particle_age(kg, particle_id);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_particle_lifetime) {
+	else if(name == u_particle_lifetime) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float f = particle_lifetime(kg, particle_id);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_particle_location) {
+	else if(name == u_particle_location) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float3 f = particle_location(kg, particle_id);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
 #if 0	/* unsupported */
-	else if (name == u_particle_rotation) {
+	else if(name == u_particle_rotation) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float4 f = particle_rotation(kg, particle_id);
 		return set_attribute_float4(f, type, derivatives, val);
 	}
 #endif
-	else if (name == u_particle_size) {
+	else if(name == u_particle_size) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float f = particle_size(kg, particle_id);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_particle_velocity) {
+	else if(name == u_particle_velocity) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float3 f = particle_velocity(kg, particle_id);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
-	else if (name == u_particle_angular_velocity) {
+	else if(name == u_particle_angular_velocity) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float3 f = particle_angular_velocity(kg, particle_id);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
 	
 	/* Geometry Attributes */
-	else if (name == u_geom_numpolyvertices) {
+	else if(name == u_geom_numpolyvertices) {
 		return set_attribute_int(3, type, derivatives, val);
 	}
-	else if ((name == u_geom_trianglevertices || name == u_geom_polyvertices)
+	else if((name == u_geom_trianglevertices || name == u_geom_polyvertices)
 #ifdef __HAIR__
 		     && sd->type & PRIMITIVE_ALL_TRIANGLE)
 #else
@@ -681,21 +690,21 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD
 		ustring object_name = kg->osl->object_names[sd->object];
 		return set_attribute_string(object_name, type, derivatives, val);
 	}
-	else if (name == u_is_smooth) {
+	else if(name == u_is_smooth) {
 		float f = ((sd->shader & SHADER_SMOOTH_NORMAL) != 0);
 		return set_attribute_float(f, type, derivatives, val);
 	}
 #ifdef __HAIR__
 	/* Hair Attributes */
-	else if (name == u_is_curve) {
+	else if(name == u_is_curve) {
 		float f = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_curve_thickness) {
+	else if(name == u_curve_thickness) {
 		float f = curve_thickness(kg, sd);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_curve_tangent_normal) {
+	else if(name == u_curve_tangent_normal) {
 		float3 f = curve_tangent_normal(kg, sd);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
@@ -707,22 +716,22 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD
 bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData *sd, ustring name,
                                                  TypeDesc type, bool derivatives, void *val)
 {
-	if (name == u_path_ray_length) {
+	if(name == u_path_ray_length) {
 		/* Ray Length */
 		float f = sd->ray_length;
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_path_ray_depth) {
+	else if(name == u_path_ray_depth) {
 		/* Ray Depth */
 		int f = sd->ray_depth;
 		return set_attribute_int(f, type, derivatives, val);
 	}
-	else if (name == u_path_transparent_depth) {
+	else if(name == u_path_transparent_depth) {
 		/* Transparent Ray Depth */
 		int f = sd->transparent_depth;
 		return set_attribute_int(f, type, derivatives, val);
 	}
-	else if (name == u_ndc) {
+	else if(name == u_ndc) {
 		/* NDC coordinates with special exception for otho */
 		OSLThreadData *tdata = kg->osl_tdata;
 		OSL::ShaderGlobals *globals = &tdata->globals;
@@ -754,7 +763,7 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData *
 bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object_name,
                                       TypeDesc type, ustring name, void *val)
 {
-	if (sg->renderstate == NULL)
+	if(sg->renderstate == NULL)
 		return false;
 
 	ShaderData *sd = (ShaderData *)(sg->renderstate);
@@ -769,10 +778,10 @@ bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring
 	int object;
 
 	/* lookup of attribute on another object */
-	if (object_name != u_empty) {
+	if(object_name != u_empty) {
 		OSLGlobals::ObjectNameMap::iterator it = kg->osl->object_name_map.find(object_name);
 
-		if (it == kg->osl->object_name_map.end())
+		if(it == kg->osl->object_name_map.end())
 			return false;
 
 		object = it->second;
@@ -782,7 +791,7 @@ bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring
 		object = sd->object;
 		is_curve = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
 
-		if (object == OBJECT_NONE)
+		if(object == OBJECT_NONE)
 			return get_background_attribute(kg, sd, name, type, derivatives, val);
 	}
 
@@ -791,10 +800,10 @@ bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring
 	OSLGlobals::AttributeMap& attribute_map = kg->osl->attribute_map[object];
 	OSLGlobals::AttributeMap::iterator it = attribute_map.find(name);
 
-	if (it != attribute_map.end()) {
+	if(it != attribute_map.end()) {
 		const OSLGlobals::Attribute& attr = it->second;
 
-		if (attr.elem != ATTR_ELEMENT_OBJECT) {
+		if(attr.elem != ATTR_ELEMENT_OBJECT) {
 			/* triangle and vertex attributes */
 			if(get_mesh_element_attribute(kg, sd, attr, type, derivatives, val))
 				return true;
@@ -811,7 +820,7 @@ bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring
 		/* not found in attribute, check standard object info */
 		bool is_std_object_attribute = get_object_standard_attribute(kg, sd, name, type, derivatives, val);
 
-		if (is_std_object_attribute)
+		if(is_std_object_attribute)
 			return true;
 
 		return get_background_attribute(kg, sd, name, type, derivatives, val);
@@ -834,7 +843,7 @@ bool OSLRenderServices::has_userdata(ustring name, TypeDesc type, OSL::ShaderGlo
 bool OSLRenderServices::texture(ustring filename, TextureOpt &options,
                                 OSL::ShaderGlobals *sg,
                                 float s, float t, float dsdx, float dtdx,
-                                float dsdy, float dtdy, float *result)
+                                float dsdy, float dtdy, int nchannels, float *result)
 {
 	OSL::TextureSystem *ts = osl_ts;
 	ShaderData *sd = (ShaderData *)(sg->renderstate);
@@ -869,9 +878,9 @@ bool OSLRenderServices::texture(ustring filename, TextureOpt &options,
 		PtexFilter::Options opts(PtexFilter::f_bicubic, mipmaplerp, sharpness);
 		PtexPtr<PtexFilter> f(PtexFilter::getFilter(r, opts));
 
-		f->eval(result, options.firstchannel, options.nchannels, faceid, u, v, dudx, dvdx, dudy, dvdy);
+		f->eval(result, options.firstchannel, nchannels, faceid, u, v, dudx, dvdx, dudy, dvdy);
 
-		for(int c = r->numChannels(); c < options.nchannels; c++)
+		for(int c = r->numChannels(); c < nchannels; c++)
 			result[c] = result[0];
 
 		return true;
@@ -879,16 +888,16 @@ bool OSLRenderServices::texture(ustring filename, TextureOpt &options,
 #endif
 	bool status;
 
-	if(filename[0] == '@' && filename.find('.') == -1) {
-        int slot = atoi(filename.c_str() + 1);
+	if(filename[0] == '@') {
+		int slot = atoi(filename.c_str() + 1);
 		float4 rgba = kernel_tex_image_interp(slot, s, 1.0f - t);
 
 		result[0] = rgba[0];
-		if(options.nchannels > 1)
+		if(nchannels > 1)
 			result[1] = rgba[1];
-		if(options.nchannels > 2)
+		if(nchannels > 2)
 			result[2] = rgba[2];
-		if(options.nchannels > 3)
+		if(nchannels > 3)
 			result[3] = rgba[3];
 		status = true;
 	}
@@ -898,17 +907,24 @@ bool OSLRenderServices::texture(ustring filename, TextureOpt &options,
 
 		OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info);
 
+#if OIIO_VERSION < 10500
 		status = ts->texture(th, thread_info,
-		                     options, s, t, dsdx, dtdx, dsdy, dtdy, result);
+		                     options, s, t, dsdx, dtdx, dsdy, dtdy,
+		                     result);
+#else
+		status = ts->texture(th, thread_info,
+		                     options, s, t, dsdx, dtdx, dsdy, dtdy,
+		                     nchannels, result);
+#endif
 	}
 
 	if(!status) {
-		if(options.nchannels == 3 || options.nchannels == 4) {
+		if(nchannels == 3 || nchannels == 4) {
 			result[0] = 1.0f;
 			result[1] = 0.0f;
 			result[2] = 1.0f;
 
-			if(options.nchannels == 4)
+			if(nchannels == 4)
 				result[3] = 1.0f;
 		}
 	}
@@ -919,26 +935,46 @@ bool OSLRenderServices::texture(ustring filename, TextureOpt &options,
 bool OSLRenderServices::texture3d(ustring filename, TextureOpt &options,
                                   OSL::ShaderGlobals *sg, const OSL::Vec3 &P,
                                   const OSL::Vec3 &dPdx, const OSL::Vec3 &dPdy,
-                                  const OSL::Vec3 &dPdz, float *result)
+                                  const OSL::Vec3 &dPdz, int nchannels, float *result)
 {
 	OSL::TextureSystem *ts = osl_ts;
 	ShaderData *sd = (ShaderData *)(sg->renderstate);
 	KernelGlobals *kg = sd->osl_globals;
-	OSLThreadData *tdata = kg->osl_tdata;
-	OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info;
-
-	OIIO::TextureSystem::TextureHandle *th =  ts->get_texture_handle(filename, thread_info);
+	bool status;
+	if(filename[0] == '@') {
+		int slot = atoi(filename.c_str() + 1);
+		float4 rgba = kernel_tex_image_interp_3d(slot, P.x, P.y, P.z);
 
-	bool status = ts->texture3d(th, thread_info,
-	                            options, P, dPdx, dPdy, dPdz, result);
+		result[0] = rgba[0];
+		if(nchannels > 1)
+			result[1] = rgba[1];
+		if(nchannels > 2)
+			result[2] = rgba[2];
+		if(nchannels > 3)
+			result[3] = rgba[3];
+		status = true;
+	}
+	else {
+		OSLThreadData *tdata = kg->osl_tdata;
+		OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info;
+		OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info);
+#if OIIO_VERSION < 10500
+		status = ts->texture3d(th, thread_info,
+		                       options, P, dPdx, dPdy, dPdz, result);
+#else
+		status = ts->texture3d(th, thread_info,
+		                       options, P, dPdx, dPdy, dPdz,
+		                       nchannels, result);
+#endif
+	}
 
 	if(!status) {
-		if(options.nchannels == 3 || options.nchannels == 4) {
+		if(nchannels == 3 || nchannels == 4) {
 			result[0] = 1.0f;
 			result[1] = 0.0f;
 			result[2] = 1.0f;
 
-			if(options.nchannels == 4)
+			if(nchannels == 4)
 				result[3] = 1.0f;
 		}
 
@@ -949,7 +985,8 @@ bool OSLRenderServices::texture3d(ustring filename, TextureOpt &options,
 
 bool OSLRenderServices::environment(ustring filename, TextureOpt &options,
                                     OSL::ShaderGlobals *sg, const OSL::Vec3 &R,
-                                    const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy, float *result)
+                                    const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy,
+                                    int nchannels, float *result)
 {
 	OSL::TextureSystem *ts = osl_ts;
 	ShaderData *sd = (ShaderData *)(sg->renderstate);
@@ -957,17 +994,24 @@ bool OSLRenderServices::environment(ustring filename, TextureOpt &options,
 	OSLThreadData *tdata = kg->osl_tdata;
 	OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info;
 
-	OIIO::TextureSystem::TextureHandle *th =  ts->get_texture_handle(filename, thread_info);
+	OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info);
+
+#if OIIO_VERSION < 10500
 	bool status = ts->environment(th, thread_info,
 	                              options, R, dRdx, dRdy, result);
+#else
+	bool status = ts->environment(th, thread_info,
+	                              options, R, dRdx, dRdy,
+	                              nchannels, result);
+#endif
 
 	if(!status) {
-		if(options.nchannels == 3 || options.nchannels == 4) {
+		if(nchannels == 3 || nchannels == 4) {
 			result[0] = 1.0f;
 			result[1] = 0.0f;
 			result[2] = 1.0f;
 
-			if(options.nchannels == 4)
+			if(nchannels == 4)
 				result[3] = 1.0f;
 		}
 	}
@@ -1018,7 +1062,7 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg,
 
 	ray.P = TO_FLOAT3(P);
 	ray.D = TO_FLOAT3(R);
-	ray.t = (options.maxdist == 1.0e30)? FLT_MAX: options.maxdist - options.mindist;
+	ray.t = (options.maxdist == 1.0e30f)? FLT_MAX: options.maxdist - options.mindist;
 	ray.time = sd->time;
 
 	if(options.mindist == 0.0f) {
diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h
index 6f928a0d103..cb6f2311ad8 100644
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __OSL_SERVICES_H__
@@ -97,16 +97,64 @@ public:
 	bool texture(ustring filename, TextureOpt &options,
 	             OSL::ShaderGlobals *sg,
 	             float s, float t, float dsdx, float dtdx,
-	             float dsdy, float dtdy, float *result);
+	             float dsdy, float dtdy, int nchannels, float *result);
 
 	bool texture3d(ustring filename, TextureOpt &options,
 	               OSL::ShaderGlobals *sg, const OSL::Vec3 &P,
 	               const OSL::Vec3 &dPdx, const OSL::Vec3 &dPdy,
-	               const OSL::Vec3 &dPdz, float *result);
+	               const OSL::Vec3 &dPdz, int nchannels, float *result);
+
+#if OSL_LIBRARY_VERSION_CODE >= 10700
+	bool texture(ustring filename,
+	             TextureHandle * /*texture_handle*/,
+	             TexturePerthread * /*texture_thread_info*/,
+	             TextureOpt &options,
+	             OSL::ShaderGlobals *sg,
+	             float s, float t,
+	             float dsdx, float dtdx, float dsdy, float dtdy,
+	             int nchannels,
+	             float *result,
+	             float * /*dresultds*/,
+	             float * /*dresultdt*/)
+	{
+		return texture(filename,
+		               options,
+		               sg,
+		               s, t,
+		               dsdx, dtdx, dsdy, dtdy,
+		               nchannels,
+		               result);
+	}
+
+	bool texture3d(ustring filename,
+	               TextureHandle * /*texture_handle*/,
+	               TexturePerthread * /*texture_thread_info*/,
+	               TextureOpt &options,
+	               OSL::ShaderGlobals *sg,
+	               const OSL::Vec3 &P,
+	               const OSL::Vec3 &dPdx,
+	               const OSL::Vec3 &dPdy,
+	               const OSL::Vec3 &dPdz,
+	               int nchannels,
+	               float *result,
+	               float * /*dresultds*/,
+	               float * /*dresultdt*/,
+	               float * /*dresultdr*/)
+	{
+		return texture3d(filename,
+		                 options,
+		                 sg,
+		                 P,
+		                 dPdx, dPdy, dPdz,
+		                 nchannels,
+		                 result);
+	}
+#endif
 
 	bool environment(ustring filename, TextureOpt &options,
 	                 OSL::ShaderGlobals *sg, const OSL::Vec3 &R,
-	                 const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy, float *result);
+	                 const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy,
+	                 int nchannels, float *result);
 
 	bool get_texture_info(OSL::ShaderGlobals *sg, ustring filename, int subimage,
 	                      ustring dataname, TypeDesc datatype, void *data);
@@ -159,70 +207,37 @@ public:
 	static ustring u_v;
 	static ustring u_empty;
 
-#if OSL_LIBRARY_VERSION_CODE < 10500
-	bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) {
-		return get_matrix(NULL, result, xform, time);
-	}
-
-	bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) {
-		return get_inverse_matrix(NULL, result, xform, time);
-	}
-
-	bool get_matrix(OSL::Matrix44 &result, ustring from, float time) {
-		return get_matrix(NULL, result, from, time);
-	}
-
-	bool get_inverse_matrix(OSL::Matrix44 &result, ustring to, float time) {
-		return get_inverse_matrix(NULL, result, to, time);
-	}
-
-	bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) {
-		return get_matrix(NULL, result, xform);
-	}
-
-	bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) {
-		return get_inverse_matrix(NULL, result, xform);
-	}
-
-	bool get_matrix(OSL::Matrix44 &result, ustring from) {
-		return get_matrix(NULL, result, from);
-	}
-
-	bool get_inverse_matrix(OSL::Matrix44 &result, ustring to) {
-		return get_inverse_matrix(NULL, result, to);
-	}
+	/* Code to make OSL versions transition smooth. */
 
-	bool get_array_attribute(void *renderstate, bool derivatives,
-	                         ustring object, TypeDesc type, ustring name,
-	                         int index, void *val) {
-		OSL::ShaderGlobals sg;
-		sg.renderstate = renderstate;
-		return get_array_attribute(&sg, derivatives,
-		                           object, type, name,
-		                           index, val);
-	}
-
-	bool get_attribute(void *renderstate, bool derivatives, ustring object_name,
-	                   TypeDesc type, ustring name, void *val) {
-		OSL::ShaderGlobals sg;
-		sg.renderstate = renderstate;
-		return get_attribute(&sg, derivatives, object_name, type, name, val);
+#if OSL_LIBRARY_VERSION_CODE < 10600
+	inline bool texture(ustring filename, TextureOpt &options,
+	                    OSL::ShaderGlobals *sg,
+	                    float s, float t, float dsdx, float dtdx,
+	                    float dsdy, float dtdy, float *result)
+	{
+		return texture(filename, options, sg, s, t, dsdx, dtdx, dsdy, dtdy,
+		               options.nchannels, result);
 	}
 
-	bool has_userdata(ustring name, TypeDesc type, void *renderstate) {
-		return has_userdata(name, type, (OSL::ShaderGlobals *) renderstate);
+	inline bool texture3d(ustring filename, TextureOpt &options,
+	                      OSL::ShaderGlobals *sg, const OSL::Vec3 &P,
+	                      const OSL::Vec3 &dPdx, const OSL::Vec3 &dPdy,
+	                      const OSL::Vec3 &dPdz, float *result)
+	{
+		return texture3d(filename, options, sg, P, dPdx, dPdy, dPdz,
+		                 options.nchannels, result);
 	}
 
-	bool get_userdata(bool derivatives, ustring name, TypeDesc type,
-	                  void *renderstate, void *val) {
-		return get_userdata(derivatives, name, type, (OSL::ShaderGlobals *) renderstate, val);
-	}
-
-	bool get_texture_info(ustring filename, int subimage,
-	                      ustring dataname, TypeDesc datatype, void *data) {
-		return get_texture_info(NULL, filename, subimage, dataname, datatype, data);
+	inline bool environment(ustring filename, TextureOpt &options,
+	                        OSL::ShaderGlobals *sg, const OSL::Vec3 &R,
+	                        const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy,
+	                        float *result)
+	{
+		return environment(filename, options, sg, R, dRdx, dRdy,
+		                   options.nchannels, result);
 	}
 #endif
+
 private:
 	KernelGlobals *kernel_globals;
 	OSL::TextureSystem *osl_ts;
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 48498116874..2f234aa25ea 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -11,9 +11,11 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
+#include <OSL/oslexec.h>
+
 #include "kernel_compat_cpu.h"
 #include "kernel_montecarlo.h"
 #include "kernel_types.h"
@@ -34,7 +36,6 @@
 
 #include "attribute.h"
 
-#include <OSL/oslexec.h>
 
 CCL_NAMESPACE_BEGIN
 
@@ -145,162 +146,175 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
 	/* OSL gives us a closure tree, we flatten it into arrays per
 	 * closure type, for evaluation, sampling, etc later on. */
 
-	if (closure->type == OSL::ClosureColor::COMPONENT) {
-		OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure;
-		CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
+#if OSL_LIBRARY_VERSION_CODE < 10700
+	switch(closure->type) {
+#else
+	switch(closure->id) {
+#endif
+		case OSL::ClosureColor::MUL: {
+			OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
+			flatten_surface_closure_tree(sd, path_flag, mul->closure, TO_FLOAT3(mul->weight) * weight);
+			break;
+		}
+		case OSL::ClosureColor::ADD: {
+			OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
+			flatten_surface_closure_tree(sd, path_flag, add->closureA, weight);
+			flatten_surface_closure_tree(sd, path_flag, add->closureB, weight);
+			break;
+		}
+		default: {
+			OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure;
+			CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
 
-		if (prim) {
-			ShaderClosure sc;
+			if(prim) {
+				ShaderClosure sc;
 
 #ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS
-			weight = weight*TO_FLOAT3(comp->w);
+				weight = weight*TO_FLOAT3(comp->w);
 #endif
-			sc.weight = weight;
+				sc.weight = weight;
 
-			prim->setup();
+				prim->setup();
 
-			switch (prim->category) {
-				case CClosurePrimitive::BSDF: {
-					CBSDFClosure *bsdf = (CBSDFClosure *)prim;
-					int scattering = bsdf->scattering();
+				switch(prim->category) {
+					case CClosurePrimitive::BSDF: {
+						CBSDFClosure *bsdf = (CBSDFClosure *)prim;
+						int scattering = bsdf->scattering();
 
-					/* caustic options */
-					if((scattering & LABEL_GLOSSY) && (path_flag & PATH_RAY_DIFFUSE)) {
-						KernelGlobals *kg = sd->osl_globals;
+						/* caustic options */
+						if((scattering & LABEL_GLOSSY) && (path_flag & PATH_RAY_DIFFUSE)) {
+							KernelGlobals *kg = sd->osl_globals;
 
-						if((!kernel_data.integrator.caustics_reflective && (scattering & LABEL_REFLECT)) ||
-						   (!kernel_data.integrator.caustics_refractive && (scattering & LABEL_TRANSMIT))) {
-							return;
+							if((!kernel_data.integrator.caustics_reflective && (scattering & LABEL_REFLECT)) ||
+							   (!kernel_data.integrator.caustics_refractive && (scattering & LABEL_TRANSMIT)))
+							{
+								return;
+							}
 						}
-					}
 
-					/* sample weight */
-					float sample_weight = fabsf(average(weight));
+						/* sample weight */
+						float sample_weight = fabsf(average(weight));
 
-					sc.sample_weight = sample_weight;
+						sc.sample_weight = sample_weight;
 
-					sc.type = bsdf->sc.type;
-					sc.N = bsdf->sc.N;
-					sc.T = bsdf->sc.T;
-					sc.data0 = bsdf->sc.data0;
-					sc.data1 = bsdf->sc.data1;
-					sc.data2 = bsdf->sc.data2;
-					sc.prim = bsdf->sc.prim;
+						sc.type = bsdf->sc.type;
+						sc.N = bsdf->sc.N;
+						sc.T = bsdf->sc.T;
+						sc.data0 = bsdf->sc.data0;
+						sc.data1 = bsdf->sc.data1;
+						sc.data2 = bsdf->sc.data2;
+						sc.prim = bsdf->sc.prim;
 
-					/* add */
-					if(sc.sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) {
-						sd->closure[sd->num_closure++] = sc;
-						sd->flag |= bsdf->shaderdata_flag();
-					}
-					break;
-				}
-				case CClosurePrimitive::Emissive: {
-					/* sample weight */
-					float sample_weight = fabsf(average(weight));
-
-					sc.sample_weight = sample_weight;
-					sc.type = CLOSURE_EMISSION_ID;
-					sc.data0 = 0.0f;
-					sc.data1 = 0.0f;
-					sc.data2 = 0.0f;
-					sc.prim = NULL;
-
-					/* flag */
-					if(sd->num_closure < MAX_CLOSURE) {
-						sd->closure[sd->num_closure++] = sc;
-						sd->flag |= SD_EMISSION;
-					}
-					break;
-				}
-				case CClosurePrimitive::AmbientOcclusion: {
-					/* sample weight */
-					float sample_weight = fabsf(average(weight));
-
-					sc.sample_weight = sample_weight;
-					sc.type = CLOSURE_AMBIENT_OCCLUSION_ID;
-					sc.data0 = 0.0f;
-					sc.data1 = 0.0f;
-					sc.data2 = 0.0f;
-					sc.prim = NULL;
-
-					if(sd->num_closure < MAX_CLOSURE) {
-						sd->closure[sd->num_closure++] = sc;
-						sd->flag |= SD_AO;
-					}
-					break;
-				}
-				case CClosurePrimitive::Holdout: {
-					sc.sample_weight = 0.0f;
-					sc.type = CLOSURE_HOLDOUT_ID;
-					sc.data0 = 0.0f;
-					sc.data1 = 0.0f;
-					sc.data2 = 0.0f;
-					sc.prim = NULL;
-
-					if(sd->num_closure < MAX_CLOSURE) {
-						sd->closure[sd->num_closure++] = sc;
-						sd->flag |= SD_HOLDOUT;
+						/* add */
+						if(sc.sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) {
+							sd->closure[sd->num_closure++] = sc;
+							sd->flag |= bsdf->shaderdata_flag();
+						}
+						break;
 					}
-					break;
-				}
-				case CClosurePrimitive::BSSRDF: {
-					CBSSRDFClosure *bssrdf = (CBSSRDFClosure *)prim;
-					float sample_weight = fabsf(average(weight));
+					case CClosurePrimitive::Emissive: {
+						/* sample weight */
+						float sample_weight = fabsf(average(weight));
 
-					if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure+2 < MAX_CLOSURE) {
 						sc.sample_weight = sample_weight;
-
-						sc.type = bssrdf->sc.type;
-						sc.N = bssrdf->sc.N;
-						sc.data1 = bssrdf->sc.data1;
-						sc.T.x = bssrdf->sc.T.x;
+						sc.type = CLOSURE_EMISSION_ID;
+						sc.data0 = 0.0f;
+						sc.data1 = 0.0f;
+						sc.data2 = 0.0f;
 						sc.prim = NULL;
 
-						/* disable in case of diffuse ancestor, can't see it well then and
-						 * adds considerably noise due to probabilities of continuing path
-						 * getting lower and lower */
-						if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR)
-							bssrdf->radius = make_float3(0.0f, 0.0f, 0.0f);
-
-						/* create one closure for each color channel */
-						if(fabsf(weight.x) > 0.0f) {
-							sc.weight = make_float3(weight.x, 0.0f, 0.0f);
-							sc.data0 = bssrdf->radius.x;
-							sd->flag |= bssrdf_setup(&sc, sc.type);
+						/* flag */
+						if(sd->num_closure < MAX_CLOSURE) {
 							sd->closure[sd->num_closure++] = sc;
+							sd->flag |= SD_EMISSION;
 						}
+						break;
+					}
+					case CClosurePrimitive::AmbientOcclusion: {
+						/* sample weight */
+						float sample_weight = fabsf(average(weight));
+
+						sc.sample_weight = sample_weight;
+						sc.type = CLOSURE_AMBIENT_OCCLUSION_ID;
+						sc.data0 = 0.0f;
+						sc.data1 = 0.0f;
+						sc.data2 = 0.0f;
+						sc.prim = NULL;
 
-						if(fabsf(weight.y) > 0.0f) {
-							sc.weight = make_float3(0.0f, weight.y, 0.0f);
-							sc.data0 = bssrdf->radius.y;
-							sd->flag |= bssrdf_setup(&sc, sc.type);
+						if(sd->num_closure < MAX_CLOSURE) {
 							sd->closure[sd->num_closure++] = sc;
+							sd->flag |= SD_AO;
 						}
+						break;
+					}
+					case CClosurePrimitive::Holdout: {
+						sc.sample_weight = 0.0f;
+						sc.type = CLOSURE_HOLDOUT_ID;
+						sc.data0 = 0.0f;
+						sc.data1 = 0.0f;
+						sc.data2 = 0.0f;
+						sc.prim = NULL;
 
-						if(fabsf(weight.z) > 0.0f) {
-							sc.weight = make_float3(0.0f, 0.0f, weight.z);
-							sc.data0 = bssrdf->radius.z;
-							sd->flag |= bssrdf_setup(&sc, sc.type);
+						if(sd->num_closure < MAX_CLOSURE) {
 							sd->closure[sd->num_closure++] = sc;
+							sd->flag |= SD_HOLDOUT;
+						}
+						break;
+					}
+					case CClosurePrimitive::BSSRDF: {
+						CBSSRDFClosure *bssrdf = (CBSSRDFClosure *)prim;
+						float sample_weight = fabsf(average(weight));
+
+						if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure+2 < MAX_CLOSURE) {
+							sc.sample_weight = sample_weight;
+
+							sc.type = bssrdf->sc.type;
+							sc.N = bssrdf->sc.N;
+							sc.data1 = bssrdf->sc.data1;
+							sc.T.x = bssrdf->sc.T.x;
+							sc.prim = NULL;
+
+							/* disable in case of diffuse ancestor, can't see it well then and
+							 * adds considerably noise due to probabilities of continuing path
+							 * getting lower and lower */
+							if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR)
+								bssrdf->radius = make_float3(0.0f, 0.0f, 0.0f);
+
+							/* create one closure for each color channel */
+							if(fabsf(weight.x) > 0.0f) {
+								sc.weight = make_float3(weight.x, 0.0f, 0.0f);
+								sc.data0 = bssrdf->radius.x;
+								sc.data1 = 0.0f;
+								sd->flag |= bssrdf_setup(&sc, sc.type);
+								sd->closure[sd->num_closure++] = sc;
+							}
+
+							if(fabsf(weight.y) > 0.0f) {
+								sc.weight = make_float3(0.0f, weight.y, 0.0f);
+								sc.data0 = bssrdf->radius.y;
+								sc.data1 = 0.0f;
+								sd->flag |= bssrdf_setup(&sc, sc.type);
+								sd->closure[sd->num_closure++] = sc;
+							}
+
+							if(fabsf(weight.z) > 0.0f) {
+								sc.weight = make_float3(0.0f, 0.0f, weight.z);
+								sc.data0 = bssrdf->radius.z;
+								sc.data1 = 0.0f;
+								sd->flag |= bssrdf_setup(&sc, sc.type);
+								sd->closure[sd->num_closure++] = sc;
+							}
 						}
+						break;
 					}
-					break;
+					case CClosurePrimitive::Background:
+					case CClosurePrimitive::Volume:
+						break; /* not relevant */
 				}
-				case CClosurePrimitive::Background:
-				case CClosurePrimitive::Volume:
-					break; /* not relevant */
 			}
+			break;
 		}
 	}
-	else if (closure->type == OSL::ClosureColor::MUL) {
-		OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
-		flatten_surface_closure_tree(sd, path_flag, mul->closure, TO_FLOAT3(mul->weight) * weight);
-	}
-	else if (closure->type == OSL::ClosureColor::ADD) {
-		OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
-		flatten_surface_closure_tree(sd, path_flag, add->closureA, weight);
-		flatten_surface_closure_tree(sd, path_flag, add->closureB, weight);
-	}
 }
 
 void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, int path_flag, ShaderContext ctx)
@@ -315,11 +329,11 @@ void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, int path_flag, S
 	OSL::ShadingContext *octx = tdata->context[(int)ctx];
 	int shader = sd->shader & SHADER_MASK;
 
-	if (kg->osl->surface_state[shader])
+	if(kg->osl->surface_state[shader])
 		ss->execute(*octx, *(kg->osl->surface_state[shader]), *globals);
 
 	/* flatten closure tree */
-	if (globals->Ci)
+	if(globals->Ci)
 		flatten_surface_closure_tree(sd, path_flag, globals->Ci);
 }
 
@@ -331,27 +345,33 @@ static float3 flatten_background_closure_tree(const OSL::ClosureColor *closure)
 	 * is only one supported closure type at the moment, which has no evaluation
 	 * functions, so we just sum the weights */
 
-	if (closure->type == OSL::ClosureColor::COMPONENT) {
-		OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure;
-		CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
-
-		if (prim && prim->category == CClosurePrimitive::Background)
-#ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS
-			return TO_FLOAT3(comp->w);
+#if OSL_LIBRARY_VERSION_CODE < 10700
+	switch(closure->type) {
 #else
-			return make_float3(1.0f, 1.0f, 1.0f);
+	switch(closure->id) {
 #endif
-	}
-	else if (closure->type == OSL::ClosureColor::MUL) {
-		OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
+		case OSL::ClosureColor::MUL: {
+			OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
 
-		return TO_FLOAT3(mul->weight) * flatten_background_closure_tree(mul->closure);
-	}
-	else if (closure->type == OSL::ClosureColor::ADD) {
-		OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
+			return TO_FLOAT3(mul->weight) * flatten_background_closure_tree(mul->closure);
+		}
+		case OSL::ClosureColor::ADD: {
+			OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
 
-		return flatten_background_closure_tree(add->closureA) +
-		       flatten_background_closure_tree(add->closureB);
+			return flatten_background_closure_tree(add->closureA) +
+			       flatten_background_closure_tree(add->closureB);
+		}
+		default: {
+			OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure;
+			CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
+
+			if(prim && prim->category == CClosurePrimitive::Background)
+#ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS
+				return TO_FLOAT3(comp->w);
+#else
+				return make_float3(1.0f, 1.0f, 1.0f);
+#endif
+		}
 	}
 
 	return make_float3(0.0f, 0.0f, 0.0f);
@@ -368,11 +388,11 @@ float3 OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, int path_fl
 	OSL::ShaderGlobals *globals = &tdata->globals;
 	OSL::ShadingContext *octx = tdata->context[(int)ctx];
 
-	if (kg->osl->background_state)
+	if(kg->osl->background_state)
 		ss->execute(*octx, *(kg->osl->background_state), *globals);
 
 	/* return background color immediately */
-	if (globals->Ci)
+	if(globals->Ci)
 		return flatten_background_closure_tree(globals->Ci);
 
 	return make_float3(0.0f, 0.0f, 0.0f);
@@ -386,76 +406,84 @@ static void flatten_volume_closure_tree(ShaderData *sd,
 	/* OSL gives us a closure tree, we flatten it into arrays per
 	 * closure type, for evaluation, sampling, etc later on. */
 
-	if (closure->type == OSL::ClosureColor::COMPONENT) {
-		OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure;
-		CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
+#if OSL_LIBRARY_VERSION_CODE < 10700
+	switch(closure->type) {
+#else
+	switch(closure->id) {
+#endif
+		case OSL::ClosureColor::MUL: {
+			OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
+			flatten_volume_closure_tree(sd, mul->closure, TO_FLOAT3(mul->weight) * weight);
+			break;
+		}
+		case OSL::ClosureColor::ADD: {
+			OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
+			flatten_volume_closure_tree(sd, add->closureA, weight);
+			flatten_volume_closure_tree(sd, add->closureB, weight);
+			break;
+		}
+		default: {
+			OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure;
+			CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
 
-		if (prim) {
-			ShaderClosure sc;
+			if(prim) {
+				ShaderClosure sc;
 
 #ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS
-			weight = weight*TO_FLOAT3(comp->w);
+				weight = weight*TO_FLOAT3(comp->w);
 #endif
-			sc.weight = weight;
-
-			prim->setup();
-
-			switch (prim->category) {
-				case CClosurePrimitive::Volume: {
-					CVolumeClosure *volume = (CVolumeClosure *)prim;
-					/* sample weight */
-					float sample_weight = fabsf(average(weight));
-
-					sc.sample_weight = sample_weight;
-					sc.type = volume->sc.type;
-					sc.data0 = volume->sc.data0;
-					sc.data1 = volume->sc.data1;
-
-					/* add */
-					if((sc.sample_weight > CLOSURE_WEIGHT_CUTOFF) &&
-					   (sd->num_closure < MAX_CLOSURE))
-					{
-						sd->closure[sd->num_closure++] = sc;
-						sd->flag |= volume->shaderdata_flag();
+				sc.weight = weight;
+
+				prim->setup();
+
+				switch(prim->category) {
+					case CClosurePrimitive::Volume: {
+						CVolumeClosure *volume = (CVolumeClosure *)prim;
+						/* sample weight */
+						float sample_weight = fabsf(average(weight));
+
+						sc.sample_weight = sample_weight;
+						sc.type = volume->sc.type;
+						sc.data0 = volume->sc.data0;
+						sc.data1 = volume->sc.data1;
+
+						/* add */
+						if((sc.sample_weight > CLOSURE_WEIGHT_CUTOFF) &&
+						   (sd->num_closure < MAX_CLOSURE))
+						{
+							sd->closure[sd->num_closure++] = sc;
+							sd->flag |= volume->shaderdata_flag();
+						}
+						break;
 					}
-					break;
-				}
-				case CClosurePrimitive::Emissive: {
-					/* sample weight */
-					float sample_weight = fabsf(average(weight));
-
-					sc.sample_weight = sample_weight;
-					sc.type = CLOSURE_EMISSION_ID;
-					sc.data0 = 0.0f;
-					sc.data1 = 0.0f;
-					sc.prim = NULL;
-
-					/* flag */
-					if(sd->num_closure < MAX_CLOSURE) {
-						sd->closure[sd->num_closure++] = sc;
-						sd->flag |= SD_EMISSION;
+					case CClosurePrimitive::Emissive: {
+						/* sample weight */
+						float sample_weight = fabsf(average(weight));
+
+						sc.sample_weight = sample_weight;
+						sc.type = CLOSURE_EMISSION_ID;
+						sc.data0 = 0.0f;
+						sc.data1 = 0.0f;
+						sc.prim = NULL;
+
+						/* flag */
+						if(sd->num_closure < MAX_CLOSURE) {
+							sd->closure[sd->num_closure++] = sc;
+							sd->flag |= SD_EMISSION;
+						}
+						break;
 					}
-					break;
+					case CClosurePrimitive::Holdout:
+						break; /* not implemented */
+					case CClosurePrimitive::Background:
+					case CClosurePrimitive::BSDF:
+					case CClosurePrimitive::BSSRDF:
+					case CClosurePrimitive::AmbientOcclusion:
+						break; /* not relevant */
 				}
-				case CClosurePrimitive::Holdout:
-					break; /* not implemented */
-				case CClosurePrimitive::Background:
-				case CClosurePrimitive::BSDF:
-				case CClosurePrimitive::BSSRDF:
-				case CClosurePrimitive::AmbientOcclusion:
-					break; /* not relevant */
 			}
 		}
 	}
-	else if (closure->type == OSL::ClosureColor::MUL) {
-		OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
-		flatten_volume_closure_tree(sd, mul->closure, TO_FLOAT3(mul->weight) * weight);
-	}
-	else if (closure->type == OSL::ClosureColor::ADD) {
-		OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
-		flatten_volume_closure_tree(sd, add->closureA, weight);
-		flatten_volume_closure_tree(sd, add->closureB, weight);
-	}
 }
 
 void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, int path_flag, ShaderContext ctx)
@@ -470,11 +498,11 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, int path_flag, Sh
 	OSL::ShadingContext *octx = tdata->context[(int)ctx];
 	int shader = sd->shader & SHADER_MASK;
 
-	if (kg->osl->volume_state[shader])
+	if(kg->osl->volume_state[shader])
 		ss->execute(*octx, *(kg->osl->volume_state[shader]), *globals);
 	
 	/* flatten closure tree */
-	if (globals->Ci)
+	if(globals->Ci)
 		flatten_volume_closure_tree(sd, globals->Ci);
 }
 
@@ -492,7 +520,7 @@ void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderConte
 	OSL::ShadingContext *octx = tdata->context[(int)ctx];
 	int shader = sd->shader & SHADER_MASK;
 
-	if (kg->osl->displacement_state[shader])
+	if(kg->osl->displacement_state[shader])
 		ss->execute(*octx, *(kg->osl->displacement_state[shader]), *globals);
 
 	/* get back position */
@@ -519,7 +547,7 @@ float3 OSLShader::bsdf_eval(const ShaderData *sd, const ShaderClosure *sc, const
 	CBSDFClosure *bsdf = (CBSDFClosure *)sc->prim;
 	float3 bsdf_eval;
 
-	if (dot(sd->Ng, omega_in) >= 0.0f)
+	if(dot(sd->Ng, omega_in) >= 0.0f)
 		bsdf_eval = bsdf->eval_reflect(sd->I, omega_in, pdf);
 	else
 		bsdf_eval = bsdf->eval_transmit(sd->I, omega_in, pdf);
@@ -547,7 +575,7 @@ int OSLShader::find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id,
 	ustring stdname(std::string("geom:") + std::string(Attribute::standard_name((AttributeStandard)id)));
 	OSLGlobals::AttributeMap::const_iterator it = attr_map.find(stdname);
 
-	if (it != attr_map.end()) {
+	if(it != attr_map.end()) {
 		const OSLGlobals::Attribute &osl_attr = it->second;
 		*elem = osl_attr.elem;
 
diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h
index 40f50d8b0e9..15dd74f9d38 100644
--- a/intern/cycles/kernel/osl/osl_shader.h
+++ b/intern/cycles/kernel/osl/osl_shader.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __OSL_SHADER_H__
diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt
index 0b735ede701..81931463cad 100644
--- a/intern/cycles/kernel/shaders/CMakeLists.txt
+++ b/intern/cycles/kernel/shaders/CMakeLists.txt
@@ -74,6 +74,7 @@ set(SRC_OSL
 	node_vector_transform.osl
 	node_velvet_bsdf.osl
 	node_voronoi_texture.osl
+	node_voxel_texture.osl
 	node_wavelength.osl
 	node_blackbody.osl
 	node_wave_texture.osl
diff --git a/intern/cycles/kernel/shaders/node_absorption_volume.osl b/intern/cycles/kernel/shaders/node_absorption_volume.osl
index 6bac83ba4f5..18f662ebbbd 100644
--- a/intern/cycles/kernel/shaders/node_absorption_volume.osl
+++ b/intern/cycles/kernel/shaders/node_absorption_volume.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_add_closure.osl b/intern/cycles/kernel/shaders/node_add_closure.osl
index b826fb22784..b6596e0b6bd 100644
--- a/intern/cycles/kernel/shaders/node_add_closure.osl
+++ b/intern/cycles/kernel/shaders/node_add_closure.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_ambient_occlusion.osl b/intern/cycles/kernel/shaders/node_ambient_occlusion.osl
index 961aed1016b..5f056122bbe 100644
--- a/intern/cycles/kernel/shaders/node_ambient_occlusion.osl
+++ b/intern/cycles/kernel/shaders/node_ambient_occlusion.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl
index da1e4f77107..281ed4e8726 100644
--- a/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_attribute.osl b/intern/cycles/kernel/shaders/node_attribute.osl
index 43f69fab053..67183e9ffe0 100644
--- a/intern/cycles/kernel/shaders/node_attribute.osl
+++ b/intern/cycles/kernel/shaders/node_attribute.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_background.osl b/intern/cycles/kernel/shaders/node_background.osl
index c4379a8f71b..613d4e360fa 100644
--- a/intern/cycles/kernel/shaders/node_background.osl
+++ b/intern/cycles/kernel/shaders/node_background.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_blackbody.osl b/intern/cycles/kernel/shaders/node_blackbody.osl
index d26e56ab06d..1da6894d0f0 100644
--- a/intern/cycles/kernel/shaders/node_blackbody.osl
+++ b/intern/cycles/kernel/shaders/node_blackbody.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_brick_texture.osl b/intern/cycles/kernel/shaders/node_brick_texture.osl
index 70a6a6ea7ce..35e01178ba8 100644
--- a/intern/cycles/kernel/shaders/node_brick_texture.osl
+++ b/intern/cycles/kernel/shaders/node_brick_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -22,6 +22,7 @@
 float brick_noise(int n) /* fast integer noise */
 {
 	int nn;
+	n = (n + 1013) & 2147483647;
 	n = (n >> 13) ^ n;
 	nn = (n * (n * n * 60493 + 19990303) + 1376312589) & 2147483647;
 	return 0.5 * ((float)nn / 1073741824.0);
@@ -87,12 +88,9 @@ shader node_brick_texture(
 		
 	if (Fac != 1.0) {
 		float facm = 1.0 - tint;
-
-		Col[0] = facm * (Color1[0]) + tint * Color2[0];
-		Col[1] = facm * (Color1[1]) + tint * Color2[1];
-		Col[2] = facm * (Color1[2]) + tint * Color2[2];
+		Col = facm * Color1 + tint * Color2;
 	}
 	
-	Color = (Fac == 1.0) ? Mortar: Col;
+	Color = (Fac == 1.0) ? Mortar : Col;
 }
 
diff --git a/intern/cycles/kernel/shaders/node_brightness.osl b/intern/cycles/kernel/shaders/node_brightness.osl
index 468b0f052c3..00cfb167885 100644
--- a/intern/cycles/kernel/shaders/node_brightness.osl
+++ b/intern/cycles/kernel/shaders/node_brightness.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_bump.osl b/intern/cycles/kernel/shaders/node_bump.osl
index bbc08760cd5..9882857f2ec 100644
--- a/intern/cycles/kernel/shaders/node_bump.osl
+++ b/intern/cycles/kernel/shaders/node_bump.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_camera.osl b/intern/cycles/kernel/shaders/node_camera.osl
index 20ebb7dc095..5e90cb8b8ee 100644
--- a/intern/cycles/kernel/shaders/node_camera.osl
+++ b/intern/cycles/kernel/shaders/node_camera.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_checker_texture.osl b/intern/cycles/kernel/shaders/node_checker_texture.osl
index a6d21fd36f3..ae84c71dd42 100644
--- a/intern/cycles/kernel/shaders/node_checker_texture.osl
+++ b/intern/cycles/kernel/shaders/node_checker_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_color.h b/intern/cycles/kernel/shaders/node_color.h
index 095e628f20c..4a17286a07f 100644
--- a/intern/cycles/kernel/shaders/node_color.h
+++ b/intern/cycles/kernel/shaders/node_color.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 float color_srgb_to_scene_linear(float c)
diff --git a/intern/cycles/kernel/shaders/node_combine_hsv.osl b/intern/cycles/kernel/shaders/node_combine_hsv.osl
index 010773acc5c..6b922bf4e6b 100644
--- a/intern/cycles/kernel/shaders/node_combine_hsv.osl
+++ b/intern/cycles/kernel/shaders/node_combine_hsv.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_combine_rgb.osl b/intern/cycles/kernel/shaders/node_combine_rgb.osl
index 8466a89b536..f343fdefd84 100644
--- a/intern/cycles/kernel/shaders/node_combine_rgb.osl
+++ b/intern/cycles/kernel/shaders/node_combine_rgb.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_combine_xyz.osl b/intern/cycles/kernel/shaders/node_combine_xyz.osl
index 933dee5bd78..86182056b09 100644
--- a/intern/cycles/kernel/shaders/node_combine_xyz.osl
+++ b/intern/cycles/kernel/shaders/node_combine_xyz.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_convert_from_color.osl b/intern/cycles/kernel/shaders/node_convert_from_color.osl
index 2f4503e66e3..44074317f42 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_color.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_color.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_convert_from_float.osl b/intern/cycles/kernel/shaders/node_convert_from_float.osl
index f5b91903078..fc5c79c4c64 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_float.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_float.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_convert_from_int.osl b/intern/cycles/kernel/shaders/node_convert_from_int.osl
index 110922a5df1..3c3785ebc0d 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_int.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_int.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_convert_from_normal.osl b/intern/cycles/kernel/shaders/node_convert_from_normal.osl
index 995c86d8828..8ecc56ac8ce 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_normal.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_normal.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_convert_from_point.osl b/intern/cycles/kernel/shaders/node_convert_from_point.osl
index 2ed151273a8..e5913b7a1e4 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_point.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_point.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_convert_from_string.osl b/intern/cycles/kernel/shaders/node_convert_from_string.osl
index 50cce252be4..0466734277b 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_string.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_string.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_convert_from_vector.osl b/intern/cycles/kernel/shaders/node_convert_from_vector.osl
index 035c46625a0..79c5cb04550 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_vector.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_vector.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_diffuse_bsdf.osl b/intern/cycles/kernel/shaders/node_diffuse_bsdf.osl
index e8c94660e4f..2bef2d65baa 100644
--- a/intern/cycles/kernel/shaders/node_diffuse_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_diffuse_bsdf.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_emission.osl b/intern/cycles/kernel/shaders/node_emission.osl
index b28d731c19f..c36e2a4c0f3 100644
--- a/intern/cycles/kernel/shaders/node_emission.osl
+++ b/intern/cycles/kernel/shaders/node_emission.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_environment_texture.osl b/intern/cycles/kernel/shaders/node_environment_texture.osl
index 136ccdf8b18..14f0226a0e5 100644
--- a/intern/cycles/kernel/shaders/node_environment_texture.osl
+++ b/intern/cycles/kernel/shaders/node_environment_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_fresnel.h b/intern/cycles/kernel/shaders/node_fresnel.h
index 9f10ba8023e..de2d40a849c 100644
--- a/intern/cycles/kernel/shaders/node_fresnel.h
+++ b/intern/cycles/kernel/shaders/node_fresnel.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
  
 float fresnel_dielectric_cos(float cosi, float eta)
@@ -36,14 +36,14 @@ float fresnel_dielectric_cos(float cosi, float eta)
 
 color fresnel_conductor(float cosi, color eta, color k)
 {
-	color cosi2 = color(cosi*cosi);
+	color cosi2 = color(cosi * cosi);
 	color one = color(1, 1, 1);
 	color tmp_f = eta * eta + k * k;
 	color tmp = tmp_f * cosi2;
 	color Rparl2 = (tmp - (2.0 * eta * cosi) + one) /
-					(tmp + (2.0 * eta * cosi) + one);
+	               (tmp + (2.0 * eta * cosi) + one);
 	color Rperp2 = (tmp_f - (2.0 * eta * cosi) + cosi2) /
-					(tmp_f + (2.0 * eta * cosi) + cosi2);
+	               (tmp_f + (2.0 * eta * cosi) + cosi2);
 	return (Rparl2 + Rperp2) * 0.5;
 }
 
diff --git a/intern/cycles/kernel/shaders/node_fresnel.osl b/intern/cycles/kernel/shaders/node_fresnel.osl
index 7ef553c0f39..8bec7b432f5 100644
--- a/intern/cycles/kernel/shaders/node_fresnel.osl
+++ b/intern/cycles/kernel/shaders/node_fresnel.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_gamma.osl b/intern/cycles/kernel/shaders/node_gamma.osl
index a2ad3f766fe..bc4c1b34266 100644
--- a/intern/cycles/kernel/shaders/node_gamma.osl
+++ b/intern/cycles/kernel/shaders/node_gamma.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_geometry.osl b/intern/cycles/kernel/shaders/node_geometry.osl
index 7bef2051865..b0bd7692489 100644
--- a/intern/cycles/kernel/shaders/node_geometry.osl
+++ b/intern/cycles/kernel/shaders/node_geometry.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -26,7 +26,8 @@ shader node_geometry(
 	output normal TrueNormal = normal(0.0, 0.0, 0.0),
 	output vector Incoming = vector(0.0, 0.0, 0.0),
 	output point Parametric = point(0.0, 0.0, 0.0),
-	output float Backfacing = 0.0)
+	output float Backfacing = 0.0,
+	output float Pointiness = 0.0)
 {
 	Position = P;
 	Normal = NormalIn;
@@ -49,7 +50,7 @@ shader node_geometry(
 
 	/* try to create spherical tangent from generated coordinates */
 	if (getattribute("geom:generated", generated)) {
-		normal data = normal(-(generated[1]-0.5), (generated[0]-0.5), 0.0);
+		normal data = normal(-(generated[1] - 0.5), (generated[0] - 0.5), 0.0);
 		vector T = transform("object", "world", data);
 		Tangent = cross(Normal, normalize(cross(T, Normal)));
 	}
@@ -57,5 +58,13 @@ shader node_geometry(
 		/* otherwise use surface derivatives */
 		Tangent = normalize(dPdu);
 	}
+
+	getattribute("geom:pointiness", Pointiness);
+	if (bump_offset == "dx") {
+		Pointiness += Dx(Pointiness);
+	}
+	else if (bump_offset == "dy") {
+		Pointiness += Dy(Pointiness);
+	}
 }
 
diff --git a/intern/cycles/kernel/shaders/node_glass_bsdf.osl b/intern/cycles/kernel/shaders/node_glass_bsdf.osl
index b3d6133553b..68bc107cc5e 100644
--- a/intern/cycles/kernel/shaders/node_glass_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_glass_bsdf.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
index 5c727ca6917..d3250b32d0b 100644
--- a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_gradient_texture.osl b/intern/cycles/kernel/shaders/node_gradient_texture.osl
index 5aa05917dc2..52b49688ab3 100644
--- a/intern/cycles/kernel/shaders/node_gradient_texture.osl
+++ b/intern/cycles/kernel/shaders/node_gradient_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_hair_info.osl b/intern/cycles/kernel/shaders/node_hair_info.osl
index 1d1ba1983e1..965d2a3c7f7 100644
--- a/intern/cycles/kernel/shaders/node_hair_info.osl
+++ b/intern/cycles/kernel/shaders/node_hair_info.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_holdout.osl b/intern/cycles/kernel/shaders/node_holdout.osl
index cafad1b5757..78a9f46fd15 100644
--- a/intern/cycles/kernel/shaders/node_holdout.osl
+++ b/intern/cycles/kernel/shaders/node_holdout.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_hsv.osl b/intern/cycles/kernel/shaders/node_hsv.osl
index 4722bde4cd7..8d9e50fed6b 100644
--- a/intern/cycles/kernel/shaders/node_hsv.osl
+++ b/intern/cycles/kernel/shaders/node_hsv.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -35,6 +35,11 @@ shader node_hsv(
 
 	Color = hsv_to_rgb(Color);
 
+	// Clamp color to prevent negative values caused by oversaturation.
+	Color[0] = max(Color[0], 0.0);
+	Color[1] = max(Color[1], 0.0);
+	Color[2] = max(Color[2], 0.0);
+
 	ColorOut = mix(ColorIn, Color, Fac);
 }
 
diff --git a/intern/cycles/kernel/shaders/node_image_texture.osl b/intern/cycles/kernel/shaders/node_image_texture.osl
index 7238a1e8862..d3a347b70db 100644
--- a/intern/cycles/kernel/shaders/node_image_texture.osl
+++ b/intern/cycles/kernel/shaders/node_image_texture.osl
@@ -11,15 +11,60 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
 #include "node_color.h"
 
-color image_texture_lookup(string filename, string color_space, float u, float v, output float Alpha, int use_alpha, int is_float, string interpolation)
+point texco_remap_square(point co)
 {
-	color rgb = (color)texture(filename, u, 1.0 - v, "wrap", "periodic", "interp", interpolation, "alpha", Alpha);
+	return (co - point(0.5, 0.5, 0.5)) * 2.0;
+}
+
+point map_to_tube(vector dir)
+{
+	float u, v;
+	v = (dir[2] + 1.0) * 0.5;
+	float len = sqrt(dir[0] * dir[0] + dir[1] * dir[1]);
+	if (len > 0.0) {
+		u = (1.0 - (atan2(dir[0] / len, dir[1] / len) / M_PI)) * 0.5;
+	}
+	else {
+		v = u = 0.0; /* To avoid un-initialized variables. */
+	}
+	return point(u, v, 0.0);
+}
+
+point map_to_sphere(vector dir)
+{
+	float len = length(dir);
+	float v, u;
+	if (len > 0.0) {
+		if (dir[0] == 0.0 && dir[1] == 0.0) {
+			u = 0.0;  /* Otherwise domain error. */
+		}
+		else {
+			u = (1.0 - atan2(dir[0], dir[1]) / M_PI) / 2.0;
+		}
+		v = 1.0 - acos(dir[2] / len) / M_PI;
+	}
+	else {
+		v = u = 0.0;  /* To avoid un-initialized variables. */
+	}
+	return point(u, v, 0.0);
+}
+
+color image_texture_lookup(string filename,
+                           string color_space,
+                           float u, float v,
+                           output float Alpha,
+                           int use_alpha,
+                           int is_float,
+                           string interpolation,
+                           string wrap)
+{
+	color rgb = (color)texture(filename, u, 1.0 - v, "wrap", wrap, "interp", interpolation, "alpha", Alpha);
 
 	if (use_alpha) {
 		rgb = color_unpremultiply(rgb, Alpha);
@@ -43,6 +88,7 @@ shader node_image_texture(
 	string color_space = "sRGB",
 	string projection = "Flat",
 	string interpolation = "smartcubic",
+	string wrap = "periodic",
 	float projection_blend = 0.0,
 	int is_float = 1,
 	int use_alpha = 1,
@@ -55,7 +101,14 @@ shader node_image_texture(
 		p = transform(mapping, p);
 	
 	if (projection == "Flat") {
-		Color = image_texture_lookup(filename, color_space, p[0], p[1], Alpha, use_alpha, is_float, interpolation);
+		Color = image_texture_lookup(filename,
+		                             color_space,
+		                             p[0], p[1],
+		                             Alpha,
+		                             use_alpha,
+		                             is_float,
+		                             interpolation,
+		                             wrap);
 	}
 	else if (projection == "Box") {
 		/* object space normal */
@@ -113,6 +166,10 @@ shader node_image_texture(
 				weight[2] = ((2.0 - limit) * Nob[2] + (limit - 1.0)) / (2.0 * limit - 1.0);
 			}
 		}
+		else {
+			/* Desperate mode, no valid choice anyway, fall back to one side. */
+			weight[0] = 1.0;
+		}
 
 		Color = color(0.0, 0.0, 0.0);
 		Alpha = 0.0;
@@ -120,17 +177,59 @@ shader node_image_texture(
 		float tmp_alpha;
 
 		if (weight[0] > 0.0) {
-			Color += weight[0] * image_texture_lookup(filename, color_space, p[1], p[2], tmp_alpha, use_alpha, is_float, interpolation);
+			Color += weight[0] * image_texture_lookup(filename,
+			                                          color_space,
+			                                          p[1], p[2],
+			                                          tmp_alpha,
+			                                          use_alpha,
+			                                          is_float,
+			                                          interpolation,
+			                                          wrap);
 			Alpha += weight[0] * tmp_alpha;
 		}
 		if (weight[1] > 0.0) {
-			Color += weight[1] * image_texture_lookup(filename, color_space, p[0], p[2], tmp_alpha, use_alpha, is_float, interpolation);
+			Color += weight[1] * image_texture_lookup(filename,
+			                                          color_space,
+			                                          p[0], p[2],
+			                                          tmp_alpha,
+			                                          use_alpha,
+			                                          is_float,
+			                                          interpolation,
+			                                          wrap);
 			Alpha += weight[1] * tmp_alpha;
 		}
 		if (weight[2] > 0.0) {
-			Color += weight[2] * image_texture_lookup(filename, color_space, p[1], p[0], tmp_alpha, use_alpha, is_float, interpolation);
+			Color += weight[2] * image_texture_lookup(filename,
+			                                          color_space,
+			                                          p[1], p[0],
+			                                          tmp_alpha,
+			                                          use_alpha,
+			                                          is_float,
+			                                          interpolation,
+			                                          wrap);
 			Alpha += weight[2] * tmp_alpha;
 		}
 	}
+	else if (projection == "Sphere") {
+		point projected = map_to_sphere(texco_remap_square(p));
+		Color = image_texture_lookup(filename,
+		                             color_space,
+		                             projected[0], projected[1],
+		                             Alpha,
+		                             use_alpha,
+		                             is_float,
+		                             interpolation,
+		                             wrap);
+	}
+	else if (projection == "Tube") {
+		point projected = map_to_tube(texco_remap_square(p));
+		Color = image_texture_lookup(filename,
+		                             color_space,
+		                             projected[0], projected[1],
+		                             Alpha,
+		                             use_alpha,
+		                             is_float,
+		                             interpolation,
+		                             wrap);
+	}
 }
-
diff --git a/intern/cycles/kernel/shaders/node_invert.osl b/intern/cycles/kernel/shaders/node_invert.osl
index 81ef2d0dc3d..b33b0a43d63 100644
--- a/intern/cycles/kernel/shaders/node_invert.osl
+++ b/intern/cycles/kernel/shaders/node_invert.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_layer_weight.osl b/intern/cycles/kernel/shaders/node_layer_weight.osl
index d03ebe2239a..f583df25773 100644
--- a/intern/cycles/kernel/shaders/node_layer_weight.osl
+++ b/intern/cycles/kernel/shaders/node_layer_weight.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_light_falloff.osl b/intern/cycles/kernel/shaders/node_light_falloff.osl
index 311b87f3764..a594e33d643 100644
--- a/intern/cycles/kernel/shaders/node_light_falloff.osl
+++ b/intern/cycles/kernel/shaders/node_light_falloff.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_light_path.osl b/intern/cycles/kernel/shaders/node_light_path.osl
index 95fbcabf917..99a92c4f403 100644
--- a/intern/cycles/kernel/shaders/node_light_path.osl
+++ b/intern/cycles/kernel/shaders/node_light_path.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_magic_texture.osl b/intern/cycles/kernel/shaders/node_magic_texture.osl
index b8afc6e29ac..c09523f205b 100644
--- a/intern/cycles/kernel/shaders/node_magic_texture.osl
+++ b/intern/cycles/kernel/shaders/node_magic_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_mapping.osl b/intern/cycles/kernel/shaders/node_mapping.osl
index 46ff9f05e07..69106957ee4 100644
--- a/intern/cycles/kernel/shaders/node_mapping.osl
+++ b/intern/cycles/kernel/shaders/node_mapping.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_math.osl b/intern/cycles/kernel/shaders/node_math.osl
index abb6a359e75..7eef97fd7e8 100644
--- a/intern/cycles/kernel/shaders/node_math.osl
+++ b/intern/cycles/kernel/shaders/node_math.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -93,8 +93,8 @@ shader node_math(
 		Value = Value1 > Value2;
 	else if (type == "Modulo")
 		Value = safe_modulo(Value1, Value2);
-    else if (type == "Absolute")
-        Value = fabs(Value1);
+	else if (type == "Absolute")
+		Value = fabs(Value1);
 
 	if (Clamp)
 		Value = clamp(Value, 0.0, 1.0);
diff --git a/intern/cycles/kernel/shaders/node_mix.osl b/intern/cycles/kernel/shaders/node_mix.osl
index dd54fd814de..9ef58e4cbba 100644
--- a/intern/cycles/kernel/shaders/node_mix.osl
+++ b/intern/cycles/kernel/shaders/node_mix.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_mix_closure.osl b/intern/cycles/kernel/shaders/node_mix_closure.osl
index 79d71c97371..5946dfdaaba 100644
--- a/intern/cycles/kernel/shaders/node_mix_closure.osl
+++ b/intern/cycles/kernel/shaders/node_mix_closure.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_musgrave_texture.osl b/intern/cycles/kernel/shaders/node_musgrave_texture.osl
index 60762539002..4f95dec910a 100644
--- a/intern/cycles/kernel/shaders/node_musgrave_texture.osl
+++ b/intern/cycles/kernel/shaders/node_musgrave_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -26,7 +26,7 @@
  * from "Texturing and Modelling: A procedural approach"
  */
 
-float noise_musgrave_fBm(point p, string basis, float H, float lacunarity, float octaves)
+float noise_musgrave_fBm(point p, float H, float lacunarity, float octaves)
 {
 	float rmd;
 	float value = 0.0;
@@ -35,14 +35,14 @@ float noise_musgrave_fBm(point p, string basis, float H, float lacunarity, float
 	int i;
 
 	for (i = 0; i < (int)octaves; i++) {
-		value += safe_noise(p, 0) * pwr;
+		value += safe_noise(p, "signed") * pwr;
 		pwr *= pwHL;
 		p *= lacunarity;
 	}
 
 	rmd = octaves - floor(octaves);
 	if (rmd != 0.0)
-		value += rmd * safe_noise(p, 0) * pwr;
+		value += rmd * safe_noise(p, "signed") * pwr;
 
 	return value;
 }
@@ -54,7 +54,7 @@ float noise_musgrave_fBm(point p, string basis, float H, float lacunarity, float
  * octaves: number of frequencies in the fBm
  */
 
-float noise_musgrave_multi_fractal(point p, string basis, float H, float lacunarity, float octaves)
+float noise_musgrave_multi_fractal(point p, float H, float lacunarity, float octaves)
 {
 	float rmd;
 	float value = 1.0;
@@ -63,14 +63,14 @@ float noise_musgrave_multi_fractal(point p, string basis, float H, float lacunar
 	int i;
 
 	for (i = 0; i < (int)octaves; i++) {
-		value *= (pwr * safe_noise(p, 0) + 1.0);
+		value *= (pwr * safe_noise(p, "signed") + 1.0);
 		pwr *= pwHL;
 		p *= lacunarity;
 	}
 
 	rmd = octaves - floor(octaves);
 	if (rmd != 0.0)
-		value *= (rmd * pwr * safe_noise(p, 0) + 1.0); /* correct? */
+		value *= (rmd * pwr * safe_noise(p, "signed") + 1.0); /* correct? */
 
 	return value;
 }
@@ -83,7 +83,7 @@ float noise_musgrave_multi_fractal(point p, string basis, float H, float lacunar
  * offset: raises the terrain from `sea level'
  */
 
-float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacunarity, float octaves, float offset)
+float noise_musgrave_hetero_terrain(point p, float H, float lacunarity, float octaves, float offset)
 {
 	float value, increment, rmd;
 	float pwHL = pow(lacunarity, -H);
@@ -91,11 +91,11 @@ float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacuna
 	int i;
 
 	/* first unscaled octave of function; later octaves are scaled */
-	value = offset + safe_noise(p, 0);
+	value = offset + safe_noise(p, "signed");
 	p *= lacunarity;
 
 	for (i = 1; i < (int)octaves; i++) {
-		increment = (safe_noise(p, 0) + offset) * pwr * value;
+		increment = (safe_noise(p, "signed") + offset) * pwr * value;
 		value += increment;
 		pwr *= pwHL;
 		p *= lacunarity;
@@ -103,7 +103,7 @@ float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacuna
 
 	rmd = octaves - floor(octaves);
 	if (rmd != 0.0) {
-		increment = (safe_noise(p, 0) + offset) * pwr * value;
+		increment = (safe_noise(p, "signed") + offset) * pwr * value;
 		value += rmd * increment;
 	}
 
@@ -118,15 +118,15 @@ float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacuna
  * offset: raises the terrain from `sea level'
  */
 
-float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H,
-                                          float lacunarity, float octaves, float offset, float gain)
+float noise_musgrave_hybrid_multi_fractal(point p, float H, float lacunarity,
+                                          float octaves, float offset, float gain)
 {
 	float result, signal, weight, rmd;
 	float pwHL = pow(lacunarity, -H);
 	float pwr = pwHL;
 	int i;
 
-	result = safe_noise(p, 0) + offset;
+	result = safe_noise(p, "signed") + offset;
 	weight = gain * result;
 	p *= lacunarity;
 
@@ -134,7 +134,7 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H,
 		if (weight > 1.0)
 			weight = 1.0;
 
-		signal = (safe_noise(p, 0) + offset) * pwr;
+		signal = (safe_noise(p, "signed") + offset) * pwr;
 		pwr *= pwHL;
 		result += weight * signal;
 		weight *= gain * signal;
@@ -143,7 +143,7 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H,
 
 	rmd = octaves - floor(octaves);
 	if (rmd != 0.0)
-		result += rmd * ((safe_noise(p, 0) + offset) * pwr);
+		result += rmd * ((safe_noise(p, "signed") + offset) * pwr);
 
 	return result;
 }
@@ -156,15 +156,15 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H,
  * offset: raises the terrain from `sea level'
  */
 
-float noise_musgrave_ridged_multi_fractal(point p, string basis, float H,
-                                          float lacunarity, float octaves, float offset, float gain)
+float noise_musgrave_ridged_multi_fractal(point p, float H, float lacunarity,
+                                          float octaves, float offset, float gain)
 {
 	float result, signal, weight;
 	float pwHL = pow(lacunarity, -H);
 	float pwr = pwHL;
 	int i;
 
-	signal = offset - fabs(safe_noise(p, 0));
+	signal = offset - fabs(safe_noise(p, "signed"));
 	signal *= signal;
 	result = signal;
 	weight = 1.0;
@@ -172,7 +172,7 @@ float noise_musgrave_ridged_multi_fractal(point p, string basis, float H,
 	for (i = 1; i < (int)octaves; i++) {
 		p *= lacunarity;
 		weight = clamp(signal * gain, 0.0, 1.0);
-		signal = offset - fabs(safe_noise(p, 0));
+		signal = offset - fabs(safe_noise(p, "signed"));
 		signal *= signal;
 		signal *= weight;
 		result += signal * pwr;
@@ -201,7 +201,6 @@ shader node_musgrave_texture(
 	float dimension = max(Dimension, 1e-5);
 	float octaves = clamp(Detail, 0.0, 16.0);
 	float lacunarity = max(Lacunarity, 1e-5);
-	string Basis = "Perlin";
 	float intensity = 1.0;
 
 	point p = Vector;
@@ -212,15 +211,15 @@ shader node_musgrave_texture(
 	p = p * Scale;
 
 	if (Type == "Multifractal")
-		Fac = intensity * noise_musgrave_multi_fractal(p, Basis, dimension, lacunarity, octaves);
+		Fac = intensity * noise_musgrave_multi_fractal(p, dimension, lacunarity, octaves);
 	else if (Type == "fBM")
-		Fac = intensity * noise_musgrave_fBm(p, Basis, dimension, lacunarity, octaves);
+		Fac = intensity * noise_musgrave_fBm(p, dimension, lacunarity, octaves);
 	else if (Type == "Hybrid Multifractal")
-		Fac = intensity * noise_musgrave_hybrid_multi_fractal(p, Basis, dimension, lacunarity, octaves, Offset, Gain);
+		Fac = intensity * noise_musgrave_hybrid_multi_fractal(p, dimension, lacunarity, octaves, Offset, Gain);
 	else if (Type == "Ridged Multifractal")
-		Fac = intensity * noise_musgrave_ridged_multi_fractal(p, Basis, dimension, lacunarity, octaves, Offset, Gain);
+		Fac = intensity * noise_musgrave_ridged_multi_fractal(p, dimension, lacunarity, octaves, Offset, Gain);
 	else if (Type == "Hetero Terrain")
-		Fac = intensity * noise_musgrave_hetero_terrain(p, Basis, dimension, lacunarity, octaves, Offset);
+		Fac = intensity * noise_musgrave_hetero_terrain(p, dimension, lacunarity, octaves, Offset);
 	
 	Color = color(Fac, Fac, Fac);
 }
diff --git a/intern/cycles/kernel/shaders/node_noise_texture.osl b/intern/cycles/kernel/shaders/node_noise_texture.osl
index 912795966e0..e83e5b5b211 100644
--- a/intern/cycles/kernel/shaders/node_noise_texture.osl
+++ b/intern/cycles/kernel/shaders/node_noise_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -19,23 +19,23 @@
 
 /* Noise */
 
-float noise(point p, string basis, float distortion, float detail, float fac, color Color)
+float noise(point p, float distortion, float detail, float fac, color Color)
 {
 	point r;
 	int hard = 0;
 
 	if (distortion != 0.0) {
-		r[0] = noise_basis(p + point(13.5), basis) * distortion;
-		r[1] = noise_basis(p, basis) * distortion;
-		r[2] = noise_basis(p - point(13.5), basis) * distortion;
+		r[0] = safe_noise(p + point(13.5), "unsigned") * distortion;
+		r[1] = safe_noise(p, "unsigned") * distortion;
+		r[2] = safe_noise(p - point(13.5), "unsigned") * distortion;
 		
 		p += r;
 	}
 
-	fac = noise_turbulence(p, basis, detail, hard);
+	fac = noise_turbulence(p, detail, hard);
 	
-	Color = color(fac, noise_turbulence(point(p[1], p[0], p[2]), basis, detail, hard),
-		noise_turbulence(point(p[1], p[2], p[0]), basis, detail, hard));
+	Color = color(fac, noise_turbulence(point(p[1], p[0], p[2]), detail, hard),
+		noise_turbulence(point(p[1], p[2], p[0]), detail, hard));
 
 	return fac;
 }
@@ -55,7 +55,6 @@ shader node_noise_texture(
 	if (use_mapping)
 		p = transform(mapping, p);
 
-	string Basis = "Perlin";
-	Fac = noise(p * Scale, Basis, Distortion, Detail, Fac, Color);
+	Fac = noise(p * Scale, Distortion, Detail, Fac, Color);
 }
 
diff --git a/intern/cycles/kernel/shaders/node_normal.osl b/intern/cycles/kernel/shaders/node_normal.osl
index 14af044e0c0..2d04978fc72 100644
--- a/intern/cycles/kernel/shaders/node_normal.osl
+++ b/intern/cycles/kernel/shaders/node_normal.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -23,6 +23,6 @@ shader node_normal(
 	output float Dot = 1.0)
 {
 	NormalOut = normalize(Direction);
-	Dot = dot(NormalOut, NormalIn);
+	Dot = dot(NormalOut, normalize(NormalIn));
 }
 
diff --git a/intern/cycles/kernel/shaders/node_normal_map.osl b/intern/cycles/kernel/shaders/node_normal_map.osl
index c2080ecb194..01be566fb20 100644
--- a/intern/cycles/kernel/shaders/node_normal_map.osl
+++ b/intern/cycles/kernel/shaders/node_normal_map.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_object_info.osl b/intern/cycles/kernel/shaders/node_object_info.osl
index 1ebe767e82d..dd7c663b8d8 100644
--- a/intern/cycles/kernel/shaders/node_object_info.osl
+++ b/intern/cycles/kernel/shaders/node_object_info.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_output_displacement.osl b/intern/cycles/kernel/shaders/node_output_displacement.osl
index 613d6be5f3b..d0688cfda8d 100644
--- a/intern/cycles/kernel/shaders/node_output_displacement.osl
+++ b/intern/cycles/kernel/shaders/node_output_displacement.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_output_surface.osl b/intern/cycles/kernel/shaders/node_output_surface.osl
index fb16e85ce0d..2cc4575a8c8 100644
--- a/intern/cycles/kernel/shaders/node_output_surface.osl
+++ b/intern/cycles/kernel/shaders/node_output_surface.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_output_volume.osl b/intern/cycles/kernel/shaders/node_output_volume.osl
index 11a884b7d75..f220ba866e3 100644
--- a/intern/cycles/kernel/shaders/node_output_volume.osl
+++ b/intern/cycles/kernel/shaders/node_output_volume.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_particle_info.osl b/intern/cycles/kernel/shaders/node_particle_info.osl
index 077b0c114da..768b7753d02 100644
--- a/intern/cycles/kernel/shaders/node_particle_info.osl
+++ b/intern/cycles/kernel/shaders/node_particle_info.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_refraction_bsdf.osl b/intern/cycles/kernel/shaders/node_refraction_bsdf.osl
index 4a32415b482..d458ca730a4 100644
--- a/intern/cycles/kernel/shaders/node_refraction_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_refraction_bsdf.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_rgb_curves.osl b/intern/cycles/kernel/shaders/node_rgb_curves.osl
index 4e0f8721144..60cb273ba98 100644
--- a/intern/cycles/kernel/shaders/node_rgb_curves.osl
+++ b/intern/cycles/kernel/shaders/node_rgb_curves.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_rgb_ramp.osl b/intern/cycles/kernel/shaders/node_rgb_ramp.osl
index d3c2e9573d2..0202ba0bf79 100644
--- a/intern/cycles/kernel/shaders/node_rgb_ramp.osl
+++ b/intern/cycles/kernel/shaders/node_rgb_ramp.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_scatter_volume.osl b/intern/cycles/kernel/shaders/node_scatter_volume.osl
index 77c157bd92b..002e2750fca 100644
--- a/intern/cycles/kernel/shaders/node_scatter_volume.osl
+++ b/intern/cycles/kernel/shaders/node_scatter_volume.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_separate_hsv.osl b/intern/cycles/kernel/shaders/node_separate_hsv.osl
index 94fc5de9122..2a804040294 100644
--- a/intern/cycles/kernel/shaders/node_separate_hsv.osl
+++ b/intern/cycles/kernel/shaders/node_separate_hsv.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_separate_rgb.osl b/intern/cycles/kernel/shaders/node_separate_rgb.osl
index aebb63a0ee4..43d9e3aa4b1 100644
--- a/intern/cycles/kernel/shaders/node_separate_rgb.osl
+++ b/intern/cycles/kernel/shaders/node_separate_rgb.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_separate_xyz.osl b/intern/cycles/kernel/shaders/node_separate_xyz.osl
index 63725cb9995..e1963a1902f 100644
--- a/intern/cycles/kernel/shaders/node_separate_xyz.osl
+++ b/intern/cycles/kernel/shaders/node_separate_xyz.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_set_normal.osl b/intern/cycles/kernel/shaders/node_set_normal.osl
index 8eef152308a..7ca7ac9350c 100644
--- a/intern/cycles/kernel/shaders/node_set_normal.osl
+++ b/intern/cycles/kernel/shaders/node_set_normal.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_sky_texture.osl b/intern/cycles/kernel/shaders/node_sky_texture.osl
index 85c2dbdb2c2..05eed23bea8 100644
--- a/intern/cycles/kernel/shaders/node_sky_texture.osl
+++ b/intern/cycles/kernel/shaders/node_sky_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
index 1c0cd74c0be..dbbf657776c 100644
--- a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
+++ b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_tangent.osl b/intern/cycles/kernel/shaders/node_tangent.osl
index 41a2b2b0216..53a47396f9f 100644
--- a/intern/cycles/kernel/shaders/node_tangent.osl
+++ b/intern/cycles/kernel/shaders/node_tangent.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_texture.h b/intern/cycles/kernel/shaders/node_texture.h
index de51559f297..fc2cfdcd55c 100644
--- a/intern/cycles/kernel/shaders/node_texture.h
+++ b/intern/cycles/kernel/shaders/node_texture.h
@@ -11,35 +11,9 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
-/* Voronoi Distances */
-
-float voronoi_distance(string distance_metric, vector d, float e)
-{
-#if 0
-	if (distance_metric == "Distance Squared")
-#endif
-		return dot(d, d);
-#if 0
-	if (distance_metric == "Actual Distance")
-		return length(d);
-	if (distance_metric == "Manhattan")
-		return fabs(d[0]) + fabs(d[1]) + fabs(d[2]);
-	if (distance_metric == "Chebychev")
-		return max(fabs(d[0]), max(fabs(d[1]), fabs(d[2])));
-	if (distance_metric == "Minkovsky 1/2")
-		return sqrt(fabs(d[0])) + sqrt(fabs(d[1])) + sqrt(fabs(d[1]));
-	if (distance_metric == "Minkovsky 4")
-		return sqrt(sqrt(dot(d * d, d * d)));
-	if (distance_metric == "Minkovsky")
-		return pow(pow(fabs(d[0]), e) + pow(fabs(d[1]), e) + pow(fabs(d[2]), e), 1.0 / e);
-	
-	return 0.0;
-#endif
-}
-
 /* Voronoi / Worley like */
 
 color cellnoise_color(point p)
@@ -51,7 +25,7 @@ color cellnoise_color(point p)
 	return color(r, g, b);
 }
 
-void voronoi(point p, string distance_metric, float e, float da[4], point pa[4])
+void voronoi(point p, float e, float da[4], point pa[4])
 {
 	/* returns distances in da and point coords in pa */
 	int xx, yy, zz, xi, yi, zi;
@@ -71,7 +45,7 @@ void voronoi(point p, string distance_metric, float e, float da[4], point pa[4])
 				point ip = point(xx, yy, zz);
 				point vp = (point)cellnoise_color(ip);
 				point pd = p - (vp + ip);
-				float d = voronoi_distance(distance_metric, pd, e);
+				float d = dot(pd, pd);
 
 				vp += point(xx, yy, zz);
 
@@ -111,54 +85,14 @@ void voronoi(point p, string distance_metric, float e, float da[4], point pa[4])
 	}
 }
 
-float voronoi_Fn(point p, int n)
-{
-	float da[4];
-	point pa[4];
-
-	voronoi(p, "Distance Squared", 0, da, pa);
-
-	return da[n];
-}
-
-float voronoi_FnFn(point p, int n1, int n2)
-{
-	float da[4];
-	point pa[4];
-
-	voronoi(p, "Distance Squared", 0, da, pa);
-
-	return da[n2] - da[n1];
-}
-
-float voronoi_F1(point p) { return voronoi_Fn(p, 0); }
-float voronoi_F2(point p) { return voronoi_Fn(p, 1); }
-float voronoi_F3(point p) { return voronoi_Fn(p, 2); }
-float voronoi_F4(point p) { return voronoi_Fn(p, 3); }
-float voronoi_F1F2(point p) { return voronoi_FnFn(p, 0, 1); }
-
-float voronoi_Cr(point p)
-{
-	/* crackle type pattern, just a scale/clamp of F2-F1 */
-	float t = 10.0 * voronoi_F1F2(p);
-	return (t > 1.0) ? 1.0 : t;
-}
-
-float voronoi_F1S(point p) { return 2.0 * voronoi_F1(p) - 1.0; }
-float voronoi_F2S(point p) { return 2.0 * voronoi_F2(p) - 1.0; }
-float voronoi_F3S(point p) { return 2.0 * voronoi_F3(p) - 1.0; }
-float voronoi_F4S(point p) { return 2.0 * voronoi_F4(p) - 1.0; }
-float voronoi_F1F2S(point p) { return 2.0 * voronoi_F1F2(p) - 1.0; }
-float voronoi_CrS(point p) { return 2.0 * voronoi_Cr(p) - 1.0; }
-
 /* Noise Bases */
 
-float safe_noise(point p, int type)
+float safe_noise(point p, string type)
 {
 	float f = 0.0;
 	
 	/* Perlin noise in range -1..1 */
-	if (type == 0)
+	if (type == "signed")
 		f = noise("perlin", p);
 	
 	/* Perlin noise in range 0..1 */
@@ -172,39 +106,9 @@ float safe_noise(point p, int type)
 	return f;
 }
 
-float noise_basis(point p, string basis)
-{
-	if (basis == "Perlin")
-		return safe_noise(p, 1);
-	if (basis == "Voronoi F1")
-		return voronoi_F1S(p);
-	if (basis == "Voronoi F2")
-		return voronoi_F2S(p);
-	if (basis == "Voronoi F3")
-		return voronoi_F3S(p);
-	if (basis == "Voronoi F4")
-		return voronoi_F4S(p);
-	if (basis == "Voronoi F2-F1")
-		return voronoi_F1F2S(p);
-	if (basis == "Voronoi Crackle")
-		return voronoi_CrS(p);
-	if (basis == "Cell Noise")
-		return cellnoise(p);
-	
-	return 0.0;
-}
-
-/* Soft/Hard Noise */
-
-float noise_basis_hard(point p, string basis, int hard)
-{
-	float t = noise_basis(p, basis);
-	return (hard) ? fabs(2.0 * t - 1.0) : t;
-}
-
 /* Turbulence */
 
-float noise_turbulence(point p, string basis, float details, int hard)
+float noise_turbulence(point p, float details, int hard)
 {
 	float fscale = 1.0;
 	float amp = 1.0;
@@ -215,7 +119,7 @@ float noise_turbulence(point p, string basis, float details, int hard)
 	n = (int)octaves;
 
 	for (i = 0; i <= n; i++) {
-		float t = noise_basis(fscale * p, basis);
+		float t = safe_noise(fscale * p, "unsigned");
 
 		if (hard)
 			t = fabs(2.0 * t - 1.0);
@@ -228,7 +132,7 @@ float noise_turbulence(point p, string basis, float details, int hard)
 	float rmd = octaves - floor(octaves);
 
 	if (rmd != 0.0) {
-		float t = noise_basis(fscale * p, basis);
+		float t = safe_noise(fscale * p, "unsigned");
 
 		if (hard)
 			t = fabs(2.0 * t - 1.0);
diff --git a/intern/cycles/kernel/shaders/node_texture_coordinate.osl b/intern/cycles/kernel/shaders/node_texture_coordinate.osl
index 8fdf469df21..9e2109fa082 100644
--- a/intern/cycles/kernel/shaders/node_texture_coordinate.osl
+++ b/intern/cycles/kernel/shaders/node_texture_coordinate.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -21,7 +21,9 @@ shader node_texture_coordinate(
 	int is_background = 0,
 	int is_volume = 0,
 	int from_dupli = 0,
+	int use_transform = 0,
 	string bump_offset = "center",
+	matrix object_itfm = matrix(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
 
 	output point Generated = point(0.0, 0.0, 0.0),
 	output point UV = point(0.0, 0.0, 0.0),
@@ -60,7 +62,12 @@ shader node_texture_coordinate(
 			getattribute("geom:uv", UV);
 		}
 
-		Object = transform("object", P);
+		if (use_transform) {
+			Object = transform(object_itfm, P);
+		}
+		else {
+			Object = transform("object", P);
+		}
 		Camera = transform("camera", P);
 		Window = transform("NDC", P);
 		Normal = transform("world", "object", NormalIn);
diff --git a/intern/cycles/kernel/shaders/node_toon_bsdf.osl b/intern/cycles/kernel/shaders/node_toon_bsdf.osl
index 1f7e1b8e6e1..75c5d06f847 100644
--- a/intern/cycles/kernel/shaders/node_toon_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_toon_bsdf.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_translucent_bsdf.osl b/intern/cycles/kernel/shaders/node_translucent_bsdf.osl
index 8059f5788ec..94d23d35326 100644
--- a/intern/cycles/kernel/shaders/node_translucent_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_translucent_bsdf.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_transparent_bsdf.osl b/intern/cycles/kernel/shaders/node_transparent_bsdf.osl
index 552e4106b0c..5d6798f19a6 100644
--- a/intern/cycles/kernel/shaders/node_transparent_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_transparent_bsdf.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_uv_map.osl b/intern/cycles/kernel/shaders/node_uv_map.osl
index 01c984aff4c..77e2e8d12d7 100644
--- a/intern/cycles/kernel/shaders/node_uv_map.osl
+++ b/intern/cycles/kernel/shaders/node_uv_map.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_value.osl b/intern/cycles/kernel/shaders/node_value.osl
index aebfab35d2a..f75388d1f76 100644
--- a/intern/cycles/kernel/shaders/node_value.osl
+++ b/intern/cycles/kernel/shaders/node_value.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_vector_curves.osl b/intern/cycles/kernel/shaders/node_vector_curves.osl
index 137ebe112eb..7bbf97d95ea 100644
--- a/intern/cycles/kernel/shaders/node_vector_curves.osl
+++ b/intern/cycles/kernel/shaders/node_vector_curves.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_vector_math.osl b/intern/cycles/kernel/shaders/node_vector_math.osl
index 0c8857deae2..f83412dc0f7 100644
--- a/intern/cycles/kernel/shaders/node_vector_math.osl
+++ b/intern/cycles/kernel/shaders/node_vector_math.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_vector_transform.osl b/intern/cycles/kernel/shaders/node_vector_transform.osl
index 6fb0ab1d8cc..8ebaa31ab25 100644
--- a/intern/cycles/kernel/shaders/node_vector_transform.osl
+++ b/intern/cycles/kernel/shaders/node_vector_transform.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_velvet_bsdf.osl b/intern/cycles/kernel/shaders/node_velvet_bsdf.osl
index 37b26babc64..456c26998c8 100644
--- a/intern/cycles/kernel/shaders/node_velvet_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_velvet_bsdf.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_voronoi_texture.osl b/intern/cycles/kernel/shaders/node_voronoi_texture.osl
index 7a1e0016690..29e143ae207 100644
--- a/intern/cycles/kernel/shaders/node_voronoi_texture.osl
+++ b/intern/cycles/kernel/shaders/node_voronoi_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -37,7 +37,7 @@ shader node_voronoi_texture(
 	float da[4];
 	point pa[4];
 
-	voronoi(p * Scale, "Distance Squared", 1.0, da, pa);
+	voronoi(p * Scale, 1.0, da, pa);
 
 	/* Colored output */
 	if (Coloring == "Intensity") {
diff --git a/intern/cycles/kernel/shaders/node_voxel_texture.osl b/intern/cycles/kernel/shaders/node_voxel_texture.osl
new file mode 100644
index 00000000000..e45af62220f
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_voxel_texture.osl
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdosl.h"
+
+shader node_voxel_texture(
+	string filename = "",  /* texture name handed to texture3d() */
+	string interpolation = "linear",  /* forwarded as the "interp" option of texture3d() */
+	int use_mapping = 0,  /* non-zero: transform Vector by `mapping` instead of the object/generated transform */
+	matrix mapping = matrix(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
+	point Vector = P,
+	output float Density = 0,
+	output color Color = 0)
+{
+	point p = Vector;
+	if (use_mapping) {
+		p = transform(mapping, p);
+	}
+	else {  /* no explicit mapping: go to object space, then apply geom:generated_transform when the attribute exists */
+		p = transform("object", Vector);
+		matrix tfm;
+		if (getattribute("geom:generated_transform", tfm))
+			p = transform(tfm, p);
+	}
+	if(p[0] < 0.0 || p[1] < 0.0 || p[2] < 0.0 ||
+	   p[0] > 1.0 || p[1] > 1.0 || p[2] > 1.0)
+	{  /* lookup point lies outside the unit cube: output zero density and black */
+		Density = 0;
+		Color = color(0, 0, 0);
+	}
+	else {
+		Color = (color)texture3d(filename, p, "wrap", "periodic", "interp", interpolation, "alpha", Density);  /* the "alpha" result is stored in Density */
+	}
+}
diff --git a/intern/cycles/kernel/shaders/node_wave_texture.osl b/intern/cycles/kernel/shaders/node_wave_texture.osl
index ba40207b446..569f284cbac 100644
--- a/intern/cycles/kernel/shaders/node_wave_texture.osl
+++ b/intern/cycles/kernel/shaders/node_wave_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -31,7 +31,7 @@ float wave(point p, string type, float detail, float distortion, float dscale)
 	}
 
 	if (distortion != 0.0) {
-		n = n + (distortion * noise_turbulence(p * dscale, "Perlin", detail, 0));
+		n = n + (distortion * noise_turbulence(p * dscale, detail, 0));
 	}
 	return 0.5 + 0.5 * sin(n);
 }
diff --git a/intern/cycles/kernel/shaders/node_wavelength.osl b/intern/cycles/kernel/shaders/node_wavelength.osl
index 4333c1fd944..79e7043d4bf 100644
--- a/intern/cycles/kernel/shaders/node_wavelength.osl
+++ b/intern/cycles/kernel/shaders/node_wavelength.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_wireframe.osl b/intern/cycles/kernel/shaders/node_wireframe.osl
index db8925c9efc..5cc214495dd 100644
--- a/intern/cycles/kernel/shaders/node_wireframe.osl
+++ b/intern/cycles/kernel/shaders/node_wireframe.osl
@@ -11,17 +11,31 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
 #include "oslutil.h"
 
 shader node_wireframe(
+	string bump_offset = "center",  /* "dx" / "dy" add a finite-difference term to Fac; any other value leaves Fac as computed */
 	int use_pixel_size = 0,
 	float Size = 0.01,
 	output float Fac = 0.0)
 {
 	Fac = wireframe("triangles", Size, use_pixel_size);
+	/* TODO(sergey): Since we can't use autodiff here we do algebraic
+	 * calculation of derivatives by definition. We could probably
+	 * optimize this a bit by doing some extra calculation in wireframe().
+	 */
+	if (bump_offset == "dx") {
+		point dx = Dx(P);
+		P -= dx;  /* presumably wireframe() reads the global P, so the next call samples the shifted position -- TODO confirm */
+		Fac += (Fac - wireframe("triangles", Size, use_pixel_size)) / length(dx);
+	}
+	else if (bump_offset == "dy") {
+		point dy = Dy(P);
+		P -= dy;  /* same offset evaluation as the "dx" branch, along the other derivative */
+		Fac += (Fac - wireframe("triangles", Size, use_pixel_size)) / length(dy);
+	}
 }
-
diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h
index f8e5fd510ee..697a1756119 100644
--- a/intern/cycles/kernel/shaders/stdosl.h
+++ b/intern/cycles/kernel/shaders/stdosl.h
@@ -249,7 +249,21 @@ point rotate (point p, float angle, point a, point b)
 {
     vector axis = normalize (b - a);
     float cosang, sinang;
+    /* Older OSX has major issues with sincos() function,
+     * it's likely a bug in OSL or LLVM. Until we've
+     * updated to new versions of these libraries we'll
+     * use a workaround to prevent possible crashes on all
+     * the platforms.
+     *
+     * Shouldn't be that bad because it's mainly used for
+     * anisotropic shader where angle is usually constant.
+     */
+#if 0
     sincos (angle, sinang, cosang);
+#else
+    sinang = sin (angle);
+    cosang = cos (angle);
+#endif
     float cosang1 = 1.0 - cosang;
     float x = axis[0], y = axis[1], z = axis[2];
     matrix M = matrix (x * x + (1.0 - x * x) * cosang,
@@ -476,8 +490,6 @@ closure color diffuse_ramp(normal N, color colors[8]) BUILTIN;
 closure color phong_ramp(normal N, float exponent, color colors[8]) BUILTIN;
 closure color diffuse_toon(normal N, float size, float smooth) BUILTIN;
 closure color glossy_toon(normal N, float size, float smooth) BUILTIN;
-closure color westin_backscatter(normal N, float roughness) BUILTIN;
-closure color westin_sheen(normal N, float edginess) BUILTIN;
 closure color translucent(normal N) BUILTIN;
 closure color reflection(normal N) BUILTIN;
 closure color refraction(normal N, float eta) BUILTIN;
@@ -507,6 +519,47 @@ closure color hair_transmission(normal N, float roughnessu, float roughnessv, ve
 closure color henyey_greenstein(float g) BUILTIN;
 closure color absorption() BUILTIN;
 
+// OSL 1.5 Microfacet functions
+closure color microfacet(string distribution, normal N, vector U, float xalpha, float yalpha, float eta, int refract) {
+	/* GGX (also selected by the "default" distribution name) */
+	if (distribution == "ggx" || distribution == "default") {
+		if (!refract) {
+			if (xalpha == yalpha) {
+				/* Isotropic: equal alphas, U is not used */
+				return microfacet_ggx(N, xalpha);
+			}
+			else {
+				/* Anisotropic: uses U together with xalpha and yalpha */
+				return microfacet_ggx_aniso(N, U, xalpha, yalpha);
+			}
+		}
+		else {  /* Refraction: only xalpha and eta are used; U and yalpha are ignored */
+			return microfacet_ggx_refraction(N, xalpha, eta);
+		}
+	}
+	/* Beckmann (fallback for every other distribution name) */
+	else {
+		if (!refract) {
+			if (xalpha == yalpha) {
+				/* Isotropic: equal alphas, U is not used */
+				return microfacet_beckmann(N, xalpha);
+			}
+			else {
+				/* Anisotropic: uses U together with xalpha and yalpha */
+				return microfacet_beckmann_aniso(N, U, xalpha, yalpha);
+			}
+		}
+		else {  /* Refraction: only xalpha and eta are used; U and yalpha are ignored */
+			return microfacet_beckmann_refraction(N, xalpha, eta);
+		}
+	}
+}
+
+closure color microfacet (string distribution, normal N, float alpha, float eta, int refract) {
+	return microfacet(distribution, N, vector(0), alpha, alpha, eta, refract);  /* single-alpha overload: passes alpha for both xalpha and yalpha with a zero U */
+}
+
+
 // Renderer state
 int backfacing () BUILTIN;
 int raytype (string typename) BUILTIN;
diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h
new file mode 100644
index 00000000000..0132ef9c2f2
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_background_buffer_update.h
@@ -0,0 +1,254 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_background_buffer_update kernel.
+ * This is the fourth kernel in the ray tracing logic, and the third
+ * of the path iteration kernels. This kernel takes care of rays that hit
+ * the background (sceneintersect kernel), and for the rays of
+ * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in
+ * the output buffer. This kernel also takes care of rays that have been determined
+ * to-be-regenerated.
+ *
+ * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel
+ *
+ * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
+ * will be eventually set to RAY_TO_REGENERATE state in this kernel. Finally all rays of ray_state
+ * RAY_TO_REGENERATE will be regenerated and put in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_background_buffer_update --|--- PathRadiance_coop
+ * throughput_coop --------------------------------------|                                      |--- L_transparent_coop
+ * per_sample_output_buffers ----------------------------|                                      |--- per_sample_output_buffers
+ * Ray_coop ---------------------------------------------|                                      |--- ray_state
+ * PathState_coop ---------------------------------------|                                      |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * L_transparent_coop -----------------------------------|                                      |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * ray_state --------------------------------------------|                                      |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----|                                      |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                                      |--- work_array
+ * parallel_samples -------------------------------------|                                      |--- PathState_coop
+ * end_sample -------------------------------------------|                                      |--- throughput_coop
+ * kg (globals + data) ----------------------------------|                                      |--- rng_coop
+ * rng_state --------------------------------------------|                                      |--- Ray
+ * PathRadiance_coop ------------------------------------|                                      |
+ * sw ---------------------------------------------------|                                      |
+ * sh ---------------------------------------------------|                                      |
+ * sx ---------------------------------------------------|                                      |
+ * sy ---------------------------------------------------|                                      |
+ * stride -----------------------------------------------|                                      |
+ * work_array -------------------------------------------|                                      |--- work_array
+ * queuesize --------------------------------------------|                                      |
+ * start_sample -----------------------------------------|                                      |--- work_pool_wgs
+ * work_pool_wgs ----------------------------------------|                                      |
+ * num_samples ------------------------------------------|                                      |
+ *
+ * Note on shader_data: the shader_data argument is neither an input nor an output of this kernel; it is filled and consumed within the kernel itself.
+ * Note on Queues :
+ * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+ *
+ * State of queues when this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty
+ */
+ccl_device char kernel_background_buffer_update(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_state,
+        ccl_global uint *rng_coop,             /* Required for buffer Update */
+        ccl_global float3 *throughput_coop,    /* Required for background hit processing */
+        PathRadiance *PathRadiance_coop,       /* Required for background hit processing and buffer Update */
+        ccl_global Ray *Ray_coop,              /* Required for background hit processing */
+        ccl_global PathState *PathState_coop,  /* Required for background hit processing */
+        ccl_global float *L_transparent_coop,  /* Required for background hit processing and buffer Update */
+        ccl_global char *ray_state,            /* Stores information on the current state of a ray */
+        int sw, int sh, int sx, int sy, int stride,
+        int rng_state_offset_x,
+        int rng_state_offset_y,
+        int rng_state_stride,
+        ccl_global unsigned int *work_array,   /* Denotes work of each ray */
+        int end_sample,
+        int start_sample,
+#ifdef __WORK_STEALING__
+        ccl_global unsigned int *work_pool_wgs,
+        unsigned int num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples,                  /* Number of samples to be processed in parallel */
+        int ray_index)
+{
+	char enqueue_flag = 0;  /* Returned to the caller; set to 1 only when this ray ends up RAY_REGENERATED */
+
+	/* Load kernel globals structure and ShaderData structure */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+	ShaderData *sd = (ShaderData *)shader_data;
+
+#ifdef __KERNEL_DEBUG__
+	DebugData *debug_data = &debugdata_coop[ray_index];
+#endif
+	ccl_global PathState *state = &PathState_coop[ray_index];
+	PathRadiance *L = &PathRadiance_coop[ray_index];
+	ccl_global Ray *ray = &Ray_coop[ray_index];
+	ccl_global float3 *throughput = &throughput_coop[ray_index];
+	ccl_global float *L_transparent = &L_transparent_coop[ray_index];
+	ccl_global uint *rng = &rng_coop[ray_index];
+
+#ifdef __WORK_STEALING__
+	unsigned int my_work;
+	ccl_global float *initial_per_sample_output_buffers;
+	ccl_global uint *initial_rng;
+#endif
+	unsigned int sample;
+	unsigned int tile_x;
+	unsigned int tile_y;
+	unsigned int pixel_x;
+	unsigned int pixel_y;
+	unsigned int my_sample_tile;
+
+#ifdef __WORK_STEALING__
+	my_work = work_array[ray_index];
+	sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+	get_pixel_tile_position(&pixel_x, &pixel_y,
+	                        &tile_x, &tile_y,
+	                        my_work,
+	                        sw, sh, sx, sy,
+	                        parallel_samples,
+	                        ray_index);
+	my_sample_tile = 0;
+	initial_per_sample_output_buffers = per_sample_output_buffers;
+	initial_rng = rng_state;
+#else  /* __WORK_STEALING__ */
+	sample = work_array[ray_index];
+	int tile_index = ray_index / parallel_samples;
+	/* buffer and rng_state's stride is "stride". Find x and y using ray_index */
+	tile_x = tile_index % sw;
+	tile_y = tile_index / sw;
+	my_sample_tile = ray_index - (tile_index * parallel_samples);
+#endif  /* __WORK_STEALING__ */
+
+	rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
+	per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+
+	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+		/* eval background shader if nothing hit */
+		if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
+			*L_transparent = (*L_transparent) + average((*throughput));
+#ifdef __PASSES__
+			if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
+#endif
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+		}
+
+		if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+#ifdef __BACKGROUND__
+			/* sample background shader */
+			float3 L_background = indirect_background(kg, state, ray, sd);
+			path_radiance_accum_background(L, (*throughput), L_background, state->bounce);
+#endif
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+		}
+	}
+
+	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+		float3 L_sum = path_radiance_clamp_and_sum(kg, L);
+		kernel_write_light_passes(kg, per_sample_output_buffers, L, sample);
+#ifdef __KERNEL_DEBUG__
+		kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample);
+#endif
+		float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
+
+		/* accumulate result in output buffer */
+		kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+		path_rng_end(kg, rng_state, *rng);
+
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+	}
+
+	if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+#ifdef __WORK_STEALING__
+		/* We have completed current work; So get next work */
+		int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
+		if(!valid_work) {
+			/* If work is invalid, this means no more work is available and the thread may exit */
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+		}
+#else  /* __WORK_STEALING__ */
+		if((sample + parallel_samples) >= end_sample) {
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+		}
+#endif  /* __WORK_STEALING__ */
+
+		if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+#ifdef __WORK_STEALING__
+			work_array[ray_index] = my_work;
+			/* Get the sample associated with the current work */
+			sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+			/* Get pixel and tile position associated with current work */
+			get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+			my_sample_tile = 0;
+
+			/* Remap rng_state according to the current work */
+			rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride);
+			/* Remap per_sample_output_buffers according to the current work */
+			per_sample_output_buffers = initial_per_sample_output_buffers
+				+ (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+#else  /* __WORK_STEALING__ */
+			work_array[ray_index] = sample + parallel_samples;
+			sample = work_array[ray_index];
+
+			/* Get ray position from ray index */
+			pixel_x = sx + ((ray_index / parallel_samples) % sw);
+			pixel_y = sy + ((ray_index / parallel_samples) / sw);
+#endif  /* __WORK_STEALING__ */
+
+			/* Initialize random numbers and ray. */
+			kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray);
+
+			if(ray->t != 0.0f) {
+				/* Initialize throughput, L_transparent, Ray, PathState;
+				 * These rays proceed with path-iteration.
+				 */
+				*throughput = make_float3(1.0f, 1.0f, 1.0f);
+				*L_transparent = 0.0f;
+				path_radiance_init(L, kernel_data.film.use_light_pass);
+				path_state_init(kg, state, rng, sample, ray);
+#ifdef __KERNEL_DEBUG__
+				debug_data_init(debug_data);
+#endif
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+				enqueue_flag = 1;
+			} else {
+				/* These rays do not participate in path-iteration. */
+				float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				/* Accumulate result in output buffer. */
+				kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+				path_rng_end(kg, rng_state, *rng);
+
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+			}
+		}
+	}
+	return enqueue_flag;
+}
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
new file mode 100644
index 00000000000..4dab79a5c67
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -0,0 +1,415 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_data_initialization kernel
+ * This kernel Initializes structures needed in path-iteration kernels.
+ * This is the first kernel in ray-tracing logic.
+ *
+ * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
+ *
+ * Its input and output are as follows,
+ *
+ * Un-initialized rng---------------|--- kernel_data_initialization ---|--- Initialized rng
+ * Un-initialized throughput -------|                                  |--- Initialized throughput
+ * Un-initialized L_transparent ----|                                  |--- Initialized L_transparent
+ * Un-initialized PathRadiance -----|                                  |--- Initialized PathRadiance
+ * Un-initialized Ray --------------|                                  |--- Initialized Ray
+ * Un-initialized PathState --------|                                  |--- Initialized PathState
+ * Un-initialized QueueData --------|                                  |--- Initialized QueueData (to QUEUE_EMPTY_SLOT)
+ * Un-initialized QueueIndex -------|                                  |--- Initialized QueueIndex (to 0)
+ * Un-initialized use_queues_flag---|                                  |--- Initialized use_queues_flag (to false)
+ * Un-initialized ray_state --------|                                  |--- Initialized ray_state
+ * parallel_samples --------------- |                                  |--- Initialized per_sample_output_buffers
+ * rng_state -----------------------|                                  |--- Initialized work_array
+ * data ----------------------------|                                  |--- Initialized work_pool_wgs
+ * start_sample --------------------|                                  |
+ * sx ------------------------------|                                  |
+ * sy ------------------------------|                                  |
+ * sw ------------------------------|                                  |
+ * sh ------------------------------|                                  |
+ * stride --------------------------|                                  |
+ * queuesize -----------------------|                                  |
+ * num_samples ---------------------|                                  |
+ *
+ * Note on Queues :
+ * All slots in queues are initialized to queue empty slot;
+ * The number of elements in the queues is initialized to 0;
+ */
+ccl_device void kernel_data_init(
+        ccl_global char *globals,
+        ccl_global char *shader_data_sd,                  /* Arguments related to ShaderData */
+        ccl_global char *shader_data_sd_DL_shadow,        /* Arguments related to ShaderData */
+
+        ccl_global float3 *P_sd,
+        ccl_global float3 *P_sd_DL_shadow,
+
+        ccl_global float3 *N_sd,
+        ccl_global float3 *N_sd_DL_shadow,
+
+        ccl_global float3 *Ng_sd,
+        ccl_global float3 *Ng_sd_DL_shadow,
+
+        ccl_global float3 *I_sd,
+        ccl_global float3 *I_sd_DL_shadow,
+
+        ccl_global int *shader_sd,
+        ccl_global int *shader_sd_DL_shadow,
+
+        ccl_global int *flag_sd,
+        ccl_global int *flag_sd_DL_shadow,
+
+        ccl_global int *prim_sd,
+        ccl_global int *prim_sd_DL_shadow,
+
+        ccl_global int *type_sd,
+        ccl_global int *type_sd_DL_shadow,
+
+        ccl_global float *u_sd,
+        ccl_global float *u_sd_DL_shadow,
+
+        ccl_global float *v_sd,
+        ccl_global float *v_sd_DL_shadow,
+
+        ccl_global int *object_sd,
+        ccl_global int *object_sd_DL_shadow,
+
+        ccl_global float *time_sd,
+        ccl_global float *time_sd_DL_shadow,
+
+        ccl_global float *ray_length_sd,
+        ccl_global float *ray_length_sd_DL_shadow,
+
+        ccl_global int *ray_depth_sd,
+        ccl_global int *ray_depth_sd_DL_shadow,
+
+        ccl_global int *transparent_depth_sd,
+        ccl_global int *transparent_depth_sd_DL_shadow,
+
+        /* Ray differentials. */
+        ccl_global differential3 *dP_sd,
+        ccl_global differential3 *dP_sd_DL_shadow,
+
+        ccl_global differential3 *dI_sd,
+        ccl_global differential3 *dI_sd_DL_shadow,
+
+        ccl_global differential *du_sd,
+        ccl_global differential *du_sd_DL_shadow,
+
+        ccl_global differential *dv_sd,
+        ccl_global differential *dv_sd_DL_shadow,
+
+        /* Dp/Du */
+        ccl_global float3 *dPdu_sd,
+        ccl_global float3 *dPdu_sd_DL_shadow,
+
+        ccl_global float3 *dPdv_sd,
+        ccl_global float3 *dPdv_sd_DL_shadow,
+
+        /* Object motion. */
+        ccl_global Transform *ob_tfm_sd,
+        ccl_global Transform *ob_tfm_sd_DL_shadow,
+
+        ccl_global Transform *ob_itfm_sd,
+        ccl_global Transform *ob_itfm_sd_DL_shadow,
+
+        ShaderClosure *closure_sd,
+        ShaderClosure *closure_sd_DL_shadow,
+
+        ccl_global int *num_closure_sd,
+        ccl_global int *num_closure_sd_DL_shadow,
+
+        ccl_global float *randb_closure_sd,
+        ccl_global float *randb_closure_sd_DL_shadow,
+
+        ccl_global float3 *ray_P_sd,
+        ccl_global float3 *ray_P_sd_DL_shadow,
+
+        ccl_global differential3 *ray_dP_sd,
+        ccl_global differential3 *ray_dP_sd_DL_shadow,
+
+        ccl_constant KernelData *data,
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_state,
+        ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
+        ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
+        ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
+        PathRadiance *PathRadiance_coop,             /* PathRadiance array to store PathRadiance values for all rays */
+        ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
+        ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
+        ccl_global char *ray_state,                  /* Stores information on current state of a ray */
+
+#define KERNEL_TEX(type, ttype, name)                                   \
+        ccl_global type *name,
+#include "../kernel_textures.h"
+
+        int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
+        int rng_state_offset_x,
+        int rng_state_offset_y,
+        int rng_state_stride,
+        ccl_global int *Queue_data,                  /* Memory for queues */
+        ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
+        int queuesize,                               /* size (capacity) of the queue */
+        ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
+        ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
+#ifdef __WORK_STEALING__
+        ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
+        unsigned int num_samples,                    /* Total number of samples per pixel */
+#endif
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples)                        /* Number of samples to be processed in parallel */
+{
+	/* Binds the KernelGlobals/ShaderData pointers, resets the queues and sets up the initial per-ray state. */
+	/* Load kernel globals structure */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+
+	kg->data = data;
+#define KERNEL_TEX(type, ttype, name) \
+	kg->name = name;
+#include "../kernel_textures.h"
+
+	/* Load ShaderData structure */
+	ShaderData *sd = (ShaderData *)shader_data_sd;
+	ShaderData *sd_DL_shadow = (ShaderData *)shader_data_sd_DL_shadow;
+
+	sd->P = P_sd;
+	sd_DL_shadow->P = P_sd_DL_shadow;
+
+	sd->N = N_sd;
+	sd_DL_shadow->N = N_sd_DL_shadow;
+
+	sd->Ng = Ng_sd;
+	sd_DL_shadow->Ng = Ng_sd_DL_shadow;
+
+	sd->I = I_sd;
+	sd_DL_shadow->I = I_sd_DL_shadow;
+
+	sd->shader = shader_sd;
+	sd_DL_shadow->shader = shader_sd_DL_shadow;
+
+	sd->flag = flag_sd;
+	sd_DL_shadow->flag = flag_sd_DL_shadow;
+
+	sd->prim = prim_sd;
+	sd_DL_shadow->prim = prim_sd_DL_shadow;
+
+	sd->type = type_sd;
+	sd_DL_shadow->type = type_sd_DL_shadow;
+
+	sd->u = u_sd;
+	sd_DL_shadow->u = u_sd_DL_shadow;
+
+	sd->v = v_sd;
+	sd_DL_shadow->v = v_sd_DL_shadow;
+
+	sd->object = object_sd;
+	sd_DL_shadow->object = object_sd_DL_shadow;
+
+	sd->time = time_sd;
+	sd_DL_shadow->time = time_sd_DL_shadow;
+
+	sd->ray_length = ray_length_sd;
+	sd_DL_shadow->ray_length = ray_length_sd_DL_shadow;
+
+	sd->ray_depth = ray_depth_sd;
+	sd_DL_shadow->ray_depth = ray_depth_sd_DL_shadow;
+
+	sd->transparent_depth = transparent_depth_sd;
+	sd_DL_shadow->transparent_depth = transparent_depth_sd_DL_shadow;
+
+#ifdef __RAY_DIFFERENTIALS__
+	sd->dP = dP_sd;
+	sd_DL_shadow->dP = dP_sd_DL_shadow;
+
+	sd->dI = dI_sd;
+	sd_DL_shadow->dI = dI_sd_DL_shadow;
+
+	sd->du = du_sd;
+	sd_DL_shadow->du = du_sd_DL_shadow;
+
+	sd->dv = dv_sd;
+	sd_DL_shadow->dv = dv_sd_DL_shadow;
+#ifdef __DPDU__
+	sd->dPdu = dPdu_sd;
+	sd_DL_shadow->dPdu = dPdu_sd_DL_shadow;
+
+	sd->dPdv = dPdv_sd;
+	sd_DL_shadow->dPdv = dPdv_sd_DL_shadow;
+#endif
+#endif
+
+#ifdef __OBJECT_MOTION__
+	sd->ob_tfm = ob_tfm_sd;
+	sd_DL_shadow->ob_tfm = ob_tfm_sd_DL_shadow;
+
+	sd->ob_itfm = ob_itfm_sd;
+	sd_DL_shadow->ob_itfm = ob_itfm_sd_DL_shadow;
+#endif
+
+	sd->closure = closure_sd;
+	sd_DL_shadow->closure = closure_sd_DL_shadow;
+
+	sd->num_closure = num_closure_sd;
+	sd_DL_shadow->num_closure = num_closure_sd_DL_shadow;
+
+	sd->randb_closure = randb_closure_sd;
+	sd_DL_shadow->randb_closure = randb_closure_sd_DL_shadow;
+
+	sd->ray_P = ray_P_sd;
+	sd_DL_shadow->ray_P = ray_P_sd_DL_shadow;
+
+	sd->ray_dP = ray_dP_sd;
+	sd_DL_shadow->ray_dP = ray_dP_sd_DL_shadow;
+
+	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+
+#ifdef __WORK_STEALING__
+	int lid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+	/* The first work-item of each group zeroes that group's work pool counter (global memory). */
+	if(lid == 0) {
+		int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0);
+		work_pool_wgs[group_index] = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);  /* Global fence needed: work_pool_wgs is global memory used by get_next_work() below. */
+#endif  /* __WORK_STEALING__ */
+
+	/* Initialize queue data and queue index. */
+	if(thread_index < queuesize) {
+		/* Initialize active ray queue. */
+		Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		/* Initialize background and buffer update queue. */
+		Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		/* Initialize shadow ray cast of AO queue. */
+		Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		/* Initialize shadow ray cast of direct lighting queue. */
+		Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+	}
+
+	if(thread_index == 0) {
+		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+		/* The scene-intersect kernel should not use the queues the very first
+		 * time, since the queues have just been reset to empty here.
+		 */
+		use_queues_flag[0] = 0;
+	}
+
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+
+	if(x < (sw * parallel_samples) && y < sh) {
+		int ray_index = x + y * (sw * parallel_samples);
+
+		/* This is the first assignment to ray_state;
+		 * so we don't use the ASSIGN_RAY_STATE macro.
+		 */
+		ray_state[ray_index] = RAY_ACTIVE;
+
+		unsigned int my_sample;
+		unsigned int pixel_x;
+		unsigned int pixel_y;
+		unsigned int tile_x;
+		unsigned int tile_y;
+		unsigned int my_sample_tile;
+
+#ifdef __WORK_STEALING__
+		unsigned int my_work = 0;
+		/* Get work. */
+		get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
+		/* Get the sample associated with the work. */
+		my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+
+		my_sample_tile = 0;
+
+		/* Get pixel and tile position associated with the work. */
+		get_pixel_tile_position(&pixel_x, &pixel_y,
+		                        &tile_x, &tile_y,
+		                        my_work,
+		                        sw, sh, sx, sy,
+		                        parallel_samples,
+		                        ray_index);
+		work_array[ray_index] = my_work;
+#else  /* __WORK_STEALING__ */
+		unsigned int tile_index = ray_index / parallel_samples;
+		tile_x = tile_index % sw;
+		tile_y = tile_index / sw;
+		my_sample_tile = ray_index - (tile_index * parallel_samples);
+		my_sample = my_sample_tile + start_sample;
+
+		/* Initialize work array. */
+		work_array[ray_index] = my_sample;
+
+		/* Calculate pixel position of this ray. */
+		pixel_x = sx + tile_x;
+		pixel_y = sy + tile_y;
+#endif  /* __WORK_STEALING__ */
+
+		rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
+
+		/* Zero the pass_stride floats of per_sample_output_buffers that belong to this ray. */
+		per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride;
+		int per_sample_output_buffers_iterator = 0;
+		for(per_sample_output_buffers_iterator = 0;
+		    per_sample_output_buffers_iterator < kernel_data.film.pass_stride;
+		    per_sample_output_buffers_iterator++)
+		{
+			per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f;
+		}
+
+		/* Initialize random numbers and ray. */
+		kernel_path_trace_setup(kg,
+		                        rng_state,
+		                        my_sample,
+		                        pixel_x, pixel_y,
+		                        &rng_coop[ray_index],
+		                        &Ray_coop[ray_index]);
+
+		if(Ray_coop[ray_index].t != 0.0f) {
+			/* Initialize throughput, L_transparent, Ray, PathState;
+			 * These rays proceed with path-iteration.
+			 */
+			throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
+			L_transparent_coop[ray_index] = 0.0f;
+			path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass);
+			path_state_init(kg,
+			                &PathState_coop[ray_index],
+			                &rng_coop[ray_index],
+			                my_sample,
+			                &Ray_coop[ray_index]);
+#ifdef __KERNEL_DEBUG__
+			debug_data_init(&debugdata_coop[ray_index]);
+#endif
+		} else {
+			/* These rays do not participate in path-iteration. */
+			float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			/* Accumulate result in output buffer. */
+			kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad);
+			path_rng_end(kg, rng_state, rng_coop[ray_index]);
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+		}
+	}
+
+	/* Mark rest of the ray-state indices as RAY_INACTIVE. */
+	if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) {
+		/* First assignment, hence we don't use the ASSIGN_RAY_STATE macro. */
+		ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE;
+	}
+}
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
new file mode 100644
index 00000000000..50c83d06140
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_direct_lighting kernel.
+ * This is the eighth kernel in the ray tracing logic. This is the seventh
+ * of the path iteration kernels. This kernel takes care of direct lighting
+ * logic. However, the "shadow ray cast" part of direct lighting is handled
+ * in the next kernel.
+ *
+ * This kernel determines the rays for which a shadow_blocked() function associated with direct lighting should be executed.
+ * Those rays for which a shadow_blocked() function for direct-lighting must be executed, are marked with flag RAY_SHADOW_RAY_CAST_DL and
+ * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop -----------------------------------------|--- kernel_direct_lighting --|--- BSDFEval_coop
+ * PathState_coop -----------------------------------|                             |--- ISLamp_coop
+ * shader_data --------------------------------------|                             |--- LightRay_coop
+ * ray_state ----------------------------------------|                             |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                             |
+ * kg (globals + data) ------------------------------|                             |
+ * queuesize ----------------------------------------|                             |
+ *
+ * note on shader_DL : shader_DL is neither input nor output to this kernel; shader_DL is filled and consumed in this kernel itself.
+ * Note on Queues :
+ * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
+ * only the rays of state RAY_ACTIVE; If a ray needs to execute the corresponding shadow_blocked
+ * part, after direct lighting, the ray is marked with RAY_SHADOW_RAY_CAST_DL flag.
+ *
+ * State of queues when this kernel is called :
+ * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
+ * before and after this kernel call.
+ * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this
+ * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
+ */
+ccl_device char kernel_direct_lighting(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,           /* Required for direct lighting */
+        ccl_global char *shader_DL,             /* Required for direct lighting */
+        ccl_global uint *rng_coop,              /* Required for direct lighting */
+        ccl_global PathState *PathState_coop,   /* Required for direct lighting */
+        ccl_global int *ISLamp_coop,            /* Output: is_lamp result of direct_emission() for this ray */
+        ccl_global Ray *LightRay_coop,          /* Output: light ray filled by direct_emission() for this ray */
+        ccl_global BsdfEval *BSDFEval_coop,     /* Output: BsdfEval filled by direct_emission() for this ray */
+        ccl_global char *ray_state,             /* Denotes the state of each ray */
+        int ray_index)
+{
+	char enqueue_flag = 0;  /* Return value: 1 when this ray was flagged RAY_SHADOW_RAY_CAST_DL, else 0. */
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		/* Load kernel globals structure and ShaderData structure. */
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd = (ShaderData *)shader_data;
+		ShaderData *sd_DL  = (ShaderData *)shader_DL;
+
+		ccl_global PathState *state = &PathState_coop[ray_index];
+
+		/* Direct lighting: only when use_direct_light is set and the shader data has SD_BSDF_HAS_EVAL. */
+#ifdef __EMISSION__
+		if((kernel_data.integrator.use_direct_light &&
+		    (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)))
+		{
+			/* Sample illumination from lights to find path contribution. */
+			ccl_global RNG* rng = &rng_coop[ray_index];
+			float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+			float light_u, light_v;
+			path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+			LightSample ls;
+			light_sample(kg,
+			             light_t, light_u, light_v,
+			             ccl_fetch(sd, time),
+			             ccl_fetch(sd, P),
+			             state->bounce,
+			             &ls);
+
+			Ray light_ray;
+#ifdef __OBJECT_MOTION__
+			light_ray.time = ccl_fetch(sd, time);
+#endif
+
+			BsdfEval L_light;
+			bool is_lamp;
+			if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp,
+			                   state->bounce, state->transparent_bounce, sd_DL))
+			{
+				/* Write intermediate data to global memory to access from
+				 * the next kernel.
+				 */
+				LightRay_coop[ray_index] = light_ray;
+				BSDFEval_coop[ray_index] = L_light;
+				ISLamp_coop[ray_index] = is_lamp;
+				/* Mark ray state for next shadow kernel. */
+				ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
+				enqueue_flag = 1;
+			}
+		}
+#endif  /* __EMISSION__ */
+	}
+	return enqueue_flag;
+}
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
new file mode 100644
index 00000000000..a75523a3e53
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -0,0 +1,264 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel.
+ * This is the sixth kernel in the ray tracing logic. This is the fifth
+ * of the path iteration kernels. This kernel takes care of the logic to process
+ * "material of type holdout", indirect primitive emission, bsdf blurring,
+ * probabilistic path termination and AO.
+ *
+ * This kernel determines the rays for which a shadow_blocked() function associated with AO should be executed.
+ * Those rays for which a shadow_blocked() function for AO must be executed are marked with flag RAY_SHADOW_RAY_CAST_AO and
+ * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS
+ *
+ * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * throughput_coop --------------------------------------|                                                           |--- PathState_coop
+ * PathRadiance_coop ------------------------------------|                                                           |--- throughput_coop
+ * Intersection_coop ------------------------------------|                                                           |--- L_transparent_coop
+ * PathState_coop ---------------------------------------|                                                           |--- per_sample_output_buffers
+ * L_transparent_coop -----------------------------------|                                                           |--- PathRadiance_coop
+ * shader_data ------------------------------------------|                                                           |--- ShaderData
+ * ray_state --------------------------------------------|                                                           |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------|                                                           |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                                           |--- AOAlpha_coop
+ * kg (globals + data) ----------------------------------|                                                           |--- AOBSDF_coop
+ * parallel_samples -------------------------------------|                                                           |--- AOLightRay_coop
+ * per_sample_output_buffers ----------------------------|                                                           |
+ * sw ---------------------------------------------------|                                                           |
+ * sh ---------------------------------------------------|                                                           |
+ * sx ---------------------------------------------------|                                                           |
+ * sy ---------------------------------------------------|                                                           |
+ * stride -----------------------------------------------|                                                           |
+ * work_array -------------------------------------------|                                                           |
+ * queuesize --------------------------------------------|                                                           |
+ * start_sample -----------------------------------------|                                                           |
+ *
+ * Note on Queues :
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
+ * the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER
+ * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
+ * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
+ * changed to RAY_UPDATE_BUFFER, there is no problem.
+ *
+ * State of queues when this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO
+ */
+ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Required throughout the kernel except probabilistic path termination and AO */
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_coop,             /* Required for "kernel_write_data_passes" and AO */
+        ccl_global float3 *throughput_coop,    /* Required for handling holdout material and AO */
+        ccl_global float *L_transparent_coop,  /* Required for handling holdout material */
+        PathRadiance *PathRadiance_coop,       /* Required for "kernel_write_data_passes" and indirect primitive emission */
+        ccl_global PathState *PathState_coop,  /* Required throughout the kernel and AO */
+        Intersection *Intersection_coop,       /* Required for indirect primitive emission */
+        ccl_global float3 *AOAlpha_coop,       /* Required for AO */
+        ccl_global float3 *AOBSDF_coop,        /* Required for AO */
+        ccl_global Ray *AOLightRay_coop,       /* Required for AO */
+        int sw, int sh, int sx, int sy, int stride,
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        ccl_global unsigned int *work_array,   /* Denotes the work that each ray belongs to */
+#ifdef __WORK_STEALING__
+        unsigned int start_sample,
+#endif
+        int parallel_samples,                  /* Number of samples to be processed in parallel */
+        int ray_index,
+        char *enqueue_flag,                    /* Output: set to 1 when the ray is moved to RAY_UPDATE_BUFFER */
+        char *enqueue_flag_AO_SHADOW_RAY_CAST) /* Output: set to 1 when the ray is flagged RAY_SHADOW_RAY_CAST_AO */
+{
+	/* Load kernel globals structure and ShaderData structure */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+	ShaderData *sd = (ShaderData *)shader_data;
+
+#ifdef __WORK_STEALING__
+	unsigned int my_work;
+	unsigned int pixel_x;
+	unsigned int pixel_y;
+#endif
+	unsigned int tile_x;
+	unsigned int tile_y;
+	int my_sample_tile;
+	unsigned int sample;
+
+	ccl_global RNG *rng = 0x0;
+	ccl_global PathState *state = 0x0;
+	float3 throughput;  /* Only assigned (and only read) for rays that are RAY_ACTIVE on entry. */
+
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+
+		throughput = throughput_coop[ray_index];
+		state = &PathState_coop[ray_index];
+		rng = &rng_coop[ray_index];
+#ifdef __WORK_STEALING__
+		my_work = work_array[ray_index];
+		sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+		get_pixel_tile_position(&pixel_x, &pixel_y,
+		                        &tile_x, &tile_y,
+		                        my_work,
+		                        sw, sh, sx, sy,
+		                        parallel_samples,
+		                        ray_index);
+		my_sample_tile = 0;
+#else  /* __WORK_STEALING__ */
+		sample = work_array[ray_index];
+		/* Buffer's stride is "stride"; Find x and y using ray_index. */
+		int tile_index = ray_index / parallel_samples;
+		tile_x = tile_index % sw;
+		tile_y = tile_index / sw;
+		my_sample_tile = ray_index - (tile_index * parallel_samples);
+#endif  /* __WORK_STEALING__ */
+		per_sample_output_buffers +=
+		    (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) *
+		    kernel_data.film.pass_stride;
+
+		/* Holdout: applies when the shader data has a holdout flag and the path state is a camera ray. */
+#ifdef __HOLDOUT__
+		if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) &&
+		   (state->flag & PATH_RAY_CAMERA))
+		{
+			if(kernel_data.background.transparent) {
+				float3 holdout_weight;
+
+				if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK)
+					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
+				else
+					holdout_weight = shader_holdout_eval(kg, sd);
+
+				/* any throughput is ok, should all be identical here */
+				L_transparent_coop[ray_index] += average(holdout_weight*throughput);
+			}
+
+			if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) {
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+				*enqueue_flag = 1;
+			}
+		}
+#endif  /* __HOLDOUT__ */
+	}
+
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		PathRadiance *L = &PathRadiance_coop[ray_index];
+		/* Holdout mask objects do not write data passes. */
+		kernel_write_data_passes(kg,
+		                         per_sample_output_buffers,
+		                         L,
+		                         sd,
+		                         sample,
+		                         state,
+		                         throughput);
+		/* Blurring of bsdf after bounces, for rays that have a small likelihood
+		 * of following this particular path (diffuse, rough glossy).
+		 */
+		if(kernel_data.integrator.filter_glossy != FLT_MAX) {
+			float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
+			if(blur_pdf < 1.0f) {
+				float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
+				shader_bsdf_blur(kg, sd, blur_roughness);
+			}
+		}
+
+#ifdef __EMISSION__
+		/* Emission: accumulate indirect primitive emission when the shader data has SD_EMISSION. */
+		if(ccl_fetch(sd, flag) & SD_EMISSION) {
+			/* TODO(sergey): is isect.t wrong here for transparent surfaces? */
+			float3 emission = indirect_primitive_emission(
+			        kg,
+			        sd,
+			        Intersection_coop[ray_index].t,
+			        state->flag,
+			        state->ray_pdf);
+			path_radiance_accum_emission(L, throughput, emission, state->bounce);
+		}
+#endif  /* __EMISSION__ */
+
+		/* Path termination. this is a strange place to put the termination, it's
+		 * mainly due to the mixed in MIS that we use. gives too many unneeded
+		 * shader evaluations, only need emission if we are going to terminate.
+		 */
+		float probability = path_state_terminate_probability(kg, state, throughput);
+
+		if(probability == 0.0f) {
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+			*enqueue_flag = 1;
+		}
+
+		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+			if(probability != 1.0f) {
+				float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
+				if(terminate >= probability) {
+					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+					*enqueue_flag = 1;
+				} else {
+					throughput_coop[ray_index] = throughput/probability;
+				}
+			}
+		}
+	}
+
+#ifdef __AO__
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		/* Ambient occlusion: build the AO ray here; the ray is only flagged for a later shadow ray cast. */
+		if(kernel_data.integrator.use_ambient_occlusion ||
+		   (ccl_fetch(sd, flag) & SD_AO))
+		{
+			/* todo: solve correlation */
+			float bsdf_u, bsdf_v;
+			path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+
+			float ao_factor = kernel_data.background.ao_factor;
+			float3 ao_N;
+			AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
+			AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd);
+
+			float3 ao_D;
+			float ao_pdf;
+			sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
+
+			if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+				Ray _ray;
+				_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+				_ray.D = ao_D;
+				_ray.t = kernel_data.background.ao_distance;
+#ifdef __OBJECT_MOTION__
+				_ray.time = ccl_fetch(sd, time);
+#endif
+				_ray.dP = ccl_fetch(sd, dP);
+				_ray.dD = differential3_zero();
+				AOLightRay_coop[ray_index] = _ray;
+
+				ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+				*enqueue_flag_AO_SHADOW_RAY_CAST = 1;
+			}
+		}
+	}
+#endif  /* __AO__ */
+}
diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h
new file mode 100644
index 00000000000..a8e4b0a06c8
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_lamp_emission.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_lamp_emission
+ * This is the 3rd kernel in the ray-tracing logic. This is the second of the
+ * path-iteration kernels. This kernel takes care of the indirect lamp emission logic.
+ * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE
+ * and RAY_HIT_BACKGROUND.
+ * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel.
+ * The input/output of the kernel is as follows,
+ * Throughput_coop ------------------------------------|--- kernel_lamp_emission --|--- PathRadiance_coop
+ * Ray_coop -------------------------------------------|                           |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * PathState_coop -------------------------------------|                           |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * kg (globals + data) --------------------------------|                           |
+ * Intersection_coop ----------------------------------|                           |
+ * ray_state ------------------------------------------|                           |
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----|                           |
+ * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----|                           |
+ * queuesize ------------------------------------------|                           |
+ * use_queues_flag ------------------------------------|                           |
+ * sw -------------------------------------------------|                           |
+ * sh -------------------------------------------------|                           |
+ * parallel_samples -----------------------------------|                           |
+ *
+ * note : shader_data is neither input nor output. It is just filled and consumed within this same kernel, kernel_lamp_emission.
+ */
+ccl_device void kernel_lamp_emission(
+        ccl_global char *globals,              /* Cast to KernelGlobals in the body */
+        ccl_constant KernelData *data,         /* Not named in the body; presumably reached via kernel_data - TODO confirm */
+        ccl_global char *shader_data,          /* Cast to ShaderData and handed to indirect_lamp_emission */
+        ccl_global float3 *throughput_coop,    /* Per-ray throughput, only read here */
+        PathRadiance *PathRadiance_coop,       /* Per-ray radiance, lamp emission is accumulated into it */
+        ccl_global Ray *Ray_coop,              /* Per-ray current ray, only read here */
+        ccl_global PathState *PathState_coop,  /* Per-ray path state, copied into a local */
+        Intersection *Intersection_coop,       /* Per-ray intersection, only its t is read here */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        int sw, int sh,                        /* Not used in the body */
+        ccl_global char *use_queues_flag,      /* Used to decide if this kernel should use
+                                                * queues to fetch ray index
+                                                */
+        int parallel_samples,                  /* Number of samples to be processed in parallel */
+        int ray_index)                         /* Index of the ray handled by this call */
+{
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
+	   IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND))
+	{
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd = (ShaderData *)shader_data;
+		PathRadiance *L = &PathRadiance_coop[ray_index];  /* Updated in place */
+
+		float3 throughput = throughput_coop[ray_index];
+		Ray ray = Ray_coop[ray_index];
+		PathState state = PathState_coop[ray_index];  /* NOTE(review): local copy, the ray_t update below is never stored back - confirm intended */
+
+#ifdef __LAMP_MIS__
+		if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
+			/* ray starting from previous non-transparent bounce */
+			Ray light_ray;
+
+			light_ray.P = ray.P - state.ray_t*ray.D;  /* Move the origin back along D by the ray_t accumulated so far */
+			state.ray_t += Intersection_coop[ray_index].t;  /* Add this intersection's t */
+			light_ray.D = ray.D;
+			light_ray.t = state.ray_t;
+			light_ray.time = ray.time;
+			light_ray.dD = ray.dD;
+			light_ray.dP = ray.dP;
+			/* intersect with lamp */
+			float3 emission;
+
+			if(indirect_lamp_emission(kg, &state, &light_ray, &emission, sd)) {
+				path_radiance_accum_emission(L, throughput, emission, state.bounce);
+			}
+		}
+#endif  /* __LAMP_MIS__ */
+
+		/* __VOLUME__ feature is disabled: everything up to the matching #endif is compiled out. */
+#if 0  /* NOTE(review): the code below uses hit, isect, rng and continue/break, none of which exist in this function */
+#ifdef __VOLUME__
+		/* volume attenuation, emission, scatter */
+		if(state.volume_stack[0].shader != SHADER_NONE) {
+			Ray volume_ray = ray;
+			volume_ray.t = (hit)? isect.t: FLT_MAX;
+
+			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
+
+#ifdef __VOLUME_DECOUPLED__
+			int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
+			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method);
+
+			if(decoupled) {
+				/* cache steps along volume for repeated sampling */
+				VolumeSegment volume_segment;
+				ShaderData volume_sd;
+
+				shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
+				kernel_volume_decoupled_record(kg, &state,
+					&volume_ray, &volume_sd, &volume_segment, heterogeneous);
+
+				volume_segment.sampling_method = sampling_method;
+
+				/* emission */
+				if(volume_segment.closure_flag & SD_EMISSION)
+					path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+
+				/* scattering */
+				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
+
+				if(volume_segment.closure_flag & SD_SCATTER) {
+					bool all = false;
+
+					/* direct light sampling */
+					kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
+						throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
+
+					/* indirect sample. if we use distance sampling and take just
+					 * one sample for direct and indirect light, we could share
+					 * this computation, but makes code a bit complex */
+					float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
+					float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
+
+					result = kernel_volume_decoupled_scatter(kg,
+						&state, &volume_ray, &volume_sd, &throughput,
+						rphase, rscatter, &volume_segment, NULL, true);
+				}
+
+				if(result != VOLUME_PATH_SCATTERED)
+					throughput *= volume_segment.accum_transmittance;
+
+				/* free cached steps */
+				kernel_volume_decoupled_free(kg, &volume_segment);
+
+				if(result == VOLUME_PATH_SCATTERED) {
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+						continue;
+					else
+						break;
+				}
+			}
+			else
+#endif  /* __VOLUME_DECOUPLED__ */
+			{
+				/* integrate along volume segment with distance sampling */
+				ShaderData volume_sd;
+				VolumeIntegrateResult result = kernel_volume_integrate(
+					kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng, heterogeneous);
+
+#ifdef __VOLUME_SCATTER__
+				if(result == VOLUME_PATH_SCATTERED) {
+					/* direct lighting */
+					kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L);
+
+					/* indirect light bounce */
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+						continue;
+					else
+						break;
+				}
+#endif  /* __VOLUME_SCATTER__ */
+			}
+		}
+#endif  /* __VOLUME__ */
+#endif  /* 0 */
+	}
+}
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
new file mode 100644
index 00000000000..e1a1577d7ae
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_setup_next_iteration kernel.
+ * This is the tenth kernel in the ray tracing logic. This is the ninth
+ * of the path iteration kernels. This kernel takes care of setting up
+ * Ray for the next iteration of path-iteration and accumulating radiance
+ * corresponding to AO and direct-lighting
+ *
+ * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_next_iteration_setup -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * throughput_coop --------------------------------------|                                 |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * PathRadiance_coop ------------------------------------|                                 |--- throughput_coop
+ * PathState_coop ---------------------------------------|                                 |--- PathRadiance_coop
+ * shader_data ------------------------------------------|                                 |--- PathState_coop
+ * ray_state --------------------------------------------|                                 |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------|                                 |--- Ray_coop
+ * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                 |--- use_queues_flag
+ * Ray_coop ---------------------------------------------|                                 |
+ * kg (globals + data) ----------------------------------|                                 |
+ * LightRay_dl_coop -------------------------------------|
+ * ISLamp_coop ------------------------------------------|
+ * BSDFEval_coop ----------------------------------------|
+ * LightRay_ao_coop -------------------------------------|
+ * AOBSDF_coop ------------------------------------------|
+ * AOAlpha_coop -----------------------------------------|
+ *
+ * Note on queues,
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
+ * the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER
+ * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
+ * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
+ * changed to RAY_UPDATE_BUFFER, there is no problem.
+ *
+ * State of queues when this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays
+ */
+ccl_device char kernel_next_iteration_setup(  /* Returns 1 when this call moved the ray to RAY_UPDATE_BUFFER, else 0 */
+        ccl_global char *globals,             /* Cast to KernelGlobals in the body */
+        ccl_constant KernelData *data,        /* Not named in the body */
+        ccl_global char *shader_data,         /* Required for setting up ray for next iteration */
+        ccl_global uint *rng_coop,            /* Required for setting up ray for next iteration */
+        ccl_global float3 *throughput_coop,   /* Required for setting up ray for next iteration */
+        PathRadiance *PathRadiance_coop,      /* Required for setting up ray for next iteration */
+        ccl_global Ray *Ray_coop,             /* Required for setting up ray for next iteration */
+        ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
+        ccl_global Ray *LightRay_dl_coop,     /* Required for radiance update - direct lighting */
+        ccl_global int *ISLamp_coop,          /* Required for radiance update - direct lighting */
+        ccl_global BsdfEval *BSDFEval_coop,   /* Required for radiance update - direct lighting */
+        ccl_global Ray *LightRay_ao_coop,     /* Required for radiance update - AO */
+        ccl_global float3 *AOBSDF_coop,       /* Required for radiance update - AO */
+        ccl_global float3 *AOAlpha_coop,      /* Required for radiance update - AO */
+        ccl_global char *ray_state,           /* Denotes the state of each ray */
+        ccl_global char *use_queues_flag,     /* flag to decide if scene_intersect kernel should
+                                               * use queues to fetch ray index (not used in the body) */
+        int ray_index)
+{
+	char enqueue_flag = 0;  /* Set to 1 only where the ray is moved to RAY_UPDATE_BUFFER below */
+
+	/* Load kernel globals structure and ShaderData structure. */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+	ShaderData *sd = (ShaderData *)shader_data;
+	PathRadiance *L = 0x0;  /* Assigned inside each branch before it is used */
+	ccl_global PathState *state = 0x0;  /* Assigned inside each branch before it is used */
+
+	/* Accumulate the AO / direct-lighting results for rays that carry a shadow-ray-cast flag. */
+	if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
+	   IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
+	 {
+		state = &PathState_coop[ray_index];
+		L = &PathRadiance_coop[ray_index];
+		float3 _throughput = throughput_coop[ray_index];
+
+		if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
+			float3 shadow = LightRay_ao_coop[ray_index].P;  /* The AO ray's P field carries the shadow value */
+			char update_path_radiance = LightRay_ao_coop[ray_index].t;  /* The AO ray's t field carries the update flag */
+			if(update_path_radiance) {
+				path_radiance_accum_ao(L,
+				                       _throughput,
+				                       AOAlpha_coop[ray_index],
+				                       AOBSDF_coop[ray_index],
+				                       shadow,
+				                       state->bounce);
+			}
+			REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);  /* Result consumed, clear the flag */
+		}
+
+		if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
+			float3 shadow = LightRay_dl_coop[ray_index].P;  /* The light ray's P field carries the shadow value */
+			char update_path_radiance = LightRay_dl_coop[ray_index].t;  /* The light ray's t field carries the update flag */
+			if(update_path_radiance) {
+				BsdfEval L_light = BSDFEval_coop[ray_index];
+				path_radiance_accum_light(L,
+				                          _throughput,
+				                          &L_light,
+				                          shadow,
+				                          1.0f,
+				                          state->bounce,
+				                          ISLamp_coop[ray_index]);
+			}
+			REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);  /* Result consumed, clear the flag */
+		}
+	}
+
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		ccl_global float3 *throughput = &throughput_coop[ray_index];
+		ccl_global Ray *ray = &Ray_coop[ray_index];
+		ccl_global RNG* rng = &rng_coop[ray_index];
+		state = &PathState_coop[ray_index];
+		L = &PathRadiance_coop[ray_index];
+
+		/* Set up the next bounce; when that fails the ray is moved to RAY_UPDATE_BUFFER. */
+		if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) {
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+			enqueue_flag = 1;
+		}
+	}
+
+	return enqueue_flag;
+}
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
new file mode 100644
index 00000000000..7eb201ecf32
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_scene_intersect kernel.
+ * This is the second kernel in the ray tracing logic. This is the first
+ * of the path iteration kernels. This kernel takes care of scene_intersect function.
+ *
+ * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE.
+ * This kernel processes rays of ray state RAY_ACTIVE
+ * This kernel determines the rays that have hit the background and changes their ray state to RAY_HIT_BACKGROUND.
+ *
+ * The input and output are as follows,
+ *
+ * Ray_coop ---------------------------------------|--------- kernel_scene_intersect----------|--- PathState
+ * PathState_coop ---------------------------------|                                          |--- Intersection
+ * ray_state --------------------------------------|                                          |--- ray_state
+ * use_queues_flag --------------------------------|                                          |
+ * parallel_samples -------------------------------|                                          |
+ * QueueData(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                                          |
+ * kg (data + globals) ----------------------------|                                          |
+ * rng_coop ---------------------------------------|                                          |
+ * sw ---------------------------------------------|                                          |
+ * sh ---------------------------------------------|                                          |
+ * queuesize --------------------------------------|                                          |
+ *
+ * Note on Queues :
+ * Ideally we would want kernel_scene_intersect to work on queues.
+ * But during the very first time, the queues will be empty and hence we perform a direct mapping
+ * between ray-index and thread-index; From the next time onward, the queue will be filled and
+ * we may start operating on queues.
+ *
+ * State of queue during the first time this kernel is called :
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty before and after this kernel
+ *
+ * State of queues during other times this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will have a mix of RAY_ACTIVE, RAY_UPDATE_BUFFER and RAY_REGENERATED rays;
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays ;
+ * (The rays that are in the state RAY_UPDATE_BUFFER in both the queues are actually the same rays; These
+ * are the rays that were in RAY_ACTIVE state during the initial enqueue but on further processing
+ * , by different kernels, have turned into RAY_UPDATE_BUFFER rays. Since all kernel, even after fetching from
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS, proceed further based on ray state information, RAY_UPDATE_BUFFER rays
+ * being present in QUEUE_ACTIVE_AND_REGENERATED_RAYS does not cause any logical issues)
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS - All RAY_REGENERATED rays will have been converted to RAY_ACTIVE and
+ * Some rays in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue will move to state RAY_HIT_BACKGROUND
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS - no change
+ */
+
+ccl_device void kernel_scene_intersect(
+        ccl_global char *globals,              /* Opaque storage, used as KernelGlobals */
+        ccl_constant KernelData *data,
+        ccl_global uint *rng_coop,             /* Per-ray RNG, read for the hair LCG state */
+        ccl_global Ray *Ray_coop,              /* Per-ray ray to intersect with the scene */
+        ccl_global PathState *PathState_coop,  /* Per-ray path state, copied locally */
+        Intersection *Intersection_coop,       /* Per-ray intersection record, filled here */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        int sw, int sh,
+        ccl_global char *use_queues_flag,      /* used to decide if this kernel should use
+                                                * queues to fetch ray index */
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples,                  /* Number of samples to be processed in parallel */
+        int ray_index)
+{
+	/* Regenerated rays are promoted to active rays first. */
+	if(IS_STATE(ray_state, ray_index, RAY_REGENERATED))
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE);
+
+	/* Rays in any other state are not traced by this kernel. */
+	if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE))
+		return;
+
+	KernelGlobals *kg = (KernelGlobals *)globals;
+
+	/* Ray and path state are copied; the hit record is written in place. */
+	Ray trace_ray = Ray_coop[ray_index];
+	PathState pstate = PathState_coop[ray_index];
+	Intersection *hit_record = &Intersection_coop[ray_index];
+	uint vis_mask = path_state_ray_visibility(kg, &pstate);
+	bool has_hit;
+
+#ifdef __HAIR__
+	/* Curve parameters keep these defaults when the BVH holds no curves. */
+	float difl = 0.0f;
+	float extmax = 0.0f;
+	uint lcg_state = 0;
+	RNG rng = rng_coop[ray_index];
+
+	if(kernel_data.bvh.have_curves) {
+		if((pstate.flag & PATH_RAY_CAMERA) && (kernel_data.cam.resolution == 1)) {
+			float3 pixdiff = trace_ray.dD.dx + trace_ray.dD.dy;
+			difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
+		}
+
+		extmax = kernel_data.curve.maximum_width;
+		lcg_state = lcg_state_init(&rng, &pstate, 0x51633e2d);
+	}
+
+	has_hit = scene_intersect(kg, &trace_ray, vis_mask, hit_record, &lcg_state, difl, extmax);
+#else
+	has_hit = scene_intersect(kg, &trace_ray, vis_mask, hit_record, NULL, 0.0f, 0.0f);
+#endif
+
+#ifdef __KERNEL_DEBUG__
+	/* BVH traversal counters are only accumulated for camera rays. */
+	DebugData *debug_data = &debugdata_coop[ray_index];
+	if(pstate.flag & PATH_RAY_CAMERA) {
+		debug_data->num_bvh_traversal_steps += hit_record->num_traversal_steps;
+		debug_data->num_bvh_traversed_instances += hit_record->num_traversed_instances;
+	}
+	debug_data->num_ray_bounces++;
+#endif
+
+	/* Rays without a hit change state here; they undergo special
+	 * processing in the background_bufferUpdate kernel.
+	 */
+	if(has_hit)
+		return;
+
+	ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
+}
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
new file mode 100644
index 00000000000..e6fdc592586
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shader_eval.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_shader_eval kernel
+ * This kernel is the 5th kernel in the ray tracing logic. This is
+ * the 4th kernel in path iteration. This kernel sets up the ShaderData
+ * structure from the values computed by the previous kernels. It also identifies
+ * the rays of state RAY_TO_REGENERATE and enqueues them in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+ *
+ * The input and output of the kernel is as follows,
+ * rng_coop -------------------------------------------|--- kernel_shader_eval --|--- shader_data
+ * Ray_coop -------------------------------------------|                         |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * PathState_coop -------------------------------------|                         |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Intersection_coop ----------------------------------|                         |
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)------|                         |
+ * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)---|                         |
+ * ray_state ------------------------------------------|                         |
+ * kg (globals + data) --------------------------------|                         |
+ * queuesize ------------------------------------------|                         |
+ *
+ * Note on Queues :
+ * This kernel reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
+ * only the rays of state RAY_ACTIVE;
+ * State of queues when this kernel is called,
+ * at entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ * at exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays
+ */
+ccl_device void kernel_shader_eval(
+        ccl_global char *globals,              /* Opaque storage, used as KernelGlobals */
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* ShaderData storage this kernel fills in */
+        ccl_global uint *rng_coop,             /* Per-ray RNG, used to draw rbsdf */
+        ccl_global Ray *Ray_coop,              /* Per-ray ray the shader is set up from */
+        ccl_global PathState *PathState_coop,  /* Per-ray path state (bounce counts, flag) */
+        Intersection *Intersection_coop,       /* Per-ray hit the shader is set up from */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        int ray_index)
+{
+	/* Rays in any state other than RAY_ACTIVE are left untouched. */
+	if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		return;
+	}
+
+	KernelGlobals *kg = (KernelGlobals *)globals;
+	ShaderData *sd = (ShaderData *)shader_data;
+	ccl_global PathState *pstate = &PathState_coop[ray_index];
+	Ray shading_ray = Ray_coop[ray_index];
+
+	/* Fill sd from a private copy of the ray and its intersection record. */
+	shader_setup_from_ray(kg, sd, &Intersection_coop[ray_index], &shading_ray,
+	                      pstate->bounce, pstate->transparent_bounce);
+
+	/* Draw the PRNG_BSDF number and evaluate the surface shader with it. */
+	float rbsdf = path_state_rng_1D_for_decision(kg, &rng_coop[ray_index], pstate, PRNG_BSDF);
+	shader_eval_surface(kg, sd, rbsdf, pstate->flag, SHADER_CONTEXT_MAIN);
+}
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked.h b/intern/cycles/kernel/split/kernel_shadow_blocked.h
new file mode 100644
index 00000000000..28351c2b1ae
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_shadow_blocked kernel.
+ * This is the ninth kernel in the ray tracing logic. This is the eighth
+ * of the path iteration kernels. This kernel takes care of "shadow ray cast"
+ * logic of the direct lighting and AO  part of ray tracing.
+ *
+ * The input and output are as follows,
+ *
+ * PathState_coop ----------------------------------|--- kernel_shadow_blocked --|
+ * LightRay_dl_coop --------------------------------|                            |--- LightRay_dl_coop
+ * LightRay_ao_coop --------------------------------|                            |--- LightRay_ao_coop
+ * ray_state ---------------------------------------|                            |--- ray_state
+ * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS &       |                            |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS)
+ *            QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                            |
+ * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS &      |                            |
+ *             QUEUE_SHADOW_RAY_CAST_DL_RAYS) ------|                            |
+ * kg (globals + data) -----------------------------|                            |
+ * queuesize ---------------------------------------|                            |
+ *
+ * Note on shader_shadow : shader_shadow is neither input nor output to this kernel. shader_shadow is filled and consumed in this kernel itself.
+ * Note on queues :
+ * The kernel fetches from QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS queues. We will empty
+ * these queues in this kernel.
+ * State of queues when this kernel is called :
+ * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
+ * before and after this kernel call.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS will be filled with rays marked with flags RAY_SHADOW_RAY_CAST_AO
+ * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit.
+ */
+/* Run shadow_blocked() for the AO or direct-light shadow ray of ray_index
+ * and store the result in that shadow ray itself: P receives the shadow
+ * value, t receives 1 when shadow_blocked() returned false (path radiance
+ * must be updated) and 0 otherwise. Does nothing unless ray_index carries
+ * RAY_SHADOW_RAY_CAST_DL or RAY_SHADOW_RAY_CAST_AO.
+ */
+ccl_device void kernel_shadow_blocked(
+        ccl_global char *globals,              /* Cast to KernelGlobals in the body */
+        ccl_constant KernelData *data,
+        ccl_global char *shader_shadow,        /* Required for shadow blocked */
+        ccl_global PathState *PathState_coop,  /* Required for shadow blocked */
+        ccl_global Ray *LightRay_dl_coop,      /* Required for direct lighting's shadow blocked */
+        ccl_global Ray *LightRay_ao_coop,      /* Required for AO's shadow blocked */
+        Intersection *Intersection_coop_AO,    /* Intersection slots for AO shadow rays */
+        Intersection *Intersection_coop_DL,    /* Intersection slots for direct-light shadow rays */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        int total_num_rays,
+        char shadow_blocked_type,              /* RAY_SHADOW_RAY_CAST_AO selects AO, anything else DL */
+        int ray_index)
+{
+	/* Nothing to do unless a shadow ray was cast for this ray. */
+	if(!IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) &&
+	   !IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
+	{
+		return;
+	}
+
+	/* Load kernel global structure. */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+	ShaderData *sd_shadow = (ShaderData *)shader_shadow;
+	ccl_global PathState *state = &PathState_coop[ray_index];
+
+	/* Select the light ray and the intersection slot with the same test.
+	 * Testing the bare RAY_SHADOW_RAY_CAST_AO constant instead would not
+	 * depend on shadow_blocked_type at all.
+	 */
+	const bool is_ao = (shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO);
+	ccl_global Ray *light_ray_global = is_ao ? &LightRay_ao_coop[ray_index]
+	                                         : &LightRay_dl_coop[ray_index];
+	Intersection *isect_global = is_ao ? &Intersection_coop_AO[ray_index]
+	                                   : &Intersection_coop_DL[ray_index];
+
+	float3 shadow;
+	char update_path_radiance =
+	        !shadow_blocked(kg, state, light_ray_global, &shadow, sd_shadow, isect_global);
+
+	/* We use light_ray_global's P and t to store shadow and
+	 * update_path_radiance.
+	 */
+	light_ray_global->P = shadow;
+	light_ray_global->t = update_path_radiance;
+}
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
new file mode 100644
index 00000000000..e1c7e2cea99
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef  __KERNEL_SPLIT_H__
+#define  __KERNEL_SPLIT_H__
+
+#include "kernel_compat_opencl.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+
+#include "util_atomic.h"
+
+#include "kernel_random.h"
+#include "kernel_projection.h"
+#include "kernel_montecarlo.h"
+#include "kernel_differential.h"
+#include "kernel_camera.h"
+
+#include "geom/geom.h"
+
+#include "kernel_accumulate.h"
+#include "kernel_shader.h"
+#include "kernel_light.h"
+#include "kernel_passes.h"
+
+#ifdef __SUBSURFACE__
+#include "kernel_subsurface.h"
+#endif
+
+#ifdef __VOLUME__
+#include "kernel_volume.h"
+#endif
+
+#include "kernel_path_state.h"
+#include "kernel_shadow.h"
+#include "kernel_emission.h"
+#include "kernel_path_common.h"
+#include "kernel_path_surface.h"
+#include "kernel_path_volume.h"
+
+#ifdef __KERNEL_DEBUG__
+#include "kernel_debug.h"
+#endif
+
+#include "kernel_queues.h"
+#include "kernel_work_stealing.h"
+
+#endif  /* __KERNEL_SPLIT_H__ */
diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h
new file mode 100644
index 00000000000..a21e9b6a0b1
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_sum_all_radiance.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../kernel_compat_opencl.h"
+#include "../kernel_math.h"
+#include "../kernel_types.h"
+#include "../kernel_globals.h"
+
+/* Samples are processed in parallel, and each sample writes its radiance to its
+ * own slot of per_sample_output_buffer. This kernel accumulates all of those
+ * per-sample slots of a pixel into the RenderTile's output buffer.
+ */
+ccl_device void kernel_sum_all_radiance(
+        ccl_constant KernelData *data,               /* To get pass_stride to offset into buffer */
+        ccl_global float *buffer,                    /* Output buffer of RenderTile */
+        ccl_global float *per_sample_output_buffer,  /* Radiance contributed by all samples */
+        int parallel_samples, int sw, int sh, int stride,
+        int buffer_offset_x,
+        int buffer_offset_y,
+        int buffer_stride,
+        int start_sample)
+{
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+
+	/* Work items outside of the sw x sh area have nothing to accumulate. */
+	if(x >= sw || y >= sh) {
+		return;
+	}
+
+	const int pass_stride = data->film.pass_stride;
+
+	/* Locate this pixel in the tile buffer and its first per-sample slot. */
+	ccl_global float *pixel = buffer +
+	        ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * pass_stride;
+	ccl_global float *sample_data = per_sample_output_buffer +
+	        ((x + y * stride) * parallel_samples) * pass_stride;
+
+	for(int sample = 0; sample < parallel_samples; sample++, sample_data += pass_stride) {
+		/* The first sample of the first batch initializes the pixel instead of adding to it. */
+		const bool overwrite = (start_sample == 0 && sample == 0);
+		for(int i = 0; i < pass_stride; i++) {
+			pixel[i] = overwrite ? sample_data[i] : pixel[i] + sample_data[i];
+		}
+	}
+}
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index efbffacf375..84fc0fcf587 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __SVM_H__
@@ -102,7 +102,7 @@ ccl_device_inline int stack_load_int(float *stack, uint a)
 	return __float_as_int(stack[a]);
 }
 
-ccl_device_inline float stack_load_int_default(float *stack, uint a, uint value)
+ccl_device_inline int stack_load_int_default(float *stack, uint a, uint value)
 {
 	return (a == (uint)SVM_STACK_INVALID)? (int)value: stack_load_int(stack, a);
 }
@@ -157,6 +157,8 @@ CCL_NAMESPACE_END
 #include "svm_noise.h"
 #include "svm_texture.h"
 
+#include "svm_math_util.h"
+
 #include "svm_attribute.h"
 #include "svm_gradient.h"
 #include "svm_blackbody.h"
@@ -192,20 +194,24 @@ CCL_NAMESPACE_END
 #include "svm_checker.h"
 #include "svm_brick.h"
 #include "svm_vector_transform.h"
+#include "svm_voxel.h"
 
 CCL_NAMESPACE_BEGIN
 
-/* Main Interpreter Loop */
+#define NODES_GROUP(group) ((group) <= __NODES_MAX_GROUP__)
+#define NODES_FEATURE(feature) ((__NODES_FEATURES__ & (feature)) != 0)
 
+/* Main Interpreter Loop */
 ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ShaderType type, int path_flag)
 {
 	float stack[SVM_STACK_SIZE];
-	int offset = sd->shader & SHADER_MASK;
+	int offset = ccl_fetch(sd, shader) & SHADER_MASK;
 
 	while(1) {
 		uint4 node = read_node(kg, &offset);
 
 		switch(node.x) {
+#if NODES_GROUP(NODE_GROUP_LEVEL_0)
 			case NODE_SHADER_JUMP: {
 				if(type == SHADER_TYPE_SURFACE) offset = node.y;
 				else if(type == SHADER_TYPE_VOLUME) offset = node.z;
@@ -222,15 +228,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_CLOSURE_BACKGROUND:
 				svm_node_closure_background(sd, stack, node);
 				break;
-			case NODE_CLOSURE_HOLDOUT:
-				svm_node_closure_holdout(sd, stack, node);
-				break;
-			case NODE_CLOSURE_AMBIENT_OCCLUSION:
-				svm_node_closure_ambient_occlusion(sd, stack, node);
-				break;
-			case NODE_CLOSURE_VOLUME:
-				svm_node_closure_volume(kg, sd, stack, node, path_flag);
-				break;
 			case NODE_CLOSURE_SET_WEIGHT:
 				svm_node_closure_set_weight(sd, node.y, node.z, node.w);
 				break;
@@ -251,13 +248,137 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 				if(stack_load_float(stack, node.z) == 1.0f)
 					offset += node.y;
 				break;
-#ifdef __TEXTURES__
+			case NODE_GEOMETRY:
+				svm_node_geometry(kg, sd, stack, node.y, node.z);
+				break;
+			case NODE_CONVERT:
+				svm_node_convert(sd, stack, node.y, node.z, node.w);
+				break;
+			case NODE_TEX_COORD:
+				svm_node_tex_coord(kg, sd, path_flag, stack, node, &offset);
+				break;
+			case NODE_VALUE_F:
+				svm_node_value_f(kg, sd, stack, node.y, node.z);
+				break;
+			case NODE_VALUE_V:
+				svm_node_value_v(kg, sd, stack, node.y, &offset);
+				break;
+			case NODE_ATTR:
+				svm_node_attr(kg, sd, stack, node);
+				break;
+#  if NODES_FEATURE(NODE_FEATURE_BUMP)
+			case NODE_GEOMETRY_BUMP_DX:
+				svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z);
+				break;
+			case NODE_GEOMETRY_BUMP_DY:
+				svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z);
+				break;
+			case NODE_SET_DISPLACEMENT:
+				svm_node_set_displacement(sd, stack, node.y);
+				break;
+#  endif  /* NODES_FEATURE(NODE_FEATURE_BUMP) */
+#  ifdef __TEXTURES__
 			case NODE_TEX_IMAGE:
 				svm_node_tex_image(kg, sd, stack, node);
 				break;
 			case NODE_TEX_IMAGE_BOX:
 				svm_node_tex_image_box(kg, sd, stack, node);
 				break;
+			case NODE_TEX_NOISE:
+				svm_node_tex_noise(kg, sd, stack, node, &offset);
+				break;
+#  endif  /* __TEXTURES__ */
+#  ifdef __EXTRA_NODES__
+#    if NODES_FEATURE(NODE_FEATURE_BUMP)
+			case NODE_SET_BUMP:
+				svm_node_set_bump(kg, sd, stack, node);
+				break;
+			case NODE_ATTR_BUMP_DX:
+				svm_node_attr_bump_dx(kg, sd, stack, node);
+				break;
+			case NODE_ATTR_BUMP_DY:
+				svm_node_attr_bump_dy(kg, sd, stack, node);
+				break;
+			case NODE_TEX_COORD_BUMP_DX:
+				svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, &offset);
+				break;
+			case NODE_TEX_COORD_BUMP_DY:
+				svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, &offset);
+				break;
+			case NODE_CLOSURE_SET_NORMAL:
+				svm_node_set_normal(kg, sd, stack, node.y, node.z);
+				break;
+#    endif  /* NODES_FEATURE(NODE_FEATURE_BUMP) */
+			case NODE_HSV:
+				svm_node_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
+				break;
+#  endif  /* __EXTRA_NODES__ */
+#endif  /* NODES_GROUP(NODE_GROUP_LEVEL_0) */
+
+#if NODES_GROUP(NODE_GROUP_LEVEL_1)
+			case NODE_CLOSURE_HOLDOUT:
+				svm_node_closure_holdout(sd, stack, node);
+				break;
+			case NODE_CLOSURE_AMBIENT_OCCLUSION:
+				svm_node_closure_ambient_occlusion(sd, stack, node);
+				break;
+			case NODE_FRESNEL:
+				svm_node_fresnel(sd, stack, node.y, node.z, node.w);
+				break;
+			case NODE_LAYER_WEIGHT:
+				svm_node_layer_weight(sd, stack, node);
+				break;
+#  if NODES_FEATURE(NODE_FEATURE_VOLUME)
+			case NODE_CLOSURE_VOLUME:
+				svm_node_closure_volume(kg, sd, stack, node, path_flag);
+				break;
+#  endif  /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
+#  ifdef __EXTRA_NODES__
+			case NODE_MATH:
+				svm_node_math(kg, sd, stack, node.y, node.z, node.w, &offset);
+				break;
+			case NODE_VECTOR_MATH:
+				svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, &offset);
+				break;
+			case NODE_RGB_RAMP:
+				svm_node_rgb_ramp(kg, sd, stack, node, &offset);
+				break;
+			case NODE_GAMMA:
+				svm_node_gamma(sd, stack, node.y, node.z, node.w);
+				break;
+			case NODE_BRIGHTCONTRAST:
+				svm_node_brightness(sd, stack, node.y, node.z, node.w);
+				break;
+			case NODE_LIGHT_PATH:
+				svm_node_light_path(sd, stack, node.y, node.z, path_flag);
+				break;
+			case NODE_OBJECT_INFO:
+				svm_node_object_info(kg, sd, stack, node.y, node.z);
+				break;
+			case NODE_PARTICLE_INFO:
+				svm_node_particle_info(kg, sd, stack, node.y, node.z);
+				break;
+#    ifdef __HAIR__
+#      if NODES_FEATURE(NODE_FEATURE_HAIR)
+			case NODE_HAIR_INFO:
+				svm_node_hair_info(kg, sd, stack, node.y, node.z);
+				break;
+#      endif  /* NODES_FEATURE(NODE_FEATURE_HAIR) */
+#    endif  /* __HAIR__ */
+#  endif  /* __EXTRA_NODES__ */
+#endif  /* NODES_GROUP(NODE_GROUP_LEVEL_1) */
+
+#if NODES_GROUP(NODE_GROUP_LEVEL_2)
+			case NODE_MAPPING:
+				svm_node_mapping(kg, sd, stack, node.y, node.z, &offset);
+				break;
+			case NODE_MIN_MAX:
+				svm_node_min_max(kg, sd, stack, node.y, node.z, &offset);
+				break;
+			case NODE_CAMERA:
+				svm_node_camera(kg, sd, stack, node.y, node.z, node.w);
+				break;
+#  ifdef __TEXTURES__
 			case NODE_TEX_ENVIRONMENT:
 				svm_node_tex_environment(kg, sd, stack, node);
 				break;
@@ -267,9 +388,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_TEX_GRADIENT:
 				svm_node_tex_gradient(sd, stack, node);
 				break;
-			case NODE_TEX_NOISE:
-				svm_node_tex_noise(kg, sd, stack, node, &offset);
-				break;
 			case NODE_TEX_VORONOI:
 				svm_node_tex_voronoi(kg, sd, stack, node, &offset);
 				break;
@@ -288,55 +406,34 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_TEX_BRICK:
 				svm_node_tex_brick(kg, sd, stack, node, &offset);
 				break;
-#endif
-			case NODE_CAMERA:
-				svm_node_camera(kg, sd, stack, node.y, node.z, node.w);
-				break;
-			case NODE_GEOMETRY:
-				svm_node_geometry(kg, sd, stack, node.y, node.z);
-				break;
-#ifdef __EXTRA_NODES__
-			case NODE_GEOMETRY_BUMP_DX:
-				svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z);
-				break;
-			case NODE_GEOMETRY_BUMP_DY:
-				svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z);
-				break;
-			case NODE_LIGHT_PATH:
-				svm_node_light_path(sd, stack, node.y, node.z, path_flag);
-				break;
-			case NODE_OBJECT_INFO:
-				svm_node_object_info(kg, sd, stack, node.y, node.z);
-				break;
-			case NODE_PARTICLE_INFO:
-				svm_node_particle_info(kg, sd, stack, node.y, node.z);
+#  endif  /* __TEXTURES__ */
+#  ifdef __EXTRA_NODES__
+			case NODE_NORMAL:
+				svm_node_normal(kg, sd, stack, node.y, node.z, node.w, &offset);
 				break;
-#ifdef __HAIR__
-			case NODE_HAIR_INFO:
-				svm_node_hair_info(kg, sd, stack, node.y, node.z);
+			case NODE_LIGHT_FALLOFF:
+				svm_node_light_falloff(sd, stack, node);
 				break;
-#endif
+#  endif  /* __EXTRA_NODES__ */
+#endif  /* NODES_GROUP(NODE_GROUP_LEVEL_2) */
 
-#endif
-			case NODE_CONVERT:
-				svm_node_convert(sd, stack, node.y, node.z, node.w);
+#if NODES_GROUP(NODE_GROUP_LEVEL_3)
+			case NODE_RGB_CURVES:
+				svm_node_rgb_curves(kg, sd, stack, node, &offset);
 				break;
-			case NODE_VALUE_F:
-				svm_node_value_f(kg, sd, stack, node.y, node.z);
+			case NODE_VECTOR_CURVES:
+				svm_node_vector_curves(kg, sd, stack, node, &offset);
 				break;
-			case NODE_VALUE_V:
-				svm_node_value_v(kg, sd, stack, node.y, &offset);
+			case NODE_TANGENT:
+				svm_node_tangent(kg, sd, stack, node);
 				break;
-#ifdef __EXTRA_NODES__
+			case NODE_NORMAL_MAP:
+				svm_node_normal_map(kg, sd, stack, node);
+				break;
+#  ifdef __EXTRA_NODES__
 			case NODE_INVERT:
 				svm_node_invert(sd, stack, node.y, node.z, node.w);
 				break;
-			case NODE_GAMMA:
-				svm_node_gamma(sd, stack, node.y, node.z, node.w);
-				break;
-			case NODE_BRIGHTCONTRAST:
-				svm_node_brightness(sd, stack, node.y, node.z, node.w);
-				break;
 			case NODE_MIX:
 				svm_node_mix(kg, sd, stack, node.y, node.z, node.w, &offset);
 				break;
@@ -352,30 +449,11 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_COMBINE_HSV:
 				svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
 				break;
-			case NODE_HSV:
-				svm_node_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
-				break;
-#endif
-			case NODE_ATTR:
-				svm_node_attr(kg, sd, stack, node);
-				break;
-#ifdef __EXTRA_NODES__
-			case NODE_ATTR_BUMP_DX:
-				svm_node_attr_bump_dx(kg, sd, stack, node);
-				break;
-			case NODE_ATTR_BUMP_DY:
-				svm_node_attr_bump_dy(kg, sd, stack, node);
-				break;
-#endif
-			case NODE_FRESNEL:
-				svm_node_fresnel(sd, stack, node.y, node.z, node.w);
-				break;
-			case NODE_LAYER_WEIGHT:
-				svm_node_layer_weight(sd, stack, node);
+			case NODE_VECTOR_TRANSFORM:
+				svm_node_vector_transform(kg, sd, stack, node);
 				break;
-#ifdef __EXTRA_NODES__
 			case NODE_WIREFRAME:
-				svm_node_wireframe(kg, sd, stack, node.y, node.z, node.w);
+				svm_node_wireframe(kg, sd, stack, node);
 				break;
 			case NODE_WAVELENGTH:
 				svm_node_wavelength(sd, stack, node.y, node.z);
@@ -383,70 +461,25 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_BLACKBODY:
 				svm_node_blackbody(kg, sd, stack, node.y, node.z);
 				break;
-			case NODE_SET_DISPLACEMENT:
-				svm_node_set_displacement(sd, stack, node.y);
-				break;
-			case NODE_SET_BUMP:
-				svm_node_set_bump(kg, sd, stack, node);
-				break;
-			case NODE_MATH:
-				svm_node_math(kg, sd, stack, node.y, node.z, node.w, &offset);
-				break;
-			case NODE_VECTOR_MATH:
-				svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, &offset);
-				break;
-			case NODE_VECTOR_TRANSFORM:
-				svm_node_vector_transform(kg, sd, stack, node);
-				break;
-			case NODE_NORMAL:
-				svm_node_normal(kg, sd, stack, node.y, node.z, node.w, &offset);
-				break;
-#endif
-			case NODE_MAPPING:
-				svm_node_mapping(kg, sd, stack, node.y, node.z, &offset);
-				break;
-			case NODE_MIN_MAX:
-				svm_node_min_max(kg, sd, stack, node.y, node.z, &offset);
-				break;
-			case NODE_TEX_COORD:
-				svm_node_tex_coord(kg, sd, path_flag, stack, node.y, node.z);
-				break;
-#ifdef __EXTRA_NODES__
-			case NODE_TEX_COORD_BUMP_DX:
-				svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node.y, node.z);
-				break;
-			case NODE_TEX_COORD_BUMP_DY:
-				svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node.y, node.z);
-				break;
-			case NODE_CLOSURE_SET_NORMAL:
-				svm_node_set_normal(kg, sd, stack, node.y, node.z );
-				break;
-			case NODE_RGB_RAMP:
-				svm_node_rgb_ramp(kg, sd, stack, node, &offset);
-				break;
-			case NODE_RGB_CURVES:
-				svm_node_rgb_curves(kg, sd, stack, node, &offset);
-				break;
-			case NODE_VECTOR_CURVES:
-				svm_node_vector_curves(kg, sd, stack, node, &offset);
-				break;
-			case NODE_LIGHT_FALLOFF:
-				svm_node_light_falloff(sd, stack, node);
-				break;
-#endif
-			case NODE_TANGENT:
-				svm_node_tangent(kg, sd, stack, node);
-				break;
-			case NODE_NORMAL_MAP:
-				svm_node_normal_map(kg, sd, stack, node);
+#  endif  /* __EXTRA_NODES__ */
+#  if NODES_FEATURE(NODE_FEATURE_VOLUME) && !defined(__KERNEL_GPU__)
+			case NODE_TEX_VOXEL:
+				svm_node_tex_voxel(kg, sd, stack, node, &offset);
 				break;
+#  endif  /* NODES_FEATURE(NODE_FEATURE_VOLUME) && !defined(__KERNEL_GPU__) */
+#endif  /* NODES_GROUP(NODE_GROUP_LEVEL_3) */
 			case NODE_END:
+				return;
 			default:
+				kernel_assert(!"Unknown node type was passed to the SVM machine");
 				return;
 		}
 	}
 }
 
+#undef NODES_GROUP
+#undef NODES_FEATURE
+
 CCL_NAMESPACE_END
 
 #ifdef __CAMERA_RAY_NODES__
diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h
index fd0ea7fef31..025ae96f59d 100644
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -22,12 +22,12 @@ ccl_device void svm_node_attr_init(KernelGlobals *kg, ShaderData *sd,
 	uint4 node, NodeAttributeType *type,
 	NodeAttributeType *mesh_type, AttributeElement *elem, int *offset, uint *out_offset)
 {
-	if(sd->object != OBJECT_NONE) {
+	if(ccl_fetch(sd, object) != OBJECT_NONE) {
 		/* find attribute by unique id */
 		uint id = node.y;
-		uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
+		uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride;
 #ifdef __HAIR__
-		attr_offset = (sd->type & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
+		attr_offset = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
 #endif
 		uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 		
diff --git a/intern/cycles/kernel/svm/svm_blackbody.h b/intern/cycles/kernel/svm/svm_blackbody.h
index 15257aed92e..b750ad87b7f 100644
--- a/intern/cycles/kernel/svm/svm_blackbody.h
+++ b/intern/cycles/kernel/svm/svm_blackbody.h
@@ -36,48 +36,12 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device void svm_node_blackbody(KernelGlobals *kg, ShaderData *sd, float *stack, uint temperature_offset, uint col_offset)
 {
-	/* Output */
-	float3 color_rgb = make_float3(0.0f, 0.0f, 0.0f);
-
 	/* Input */
 	float temperature = stack_load_float(stack, temperature_offset);
 
-	if (temperature < BB_DRAPPER) {
-		/* just return very very dim red */
-		color_rgb = make_float3(1.0e-6f,0.0f,0.0f);
-	}
-	else if (temperature <= BB_MAX_TABLE_RANGE) {
-		/* This is the overall size of the table */
-		const int lookuptablesize = 956;
-		const float lookuptablenormalize = 1.0f/956.0f;
-
-		/* reconstruct a proper index for the table lookup, compared to OSL we don't look up two colors
-		just one (the OSL-lerp is also automatically done for us by "lookup_table_read") */
-		float t = powf((temperature - BB_DRAPPER) * (1.0f / BB_TABLE_SPACING), (1.0f / BB_TABLE_XPOWER));
-
-		int blackbody_table_offset = kernel_data.tables.blackbody_offset;
-
-		/* Retrieve colors from the lookup table */
-		float lutval = t*lookuptablenormalize;
-		float R = lookup_table_read(kg, lutval, blackbody_table_offset, lookuptablesize);
-		lutval = (t + 319.0f*1.0f)*lookuptablenormalize;
-		float G = lookup_table_read(kg, lutval, blackbody_table_offset, lookuptablesize);
-		lutval = (t + 319.0f*2.0f)*lookuptablenormalize;
-		float B = lookup_table_read(kg, lutval, blackbody_table_offset, lookuptablesize);
-
-		R = powf(R, BB_TABLE_YPOWER);
-		G = powf(G, BB_TABLE_YPOWER);
-		B = powf(B, BB_TABLE_YPOWER);
-
-		color_rgb = make_float3(R, G, B);
-	}
-
-	/* Luminance */
-	float l = linear_rgb_to_gray(color_rgb);
-	if (l != 0.0f)
-		color_rgb /= l;
+	float3 color_rgb = svm_math_blackbody_color(temperature);
 
-	if (stack_valid(col_offset))
+	if(stack_valid(col_offset))
 		stack_store_float3(stack, col_offset, color_rgb);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_brick.h b/intern/cycles/kernel/svm/svm_brick.h
index 97c2b545c5f..9b0cf5ab8c4 100644
--- a/intern/cycles/kernel/svm/svm_brick.h
+++ b/intern/cycles/kernel/svm/svm_brick.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -21,6 +21,7 @@ CCL_NAMESPACE_BEGIN
 ccl_device_noinline float brick_noise(int n) /* fast integer noise */
 {
 	int nn;
+	n = (n + 1013) & 0x7fffffff;
 	n = (n >> 13) ^ n;
 	nn = (n * (n * n * 60493 + 19990303) + 1376312589) & 0x7fffffff;
 	return 0.5f * ((float)nn / 1073741824.0f);
@@ -47,7 +48,7 @@ ccl_device_noinline float2 svm_brick(float3 p, float mortar_size, float bias,
 	y = p.y - row_height*rownum;
 
 	return make_float2(
-		clamp((brick_noise((rownum << 16) + (bricknum & 0xFFFF)) + bias), 0.0f, 1.0f),
+		saturate((brick_noise((rownum << 16) + (bricknum & 0xFFFF)) + bias)),
 
 		(x < mortar_size || y < mortar_size ||
 		x > (brick_width - mortar_size) ||
@@ -95,10 +96,7 @@ ccl_device void svm_node_tex_brick(KernelGlobals *kg, ShaderData *sd, float *sta
 	
 	if(f != 1.0f) {
 		float facm = 1.0f - tint;
-
-		color1.x = facm * (color1.x) + tint * color2.x;
-		color1.y = facm * (color1.y) + tint * color2.y;
-		color1.z = facm * (color1.z) + tint * color2.z;
+		color1 = facm * color1 + tint * color2;
 	}
 
 	if(stack_valid(color_offset))
diff --git a/intern/cycles/kernel/svm/svm_brightness.h b/intern/cycles/kernel/svm/svm_brightness.h
index 9b330b3213f..e4d545a00ae 100644
--- a/intern/cycles/kernel/svm/svm_brightness.h
+++ b/intern/cycles/kernel/svm/svm_brightness.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -32,7 +32,7 @@ ccl_device void svm_node_brightness(ShaderData *sd, float *stack, uint in_color,
 	color.y = max(a*color.y + b, 0.0f);
 	color.z = max(a*color.z + b, 0.0f);
 
-	if (stack_valid(out_color))
+	if(stack_valid(out_color))
 		stack_store_float3(stack, out_color, color);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h
index bfe9289fa02..00678a49d70 100644
--- a/intern/cycles/kernel/svm/svm_camera.h
+++ b/intern/cycles/kernel/svm/svm_camera.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -23,17 +23,17 @@ ccl_device void svm_node_camera(KernelGlobals *kg, ShaderData *sd, float *stack,
 	float3 vector;
 
 	Transform tfm = kernel_data.cam.worldtocamera;
-	vector = transform_point(&tfm, sd->P);
+	vector = transform_point(&tfm, ccl_fetch(sd, P));
 	zdepth = vector.z;
 	distance = len(vector);
 
-	if (stack_valid(out_vector))
+	if(stack_valid(out_vector))
 		stack_store_float3(stack, out_vector, normalize(vector));
 
-	if (stack_valid(out_zdepth))
+	if(stack_valid(out_zdepth))
 		stack_store_float(stack, out_zdepth, zdepth);
 
-	if (stack_valid(out_distance))
+	if(stack_valid(out_distance))
 		stack_store_float(stack, out_distance, distance);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_checker.h b/intern/cycles/kernel/svm/svm_checker.h
index e0408ad334a..186bf7df55f 100644
--- a/intern/cycles/kernel/svm/svm_checker.h
+++ b/intern/cycles/kernel/svm/svm_checker.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 30110db3ef9..c495ebb35bd 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -25,10 +25,14 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
 			sc->data0 = eta;
 			sc->data1 = 0.0f;
 			sc->data2 = 0.0f;
-			sd->flag |= bsdf_refraction_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_refraction_setup(sc);
+		}
+		else {
+			sc->data0 = 0.0f;
+			sc->data1 = 0.0f;
+			sc->data2 = 0.0f;
+			ccl_fetch(sd, flag) |= bsdf_reflection_setup(sc);
 		}
-		else
-			sd->flag |= bsdf_reflection_setup(sc);
 	}
 	else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) {
 		sc->data0 = roughness;
@@ -36,9 +40,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
 		sc->data2 = eta;
 
 		if(refract)
-			sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(sc);
 		else
-			sd->flag |= bsdf_microfacet_beckmann_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(sc);
 	}
 	else {
 		sc->data0 = roughness;
@@ -46,23 +50,26 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
 		sc->data2 = eta;
 
 		if(refract)
-			sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(sc);
 		else
-			sd->flag |= bsdf_microfacet_ggx_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(sc);
 	}
 }
 
 ccl_device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, ClosureType type, float mix_weight)
 {
-	ShaderClosure *sc = &sd->closure[sd->num_closure];
+	ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
 
-	if(sd->num_closure < MAX_CLOSURE) {
+	if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
 		sc->weight *= mix_weight;
 		sc->type = type;
+		sc->data0 = 0.0f;
+		sc->data1 = 0.0f;
+		sc->data2 = 0.0f;
 #ifdef __OSL__
 		sc->prim = NULL;
 #endif
-		sd->num_closure++;
+		ccl_fetch(sd, num_closure)++;
 		return sc;
 	}
 
@@ -71,14 +78,15 @@ ccl_device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, C
 
 ccl_device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float mix_weight)
 {
-	ShaderClosure *sc = &sd->closure[sd->num_closure];
+	ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
+
 	float3 weight = sc->weight * mix_weight;
 	float sample_weight = fabsf(average(weight));
 
-	if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) {
+	if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
 		sc->weight = weight;
 		sc->sample_weight = sample_weight;
-		sd->num_closure++;
+		ccl_fetch(sd, num_closure)++;
 #ifdef __OSL__
 		sc->prim = NULL;
 #endif
@@ -90,14 +98,15 @@ ccl_device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float
 
 ccl_device_inline ShaderClosure *svm_node_closure_get_absorption(ShaderData *sd, float mix_weight)
 {
-	ShaderClosure *sc = &sd->closure[sd->num_closure];
+	ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
+
 	float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - sc->weight) * mix_weight;
 	float sample_weight = fabsf(average(weight));
 
-	if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) {
+	if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
 		sc->weight = weight;
 		sc->sample_weight = sample_weight;
-		sd->num_closure++;
+		ccl_fetch(sd, num_closure)++;
 #ifdef __OSL__
 		sc->prim = NULL;
 #endif
@@ -121,7 +130,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 	if(mix_weight == 0.0f)
 		return;
 
-	float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): sd->N; 
+	float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): ccl_fetch(sd, N);
 
 	float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z);
 	float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w);
@@ -139,13 +148,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->data0 = 0.0f;
 					sc->data1 = 0.0f;
 					sc->data2 = 0.0f;
-					sd->flag |= bsdf_diffuse_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_diffuse_setup(sc);
 				}
 				else {
 					sc->data0 = roughness;
 					sc->data1 = 0.0f;
 					sc->data2 = 0.0f;
-					sd->flag |= bsdf_oren_nayar_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_oren_nayar_setup(sc);
 				}
 			}
 			break;
@@ -158,7 +167,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->data1 = 0.0f;
 				sc->data2 = 0.0f;
 				sc->N = N;
-				sd->flag |= bsdf_translucent_setup(sc);
+				ccl_fetch(sd, flag) |= bsdf_translucent_setup(sc);
 			}
 			break;
 		}
@@ -170,7 +179,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->data1 = 0.0f;
 				sc->data2 = 0.0f;
 				sc->N = N;
-				sd->flag |= bsdf_transparent_setup(sc);
+				ccl_fetch(sd, flag) |= bsdf_transparent_setup(sc);
 			}
 			break;
 		}
@@ -192,13 +201,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 				/* setup bsdf */
 				if(type == CLOSURE_BSDF_REFLECTION_ID)
-					sd->flag |= bsdf_reflection_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_reflection_setup(sc);
 				else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID)
-					sd->flag |= bsdf_microfacet_beckmann_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(sc);
 				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID)
-					sd->flag |= bsdf_microfacet_ggx_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(sc);
 				else
-					sd->flag |= bsdf_ashikhmin_shirley_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_setup(sc);
 			}
 
 			break;
@@ -216,7 +225,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->N = N;
 
 				float eta = fmaxf(param2, 1e-5f);
-				eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
+				eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
 
 				/* setup bsdf */
 				if(type == CLOSURE_BSDF_REFRACTION_ID) {
@@ -224,7 +233,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->data1 = 0.0f;
 					sc->data2 = 0.0f;
 
-					sd->flag |= bsdf_refraction_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_refraction_setup(sc);
 				}
 				else {
 					sc->data0 = param1;
@@ -232,9 +241,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->data2 = eta;
 
 					if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID)
-						sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(sc);
 					else
-						sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(sc);
 				}
 			}
 
@@ -251,15 +260,15 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 #endif
 			/* index of refraction */
 			float eta = fmaxf(param2, 1e-5f);
-			eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
+			eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
 
 			/* fresnel */
-			float cosNO = dot(N, sd->I);
+			float cosNO = dot(N, ccl_fetch(sd, I));
 			float fresnel = fresnel_dielectric_cos(cosNO, eta);
 			float roughness = param1;
 
 			/* reflection */
-			ShaderClosure *sc = &sd->closure[sd->num_closure];
+			ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
 			float3 weight = sc->weight;
 			float sample_weight = sc->sample_weight;
 
@@ -280,15 +289,17 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 #endif
 
 			/* refraction */
-			sc = &sd->closure[sd->num_closure];
-			sc->weight = weight;
-			sc->sample_weight = sample_weight;
+			if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
+				sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
+				sc->weight = weight;
+				sc->sample_weight = sample_weight;
 
-			sc = svm_node_closure_get_bsdf(sd, mix_weight*(1.0f - fresnel));
+				sc = svm_node_closure_get_bsdf(sd, mix_weight*(1.0f - fresnel));
 
-			if(sc) {
-				sc->N = N;
-				svm_node_glass_setup(sd, sc, type, eta, roughness, true);
+				if(sc) {
+					sc->N = N;
+					svm_node_glass_setup(sd, sc, type, eta, roughness, true);
+				}
 			}
 
 			break;
@@ -328,12 +339,12 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 				sc->data2 = 0.0f;
 
-				if (type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID)
-					sd->flag |= bsdf_microfacet_beckmann_aniso_setup(sc);
-				else if (type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID)
-					sd->flag |= bsdf_microfacet_ggx_aniso_setup(sc);
+				if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID)
+					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_aniso_setup(sc);
+				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID)
+					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_setup(sc);
 				else
-					sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_aniso_setup(sc);
 			}
 			break;
 		}
@@ -344,10 +355,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->N = N;
 
 				/* sigma */
-				sc->data0 = clamp(param1, 0.0f, 1.0f);
+				sc->data0 = saturate(param1);
 				sc->data1 = 0.0f;
 				sc->data2 = 0.0f;
-				sd->flag |= bsdf_ashikhmin_velvet_setup(sc);
+				ccl_fetch(sd, flag) |= bsdf_ashikhmin_velvet_setup(sc);
 			}
 			break;
 		}
@@ -362,10 +373,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->data1 = param2;
 				sc->data2 = 0.0f;
 				
-				if (type == CLOSURE_BSDF_DIFFUSE_TOON_ID)
-					sd->flag |= bsdf_diffuse_toon_setup(sc);
+				if(type == CLOSURE_BSDF_DIFFUSE_TOON_ID)
+					ccl_fetch(sd, flag) |= bsdf_diffuse_toon_setup(sc);
 				else
-					sd->flag |= bsdf_glossy_toon_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_glossy_toon_setup(sc);
 			}
 			break;
 		}
@@ -373,7 +384,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: {
 			
-			if(sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) {
+			if(ccl_fetch(sd, flag) & SD_BACKFACING && ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 				ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
 
 				if(sc) {
@@ -384,11 +395,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					 * spawned by transmission from the front */
 					sc->weight = make_float3(1.0f, 1.0f, 1.0f);
 					sc->N = N;
-					sd->flag |= bsdf_transparent_setup(sc);
+					sc->data0 = 0.0f;
+					sc->data1 = 0.0f;
+					sc->data2 = 0.0f;
+					ccl_fetch(sd, flag) |= bsdf_transparent_setup(sc);
 				}
 			}
 			else {
-				ShaderClosure *sc = &sd->closure[sd->num_closure];
+				ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
 				sc = svm_node_closure_get_bsdf(sd, mix_weight);
 
 				if(sc) {
@@ -397,18 +411,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->data1 = param2;
 					sc->data2 = -stack_load_float(stack, data_node.z);
 
-					if(!(sd->type & PRIMITIVE_ALL_CURVE)) {
-						sc->T = normalize(sd->dPdv);
+					if(!(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)) {
+						sc->T = normalize(ccl_fetch(sd, dPdv));
 						sc->data2 = 0.0f;
 					}
 					else
-						sc->T = sd->dPdu;
+						sc->T = normalize(ccl_fetch(sd, dPdu));
 
 					if(type == CLOSURE_BSDF_HAIR_REFLECTION_ID) {
-						sd->flag |= bsdf_hair_reflection_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_hair_reflection_setup(sc);
 					}
 					else {
-						sd->flag |= bsdf_hair_transmission_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_hair_transmission_setup(sc);
 					}
 				}
 			}
@@ -418,9 +432,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 #endif
 
 #ifdef __SUBSURFACE__
+#ifndef __SPLIT_KERNEL__
+#  define sc_next(sc) sc++
+#  else
+#  define sc_next(sc) sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure))
+#  endif
 		case CLOSURE_BSSRDF_CUBIC_ID:
 		case CLOSURE_BSSRDF_GAUSSIAN_ID: {
-			ShaderClosure *sc = &sd->closure[sd->num_closure];
+			ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
 			float3 weight = sc->weight * mix_weight;
 			float sample_weight = fabsf(average(weight));
 			
@@ -430,7 +449,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR)
 				param1 = 0.0f;
 
-			if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure+2 < MAX_CLOSURE) {
+			if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure)+2 < MAX_CLOSURE) {
 				/* radius * scale */
 				float3 radius = stack_load_float3(stack, data_node.z)*param1;
 				/* sharpness */
@@ -450,10 +469,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->prim = NULL;
 #endif
 					sc->N = N;
-					sd->flag |= bssrdf_setup(sc, (ClosureType)type);
+					ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type);
 
-					sd->num_closure++;
-					sc++;
+					ccl_fetch(sd, num_closure)++;
+					sc_next(sc);
 				}
 
 				if(fabsf(weight.y) > 0.0f) {
@@ -467,10 +486,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->prim = NULL;
 #endif
 					sc->N = N;
-					sd->flag |= bssrdf_setup(sc, (ClosureType)type);
+					ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type);
 
-					sd->num_closure++;
-					sc++;
+					ccl_fetch(sd, num_closure)++;
+					sc_next(sc);
 				}
 
 				if(fabsf(weight.z) > 0.0f) {
@@ -484,15 +503,16 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->prim = NULL;
 #endif
 					sc->N = N;
-					sd->flag |= bssrdf_setup(sc, (ClosureType)type);
+					ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type);
 
-					sd->num_closure++;
-					sc++;
+					ccl_fetch(sd, num_closure)++;
+					sc_next(sc);
 				}
 			}
 
 			break;
 		}
+#  undef sc_next
 #endif
 		default:
 			break;
@@ -520,7 +540,7 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float
 			ShaderClosure *sc = svm_node_closure_get_absorption(sd, mix_weight * density);
 
 			if(sc) {
-				sd->flag |= volume_absorption_setup(sc);
+				ccl_fetch(sd, flag) |= volume_absorption_setup(sc);
 			}
 			break;
 		}
@@ -528,9 +548,10 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float
 			ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight * density);
 
 			if(sc) {
-				float g = param2;
-				sc->data0 = g;
-				sd->flag |= volume_henyey_greenstein_setup(sc);
+				sc->data0 = param2; /* g */
+				sc->data1 = 0.0f;
+				sc->data2 = 0.0f;
+				ccl_fetch(sd, flag) |= volume_henyey_greenstein_setup(sc);
 			}
 			break;
 		}
@@ -555,7 +576,7 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no
 	else
 		svm_node_closure_get_non_bsdf(sd, CLOSURE_EMISSION_ID, 1.0f);
 
-	sd->flag |= SD_EMISSION;
+	ccl_fetch(sd, flag) |= SD_EMISSION;
 }
 
 ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
@@ -589,7 +610,7 @@ ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 nod
 	else
 		svm_node_closure_get_non_bsdf(sd, CLOSURE_HOLDOUT_ID, 1.0f);
 
-	sd->flag |= SD_HOLDOUT;
+	ccl_fetch(sd, flag) |= SD_HOLDOUT;
 }
 
 ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node)
@@ -607,15 +628,17 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack,
 	else
 		svm_node_closure_get_non_bsdf(sd, CLOSURE_AMBIENT_OCCLUSION_ID, 1.0f);
 
-	sd->flag |= SD_AO;
+	ccl_fetch(sd, flag) |= SD_AO;
 }
 
 /* Closure Nodes */
 
 ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight)
 {
-	if(sd->num_closure < MAX_CLOSURE)
-		sd->closure[sd->num_closure].weight = weight;
+	if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
+		sc->weight = weight;
+	}
 }
 
 ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b)
@@ -650,7 +673,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
 	decode_node_uchar4(node.y, &weight_offset, &in_weight_offset, &weight1_offset, &weight2_offset);
 
 	float weight = stack_load_float(stack, weight_offset);
-	weight = clamp(weight, 0.0f, 1.0f);
+	weight = saturate(weight);
 
 	float in_weight = (stack_valid(in_weight_offset))? stack_load_float(stack, in_weight_offset): 1.0f;
 
@@ -665,7 +688,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
 ccl_device void svm_node_set_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
 {
 	float3 normal = stack_load_float3(stack, in_direction);
-	sd->N = normal;
+	ccl_fetch(sd, N) = normal;
 	stack_store_float3(stack, out_normal, normal);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_convert.h b/intern/cycles/kernel/svm/svm_convert.h
index b221e0728ec..34080377083 100644
--- a/intern/cycles/kernel/svm/svm_convert.h
+++ b/intern/cycles/kernel/svm/svm_convert.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index 6cd5ee4b375..8d4b07c9973 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -25,11 +25,11 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 	uint normal_offset, distance_offset, invert;
 	decode_node_uchar4(node.y, &normal_offset, &distance_offset, &invert, NULL);
 
-	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
+	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
 
 	/* get surface tangents from normal */
-	float3 Rx = cross(sd->dP.dy, normal_in);
-	float3 Ry = cross(normal_in, sd->dP.dx);
+	float3 Rx = cross(ccl_fetch(sd, dP).dy, normal_in);
+	float3 Ry = cross(normal_in, ccl_fetch(sd, dP).dx);
 
 	/* get bump values */
 	uint c_offset, x_offset, y_offset, strength_offset;
@@ -40,7 +40,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 	float h_y = stack_load_float(stack, y_offset);
 
 	/* compute surface gradient and determinant */
-	float det = dot(sd->dP.dx, Rx);
+	float det = dot(ccl_fetch(sd, dP).dx, Rx);
 	float3 surfgrad = (h_x - h_c)*Rx + (h_y - h_c)*Ry;
 
 	float absdet = fabsf(det);
@@ -65,7 +65,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 ccl_device void svm_node_set_displacement(ShaderData *sd, float *stack, uint fac_offset)
 {
 	float d = stack_load_float(stack, fac_offset);
-	sd->P += sd->N*d*0.1f; /* todo: get rid of this factor */
+	ccl_fetch(sd, P) += ccl_fetch(sd, N)*d*0.1f; /* todo: get rid of this factor */
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h
index 5def52205eb..23c97d80cb0 100644
--- a/intern/cycles/kernel/svm/svm_fresnel.h
+++ b/intern/cycles/kernel/svm/svm_fresnel.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -23,12 +23,12 @@ ccl_device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset,
 	uint normal_offset, out_offset;
 	decode_node_uchar4(node, &normal_offset, &out_offset, NULL, NULL);
 	float eta = (stack_valid(ior_offset))? stack_load_float(stack, ior_offset): __uint_as_float(ior_value);
-	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
+	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
 	
 	eta = fmaxf(eta, 1e-5f);
-	eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
+	eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
 
-	float f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
+	float f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);
 
 	stack_store_float(stack, out_offset, f);
 }
@@ -44,18 +44,18 @@ ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
 	decode_node_uchar4(node.w, &type, &normal_offset, &out_offset, NULL);
 
 	float blend = (stack_valid(blend_offset))? stack_load_float(stack, blend_offset): __uint_as_float(blend_value);
-	float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): sd->N;
+	float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
 
 	float f;
 
 	if(type == NODE_LAYER_WEIGHT_FRESNEL) {
 		float eta = fmaxf(1.0f - blend, 1e-5f);
-		eta = (sd->flag & SD_BACKFACING)? eta: 1.0f/eta;
+		eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? eta: 1.0f/eta;
 
-		f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
+		f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);
 	}
 	else {
-		f = fabsf(dot(sd->I, normal_in));
+		f = fabsf(dot(ccl_fetch(sd, I), normal_in));
 
 		if(blend != 0.5f) {
 			blend = clamp(blend, 0.0f, 1.0f-1e-5f);
diff --git a/intern/cycles/kernel/svm/svm_gamma.h b/intern/cycles/kernel/svm/svm_gamma.h
index c4749e7b936..b645ff3f0f9 100644
--- a/intern/cycles/kernel/svm/svm_gamma.h
+++ b/intern/cycles/kernel/svm/svm_gamma.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -21,14 +21,14 @@ ccl_device void svm_node_gamma(ShaderData *sd, float *stack, uint in_gamma, uint
 	float3 color = stack_load_float3(stack, in_color);
 	float gamma = stack_load_float(stack, in_gamma);
 
-	if (color.x > 0.0f)
+	if(color.x > 0.0f)
 		color.x = powf(color.x, gamma);
-	if (color.y > 0.0f)
+	if(color.y > 0.0f)
 		color.y = powf(color.y, gamma);
-	if (color.z > 0.0f)
+	if(color.z > 0.0f)
 		color.z = powf(color.z, gamma);
 
-	if (stack_valid(out_color))
+	if(stack_valid(out_color))
 		stack_store_float3(stack, out_color, color);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index fe681ec92af..bb06254c3a9 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -23,15 +23,15 @@ ccl_device void svm_node_geometry(KernelGlobals *kg, ShaderData *sd, float *stac
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = sd->P; break;
-		case NODE_GEOM_N: data = sd->N; break;
+		case NODE_GEOM_P: data = ccl_fetch(sd, P); break;
+		case NODE_GEOM_N: data = ccl_fetch(sd, N); break;
 #ifdef __DPDU__
 		case NODE_GEOM_T: data = primitive_tangent(kg, sd); break;
 #endif
-		case NODE_GEOM_I: data = sd->I; break;
-		case NODE_GEOM_Ng: data = sd->Ng; break;
+		case NODE_GEOM_I: data = ccl_fetch(sd, I); break;
+		case NODE_GEOM_Ng: data = ccl_fetch(sd, Ng); break;
 #ifdef __UV__
-		case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
+		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u), ccl_fetch(sd, v), 0.0f); break;
 #endif
 	}
 
@@ -44,8 +44,8 @@ ccl_device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, flo
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = sd->P + sd->dP.dx; break;
-		case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dx, sd->v + sd->dv.dx, 0.0f); break;
+		case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; break;
+		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dx, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dx, 0.0f); break;
 		default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
 	}
 
@@ -61,8 +61,8 @@ ccl_device void svm_node_geometry_bump_dy(KernelGlobals *kg, ShaderData *sd, flo
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = sd->P + sd->dP.dy; break;
-		case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dy, sd->v + sd->dv.dy, 0.0f); break;
+		case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; break;
+		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dy, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dy, 0.0f); break;
 		default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
 	}
 
@@ -83,9 +83,9 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s
 			stack_store_float3(stack, out_offset, object_location(kg, sd));
 			return;
 		}
-		case NODE_INFO_OB_INDEX: data = object_pass_id(kg, sd->object); break;
+		case NODE_INFO_OB_INDEX: data = object_pass_id(kg, ccl_fetch(sd, object)); break;
 		case NODE_INFO_MAT_INDEX: data = shader_pass_id(kg, sd); break;
-		case NODE_INFO_OB_RANDOM: data = object_random_number(kg, sd->object); break;
+		case NODE_INFO_OB_RANDOM: data = object_random_number(kg, ccl_fetch(sd, object)); break;
 		default: data = 0.0f; break;
 	}
 
@@ -98,44 +98,44 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg, ShaderData *sd, float
 {
 	switch(type) {
 		case NODE_INFO_PAR_INDEX: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float(stack, out_offset, particle_index(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_AGE: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float(stack, out_offset, particle_age(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_LIFETIME: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float(stack, out_offset, particle_lifetime(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_LOCATION: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float3(stack, out_offset, particle_location(kg, particle_id));
 			break;
 		}
 #if 0	/* XXX float4 currently not supported in SVM stack */
 		case NODE_INFO_PAR_ROTATION: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float4(stack, out_offset, particle_rotation(kg, particle_id));
 			break;
 		}
 #endif
 		case NODE_INFO_PAR_SIZE: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float(stack, out_offset, particle_size(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_VELOCITY: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float3(stack, out_offset, particle_velocity(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_ANGULAR_VELOCITY: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float3(stack, out_offset, particle_angular_velocity(kg, particle_id));
 			break;
 		}
@@ -153,7 +153,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *sta
 
 	switch(type) {
 		case NODE_INFO_CURVE_IS_STRAND: {
-			data = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
+			data = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) != 0;
 			stack_store_float(stack, out_offset, data);
 			break;
 		}
@@ -165,7 +165,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *sta
 			break;
 		}
 		/*case NODE_INFO_CURVE_FADE: {
-			data = sd->curve_transparency;
+			data = ccl_fetch(sd, curve_transparency);
 			stack_store_float(stack, out_offset, data);
 			break;
 		}*/
diff --git a/intern/cycles/kernel/svm/svm_gradient.h b/intern/cycles/kernel/svm/svm_gradient.h
index a4b3c0583f7..53d7b4f812c 100644
--- a/intern/cycles/kernel/svm/svm_gradient.h
+++ b/intern/cycles/kernel/svm/svm_gradient.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -66,7 +66,7 @@ ccl_device void svm_node_tex_gradient(ShaderData *sd, float *stack, uint4 node)
 	float3 co = stack_load_float3(stack, co_offset);
 
 	float f = svm_gradient(co, (NodeGradientType)type);
-	f = clamp(f, 0.0f, 1.0f);
+	f = saturate(f);
 
 	if(stack_valid(fac_offset))
 		stack_store_float(stack, fac_offset, f);
diff --git a/intern/cycles/kernel/svm/svm_hsv.h b/intern/cycles/kernel/svm/svm_hsv.h
index 11dfc4f096b..1f2cad60df7 100644
--- a/intern/cycles/kernel/svm/svm_hsv.h
+++ b/intern/cycles/kernel/svm/svm_hsv.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __SVM_HSV_H__
@@ -46,7 +46,12 @@ ccl_device void svm_node_hsv(KernelGlobals *kg, ShaderData *sd, float *stack, ui
 	color.y = fac*color.y + (1.0f - fac)*in_color.y;
 	color.z = fac*color.z + (1.0f - fac)*in_color.z;
 
-	if (stack_valid(out_color_offset))
+	/* Clamp color to prevent negative values caused by oversaturation. */
+	color.x = max(color.x, 0.0f);
+	color.y = max(color.y, 0.0f);
+	color.z = max(color.z, 0.0f);
+
+	if(stack_valid(out_color_offset))
 		stack_store_float3(stack, out_color_offset, color);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index a7abeda18e5..caf0b37ba35 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -65,7 +65,7 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 
 	float4 r;
 	int ix, iy, nix, niy;
-	if (interpolation == INTERPOLATION_CLOSEST) {
+	if(interpolation == INTERPOLATION_CLOSEST) {
 		svm_image_texture_frac(x*width, &ix);
 		svm_image_texture_frac(y*height, &iy);
 
@@ -251,9 +251,9 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 		case 95: r = kernel_tex_image_interp(__tex_image_095, x, y); break;
 		case 96: r = kernel_tex_image_interp(__tex_image_096, x, y); break;
 		case 97: r = kernel_tex_image_interp(__tex_image_097, x, y); break;
-		case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break;
 
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
+		case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break;
 		case 99: r = kernel_tex_image_interp(__tex_image_099, x, y); break;
 		case 100: r = kernel_tex_image_interp(__tex_image_100, x, y); break;
 		case 101: r = kernel_tex_image_interp(__tex_image_101, x, y); break;
@@ -354,6 +354,12 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 
 #endif
 
+/* Remap coordinate from 0..1 box to -1..1 */
+ccl_device_inline float3 texco_remap_square(float3 co)
+{
+	return (co - make_float3(0.5f, 0.5f, 0.5f)) * 2.0f;
+}
+
 ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	uint id = node.y;
@@ -362,8 +368,20 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta
 	decode_node_uchar4(node.z, &co_offset, &out_offset, &alpha_offset, &srgb);
 
 	float3 co = stack_load_float3(stack, co_offset);
+	float2 tex_co;
 	uint use_alpha = stack_valid(alpha_offset);
-	float4 f = svm_image_texture(kg, id, co.x, co.y, srgb, use_alpha);
+	if(node.w == NODE_IMAGE_PROJ_SPHERE) {
+		co = texco_remap_square(co);
+		tex_co = map_to_sphere(co);
+	}
+	else if(node.w == NODE_IMAGE_PROJ_TUBE) {
+		co = texco_remap_square(co);
+		tex_co = map_to_tube(co);
+	}
+	else {
+		tex_co = make_float2(co.x, co.y);
+	}
+	float4 f = svm_image_texture(kg, id, tex_co.x, tex_co.y, srgb, use_alpha);
 
 	if(stack_valid(out_offset))
 		stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));
@@ -374,10 +392,10 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta
 ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	/* get object space normal */
-	float3 N = sd->N;
+	float3 N = ccl_fetch(sd, N);
 
-	N = sd->N;
-	if(sd->object != OBJECT_NONE)
+	N = ccl_fetch(sd, N);
+	if(ccl_fetch(sd, object) != OBJECT_NONE)
 		object_inverse_normal_transform(kg, sd, &N);
 
 	/* project from direction vector to barycentric coordinates in triangles */
@@ -415,17 +433,17 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
 		/* in case of blending, test for mixes between two textures */
 		if(N.z < (1.0f - limit)*(N.y + N.x)) {
 			weight.x = N.x/(N.x + N.y);
-			weight.x = clamp((weight.x - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f);
+			weight.x = saturate((weight.x - 0.5f*(1.0f - blend))/blend);
 			weight.y = 1.0f - weight.x;
 		}
 		else if(N.x < (1.0f - limit)*(N.y + N.z)) {
 			weight.y = N.y/(N.y + N.z);
-			weight.y = clamp((weight.y - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f);
+			weight.y = saturate((weight.y - 0.5f*(1.0f - blend))/blend);
 			weight.z = 1.0f - weight.y;
 		}
 		else if(N.y < (1.0f - limit)*(N.x + N.z)) {
 			weight.x = N.x/(N.x + N.z);
-			weight.x = clamp((weight.x - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f);
+			weight.x = saturate((weight.x - 0.5f*(1.0f - blend))/blend);
 			weight.z = 1.0f - weight.x;
 		}
 		else {
@@ -435,6 +453,10 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
 			weight.z = ((2.0f - limit)*N.z + (limit - 1.0f))/(2.0f*limit - 1.0f);
 		}
 	}
+	else {
+		/* Desperate mode, no valid choice anyway, fall back to one side. */
+		weight.x = 1.0f;
+	}
 
 	/* now fetch textures */
 	uint co_offset, out_offset, alpha_offset, srgb;
@@ -459,7 +481,6 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
 		stack_store_float(stack, alpha_offset, f.w);
 }
 
-
 ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	uint id = node.y;
diff --git a/intern/cycles/kernel/svm/svm_invert.h b/intern/cycles/kernel/svm/svm_invert.h
index eb47e9ad4ab..5ce858e2e5d 100644
--- a/intern/cycles/kernel/svm/svm_invert.h
+++ b/intern/cycles/kernel/svm/svm_invert.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -30,7 +30,7 @@ ccl_device void svm_node_invert(ShaderData *sd, float *stack, uint in_fac, uint
 	color.y = invert(color.y, factor);
 	color.z = invert(color.z, factor);
 
-	if (stack_valid(out_color))
+	if(stack_valid(out_color))
 		stack_store_float3(stack, out_color, color);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h
index da544c63ae0..a235dd35224 100644
--- a/intern/cycles/kernel/svm/svm_light_path.h
+++ b/intern/cycles/kernel/svm/svm_light_path.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -31,10 +31,10 @@ ccl_device void svm_node_light_path(ShaderData *sd, float *stack, uint type, uin
 		case NODE_LP_reflection: info = (path_flag & PATH_RAY_REFLECT)? 1.0f: 0.0f; break;
 		case NODE_LP_transmission: info = (path_flag & PATH_RAY_TRANSMIT)? 1.0f: 0.0f; break;
 		case NODE_LP_volume_scatter: info = (path_flag & PATH_RAY_VOLUME_SCATTER)? 1.0f: 0.0f; break;
-		case NODE_LP_backfacing: info = (sd->flag & SD_BACKFACING)? 1.0f: 0.0f; break;
-		case NODE_LP_ray_length: info = sd->ray_length; break;
-		case NODE_LP_ray_depth: info = (float)sd->ray_depth; break;
-		case NODE_LP_ray_transparent: info = sd->transparent_depth; break;
+		case NODE_LP_backfacing: info = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f: 0.0f; break;
+		case NODE_LP_ray_length: info = ccl_fetch(sd, ray_length); break;
+		case NODE_LP_ray_depth: info = (float)ccl_fetch(sd, ray_depth); break;
+		case NODE_LP_ray_transparent: info = (float)ccl_fetch(sd, transparent_depth); break;
 	}
 
 	stack_store_float(stack, out_offset, info);
@@ -53,14 +53,14 @@ ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
 
 	switch(type) {
 		case NODE_LIGHT_FALLOFF_QUADRATIC: break;
-		case NODE_LIGHT_FALLOFF_LINEAR: strength *= sd->ray_length; break;
-		case NODE_LIGHT_FALLOFF_CONSTANT: strength *= sd->ray_length*sd->ray_length; break;
+		case NODE_LIGHT_FALLOFF_LINEAR: strength *= ccl_fetch(sd, ray_length); break;
+		case NODE_LIGHT_FALLOFF_CONSTANT: strength *= ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); break;
 	}
 
 	float smooth = stack_load_float(stack, smooth_offset);
 
 	if(smooth > 0.0f) {
-		float squared = sd->ray_length*sd->ray_length;
+		float squared = ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length);
 		strength *= squared/(smooth + squared);
 	}
 
diff --git a/intern/cycles/kernel/svm/svm_magic.h b/intern/cycles/kernel/svm/svm_magic.h
index b661f5cacf8..ac87c77d719 100644
--- a/intern/cycles/kernel/svm/svm_magic.h
+++ b/intern/cycles/kernel/svm/svm_magic.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/svm/svm_mapping.h b/intern/cycles/kernel/svm/svm_mapping.h
index c9fa8502dd1..0a890545af4 100644
--- a/intern/cycles/kernel/svm/svm_mapping.h
+++ b/intern/cycles/kernel/svm/svm_mapping.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/svm/svm_math.h b/intern/cycles/kernel/svm/svm_math.h
index 1ce9386e40e..d633e54ed8d 100644
--- a/intern/cycles/kernel/svm/svm_math.h
+++ b/intern/cycles/kernel/svm/svm_math.h
@@ -11,99 +11,11 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device float svm_math(NodeMath type, float Fac1, float Fac2)
-{
-	float Fac;
-
-	if(type == NODE_MATH_ADD)
-		Fac = Fac1 + Fac2;
-	else if(type == NODE_MATH_SUBTRACT)
-		Fac = Fac1 - Fac2;
-	else if(type == NODE_MATH_MULTIPLY)
-		Fac = Fac1*Fac2;
-	else if(type == NODE_MATH_DIVIDE)
-		Fac = safe_divide(Fac1, Fac2);
-	else if(type == NODE_MATH_SINE)
-		Fac = sinf(Fac1);
-	else if(type == NODE_MATH_COSINE)
-		Fac = cosf(Fac1);
-	else if(type == NODE_MATH_TANGENT)
-		Fac = tanf(Fac1);
-	else if(type == NODE_MATH_ARCSINE)
-		Fac = safe_asinf(Fac1);
-	else if(type == NODE_MATH_ARCCOSINE)
-		Fac = safe_acosf(Fac1);
-	else if(type == NODE_MATH_ARCTANGENT)
-		Fac = atanf(Fac1);
-	else if(type == NODE_MATH_POWER)
-		Fac = safe_powf(Fac1, Fac2);
-	else if(type == NODE_MATH_LOGARITHM)
-		Fac = safe_logf(Fac1, Fac2);
-	else if(type == NODE_MATH_MINIMUM)
-		Fac = fminf(Fac1, Fac2);
-	else if(type == NODE_MATH_MAXIMUM)
-		Fac = fmaxf(Fac1, Fac2);
-	else if(type == NODE_MATH_ROUND)
-		Fac = floorf(Fac1 + 0.5f);
-	else if(type == NODE_MATH_LESS_THAN)
-		Fac = Fac1 < Fac2;
-	else if(type == NODE_MATH_GREATER_THAN)
-		Fac = Fac1 > Fac2;
-	else if(type == NODE_MATH_MODULO)
-		Fac = safe_modulo(Fac1, Fac2);
-    else if(type == NODE_MATH_ABSOLUTE)
-        Fac = fabsf(Fac1);
-	else if(type == NODE_MATH_CLAMP)
-		Fac = clamp(Fac1, 0.0f, 1.0f);
-	else
-		Fac = 0.0f;
-	
-	return Fac;
-}
-
-ccl_device float average_fac(float3 v)
-{
-	return (fabsf(v.x) + fabsf(v.y) + fabsf(v.z))/3.0f;
-}
-
-ccl_device void svm_vector_math(float *Fac, float3 *Vector, NodeVectorMath type, float3 Vector1, float3 Vector2)
-{
-	if(type == NODE_VECTOR_MATH_ADD) {
-		*Vector = Vector1 + Vector2;
-		*Fac = average_fac(*Vector);
-	}
-	else if(type == NODE_VECTOR_MATH_SUBTRACT) {
-		*Vector = Vector1 - Vector2;
-		*Fac = average_fac(*Vector);
-	}
-	else if(type == NODE_VECTOR_MATH_AVERAGE) {
-		*Fac = len(Vector1 + Vector2);
-		*Vector = normalize(Vector1 + Vector2);
-	}
-	else if(type == NODE_VECTOR_MATH_DOT_PRODUCT) {
-		*Fac = dot(Vector1, Vector2);
-		*Vector = make_float3(0.0f, 0.0f, 0.0f);
-	}
-	else if(type == NODE_VECTOR_MATH_CROSS_PRODUCT) {
-		float3 c = cross(Vector1, Vector2);
-		*Fac = len(c);
-		*Vector = normalize(c);
-	}
-	else if(type == NODE_VECTOR_MATH_NORMALIZE) {
-		*Fac = len(Vector1);
-		*Vector = normalize(Vector1);
-	}
-	else {
-		*Fac = 0.0f;
-		*Vector = make_float3(0.0f, 0.0f, 0.0f);
-	}
-}
-
 /* Nodes */
 
 ccl_device void svm_node_math(KernelGlobals *kg, ShaderData *sd, float *stack, uint itype, uint f1_offset, uint f2_offset, int *offset)
diff --git a/intern/cycles/kernel/svm/svm_math_util.h b/intern/cycles/kernel/svm/svm_math_util.h
new file mode 100644
index 00000000000..645cbd3fc73
--- /dev/null
+++ b/intern/cycles/kernel/svm/svm_math_util.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device float average_fac(float3 v)
+{
+	return (fabsf(v.x) + fabsf(v.y) + fabsf(v.z))/3.0f;
+}
+
+ccl_device void svm_vector_math(float *Fac, float3 *Vector, NodeVectorMath type, float3 Vector1, float3 Vector2)
+{
+	if(type == NODE_VECTOR_MATH_ADD) {
+		*Vector = Vector1 + Vector2;
+		*Fac = average_fac(*Vector);
+	}
+	else if(type == NODE_VECTOR_MATH_SUBTRACT) {
+		*Vector = Vector1 - Vector2;
+		*Fac = average_fac(*Vector);
+	}
+	else if(type == NODE_VECTOR_MATH_AVERAGE) {
+		*Fac = len(Vector1 + Vector2);
+		*Vector = normalize(Vector1 + Vector2);
+	}
+	else if(type == NODE_VECTOR_MATH_DOT_PRODUCT) {
+		*Fac = dot(Vector1, Vector2);
+		*Vector = make_float3(0.0f, 0.0f, 0.0f);
+	}
+	else if(type == NODE_VECTOR_MATH_CROSS_PRODUCT) {
+		float3 c = cross(Vector1, Vector2);
+		*Fac = len(c);
+		*Vector = normalize(c);
+	}
+	else if(type == NODE_VECTOR_MATH_NORMALIZE) {
+		*Fac = len(Vector1);
+		*Vector = normalize(Vector1);
+	}
+	else {
+		*Fac = 0.0f;
+		*Vector = make_float3(0.0f, 0.0f, 0.0f);
+	}
+}
+
+ccl_device float svm_math(NodeMath type, float Fac1, float Fac2)
+{
+	float Fac;
+
+	if(type == NODE_MATH_ADD)
+		Fac = Fac1 + Fac2;
+	else if(type == NODE_MATH_SUBTRACT)
+		Fac = Fac1 - Fac2;
+	else if(type == NODE_MATH_MULTIPLY)
+		Fac = Fac1*Fac2;
+	else if(type == NODE_MATH_DIVIDE)
+		Fac = safe_divide(Fac1, Fac2);
+	else if(type == NODE_MATH_SINE)
+		Fac = sinf(Fac1);
+	else if(type == NODE_MATH_COSINE)
+		Fac = cosf(Fac1);
+	else if(type == NODE_MATH_TANGENT)
+		Fac = tanf(Fac1);
+	else if(type == NODE_MATH_ARCSINE)
+		Fac = safe_asinf(Fac1);
+	else if(type == NODE_MATH_ARCCOSINE)
+		Fac = safe_acosf(Fac1);
+	else if(type == NODE_MATH_ARCTANGENT)
+		Fac = atanf(Fac1);
+	else if(type == NODE_MATH_POWER)
+		Fac = safe_powf(Fac1, Fac2);
+	else if(type == NODE_MATH_LOGARITHM)
+		Fac = safe_logf(Fac1, Fac2);
+	else if(type == NODE_MATH_MINIMUM)
+		Fac = fminf(Fac1, Fac2);
+	else if(type == NODE_MATH_MAXIMUM)
+		Fac = fmaxf(Fac1, Fac2);
+	else if(type == NODE_MATH_ROUND)
+		Fac = floorf(Fac1 + 0.5f);
+	else if(type == NODE_MATH_LESS_THAN)
+		Fac = Fac1 < Fac2;
+	else if(type == NODE_MATH_GREATER_THAN)
+		Fac = Fac1 > Fac2;
+	else if(type == NODE_MATH_MODULO)
+		Fac = safe_modulo(Fac1, Fac2);
+	else if(type == NODE_MATH_ABSOLUTE)
+		Fac = fabsf(Fac1);
+	else if(type == NODE_MATH_CLAMP)
+		Fac = saturate(Fac1);
+	else
+		Fac = 0.0f;
+	
+	return Fac;
+}
+
+ccl_device float3 svm_math_blackbody_color(float t) {
+	/* Calculate color in range 800..12000 using an approximation
+	 * a/t + b*t + c for R and G and ((a*t + b)*t + c)*t + d for B.
+	 * Max absolute error for RGB is (0.00095, 0.00077, 0.00057),
+	 * which is enough to get the same 8 bit/channel color.
+	 */
+
+	const float rc[6][3] = {
+		{  2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f },
+		{  3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f },
+		{  4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f },
+		{  4.66849800e+03f,  2.85655028e-05f, 1.29075375e-01f },
+		{  4.60124770e+03f,  2.89727618e-05f, 1.48001316e-01f },
+		{  3.78765709e+03f,  9.36026367e-06f, 3.98995841e-01f },
+	};
+
+	const float gc[6][3] = {
+		{ -7.50343014e+02f,  3.15679613e-04f, 4.73464526e-01f },
+		{ -1.00402363e+03f,  1.29189794e-04f, 9.08181524e-01f },
+		{ -1.22075471e+03f,  2.56245413e-05f, 1.20753416e+00f },
+		{ -1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f },
+		{ -1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f },
+		{ -5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f },
+	};
+
+	const float bc[6][4] = {
+		{ 0.0f, 0.0f, 0.0f, 0.0f }, /* zeros should be optimized by compiler */
+		{ 0.0f, 0.0f, 0.0f, 0.0f },
+		{ 0.0f, 0.0f, 0.0f, 0.0f },
+		{ -2.02524603e-11f,  1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f },
+		{ -2.22463426e-13f, -1.55078698e-08f,  3.81675160e-04f, -7.30646033e-01f },
+		{  6.72595954e-13f, -2.73059993e-08f,  4.24068546e-04f, -7.52204323e-01f },
+	};
+
+	if(t >= 12000.0f)
+		return make_float3(0.826270103f, 0.994478524f, 1.56626022f);
+
+	/* Define a macro to reduce stack usage for nvcc */
+#define MAKE_BB_RGB(i) make_float3(\
+		rc[i][0] / t + rc[i][1] * t + rc[i][2],\
+		gc[i][0] / t + gc[i][1] * t + gc[i][2],\
+		((bc[i][0] * t + bc[i][1]) * t + bc[i][2]) * t + bc[i][3])
+
+	if(t >= 6365.0f)
+		return MAKE_BB_RGB(5);
+	if(t >= 3315.0f)
+		return MAKE_BB_RGB(4);
+	if(t >= 1902.0f)
+		return MAKE_BB_RGB(3);
+	if(t >= 1449.0f)
+		return MAKE_BB_RGB(2);
+	if(t >= 1167.0f)
+		return MAKE_BB_RGB(1);
+	if(t >= 965.0f)
+		return MAKE_BB_RGB(0);
+
+#undef MAKE_BB_RGB
+
+	/* For 800 <= t < 965 color does not change in OSL implementation, so keep color the same */
+	return make_float3(4.70366907f, 0.0f, 0.0f);
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/svm/svm_mix.h b/intern/cycles/kernel/svm/svm_mix.h
index edc3903865e..6111214acba 100644
--- a/intern/cycles/kernel/svm/svm_mix.h
+++ b/intern/cycles/kernel/svm/svm_mix.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -254,16 +254,16 @@ ccl_device float3 svm_mix_clamp(float3 col)
 {
 	float3 outcol = col;
 
-	outcol.x = clamp(col.x, 0.0f, 1.0f);
-	outcol.y = clamp(col.y, 0.0f, 1.0f);
-	outcol.z = clamp(col.z, 0.0f, 1.0f);
+	outcol.x = saturate(col.x);
+	outcol.y = saturate(col.y);
+	outcol.z = saturate(col.z);
 
 	return outcol;
 }
 
 ccl_device float3 svm_mix(NodeMix type, float fac, float3 c1, float3 c2)
 {
-	float t = clamp(fac, 0.0f, 1.0f);
+	float t = saturate(fac);
 
 	switch(type) {
 		case NODE_MIX_BLEND: return svm_mix_blend(t, c1, c2);
diff --git a/intern/cycles/kernel/svm/svm_musgrave.h b/intern/cycles/kernel/svm/svm_musgrave.h
index 61171d6849c..09eba31945e 100644
--- a/intern/cycles/kernel/svm/svm_musgrave.h
+++ b/intern/cycles/kernel/svm/svm_musgrave.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -25,7 +25,7 @@ CCL_NAMESPACE_BEGIN
  * from "Texturing and Modelling: A procedural approach"
  */
 
-ccl_device_noinline float noise_musgrave_fBm(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves)
+ccl_device_noinline float noise_musgrave_fBm(float3 p, float H, float lacunarity, float octaves)
 {
 	float rmd;
 	float value = 0.0f;
@@ -53,7 +53,7 @@ ccl_device_noinline float noise_musgrave_fBm(float3 p, NodeNoiseBasis basis, flo
  * octaves: number of frequencies in the fBm
  */
 
-ccl_device_noinline float noise_musgrave_multi_fractal(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves)
+ccl_device_noinline float noise_musgrave_multi_fractal(float3 p, float H, float lacunarity, float octaves)
 {
 	float rmd;
 	float value = 1.0f;
@@ -82,7 +82,7 @@ ccl_device_noinline float noise_musgrave_multi_fractal(float3 p, NodeNoiseBasis
  * offset: raises the terrain from `sea level'
  */
 
-ccl_device_noinline float noise_musgrave_hetero_terrain(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves, float offset)
+ccl_device_noinline float noise_musgrave_hetero_terrain(float3 p, float H, float lacunarity, float octaves, float offset)
 {
 	float value, increment, rmd;
 	float pwHL = powf(lacunarity, -H);
@@ -117,7 +117,7 @@ ccl_device_noinline float noise_musgrave_hetero_terrain(float3 p, NodeNoiseBasis
  * offset: raises the terrain from `sea level'
  */
 
-ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves, float offset, float gain)
+ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(float3 p, float H, float lacunarity, float octaves, float offset, float gain)
 {
 	float result, signal, weight, rmd;
 	float pwHL = powf(lacunarity, -H);
@@ -154,7 +154,7 @@ ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(float3 p, NodeNois
  * offset: raises the terrain from `sea level'
  */
 
-ccl_device_noinline float noise_musgrave_ridged_multi_fractal(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves, float offset, float gain)
+ccl_device_noinline float noise_musgrave_ridged_multi_fractal(float3 p, float H, float lacunarity, float octaves, float offset, float gain)
 {
 	float result, signal, weight;
 	float pwHL = powf(lacunarity, -H);
@@ -168,7 +168,7 @@ ccl_device_noinline float noise_musgrave_ridged_multi_fractal(float3 p, NodeNois
 
 	for(i = 1; i < float_to_int(octaves); i++) {
 		p *= lacunarity;
-		weight = clamp(signal * gain, 0.0f, 1.0f);
+		weight = saturate(signal * gain);
 		signal = offset - fabsf(snoise(p));
 		signal *= signal;
 		signal *= weight;
@@ -183,18 +183,16 @@ ccl_device_noinline float noise_musgrave_ridged_multi_fractal(float3 p, NodeNois
 
 ccl_device float svm_musgrave(NodeMusgraveType type, float dimension, float lacunarity, float octaves, float offset, float intensity, float gain, float3 p)
 {
-	NodeNoiseBasis basis = NODE_NOISE_PERLIN;
-
 	if(type == NODE_MUSGRAVE_MULTIFRACTAL)
-		return intensity*noise_musgrave_multi_fractal(p, basis, dimension, lacunarity, octaves);
+		return intensity*noise_musgrave_multi_fractal(p, dimension, lacunarity, octaves);
 	else if(type == NODE_MUSGRAVE_FBM)
-		return intensity*noise_musgrave_fBm(p, basis, dimension, lacunarity, octaves);
+		return intensity*noise_musgrave_fBm(p, dimension, lacunarity, octaves);
 	else if(type == NODE_MUSGRAVE_HYBRID_MULTIFRACTAL)
-		return intensity*noise_musgrave_hybrid_multi_fractal(p, basis, dimension, lacunarity, octaves, offset, gain);
+		return intensity*noise_musgrave_hybrid_multi_fractal(p, dimension, lacunarity, octaves, offset, gain);
 	else if(type == NODE_MUSGRAVE_RIDGED_MULTIFRACTAL)
-		return intensity*noise_musgrave_ridged_multi_fractal(p, basis, dimension, lacunarity, octaves, offset, gain);
+		return intensity*noise_musgrave_ridged_multi_fractal(p, dimension, lacunarity, octaves, offset, gain);
 	else if(type == NODE_MUSGRAVE_HETERO_TERRAIN)
-		return intensity*noise_musgrave_hetero_terrain(p, basis, dimension, lacunarity, octaves, offset);
+		return intensity*noise_musgrave_hetero_terrain(p, dimension, lacunarity, octaves, offset);
 	
 	return 0.0f;
 }
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index 869341c81f4..c77c2a1c482 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -290,40 +290,6 @@ ccl_device_noinline float perlin(float x, float y, float z)
 }
 #endif
 
-#if 0 // unused
-ccl_device_noinline float perlin_periodic(float x, float y, float z, float3 pperiod)
-{
-	int X; float fx = floorfrac(x, &X);
-	int Y; float fy = floorfrac(y, &Y);
-	int Z; float fz = floorfrac(z, &Z);
-
-	int3 p;
-
-	p.x = max(quick_floor(pperiod.x), 1);
-	p.y = max(quick_floor(pperiod.y), 1);
-	p.z = max(quick_floor(pperiod.z), 1);
-
-	float u = fade(fx);
-	float v = fade(fy);
-	float w = fade(fz);
-
-	float result;
-
-	result = nerp (w, nerp (v, nerp (u, grad (phash (X  , Y  , Z  , p), fx	 , fy	 , fz	  ),
-										grad (phash (X+1, Y  , Z  , p), fx-1.0f, fy	 , fz	  )),
-							   nerp (u, grad (phash (X  , Y+1, Z  , p), fx	 , fy-1.0f, fz	  ),
-										grad (phash (X+1, Y+1, Z  , p), fx-1.0f, fy-1.0f, fz	  ))),
-					  nerp (v, nerp (u, grad (phash (X  , Y  , Z+1, p), fx	 , fy	 , fz-1.0f ),
-										grad (phash (X+1, Y  , Z+1, p), fx-1.0f, fy	 , fz-1.0f )),
-							   nerp (u, grad (phash (X  , Y+1, Z+1, p), fx	 , fy-1.0f, fz-1.0f ),
-										grad (phash (X+1, Y+1, Z+1, p), fx-1.0f, fy-1.0f, fz-1.0f ))));
-	float r = scale3(result);
-
-	/* can happen for big coordinates, things even out to 0.0 then anyway */
-	return (isfinite(r))? r: 0.0f;
-}
-#endif
-
 /* perlin noise in range 0..1 */
 ccl_device float noise(float3 p)
 {
@@ -367,20 +333,5 @@ ccl_device ssef cellnoise_color(const ssef& p)
 }
 #endif
 
-#if 0 // unused
-/* periodic perlin noise in range 0..1 */
-ccl_device float pnoise(float3 p, float3 pperiod)
-{
-	float r = perlin_periodic(p.x, p.y, p.z, pperiod);
-	return 0.5f*r + 0.5f;
-}
-
-/* periodic perlin noise in range -1..1 */
-ccl_device float psnoise(float3 p, float3 pperiod)
-{
-	return perlin_periodic(p.x, p.y, p.z, pperiod);
-}
-#endif
-
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/svm/svm_noisetex.h b/intern/cycles/kernel/svm/svm_noisetex.h
index 5d5cfe6ffcc..62ff38cf1c5 100644
--- a/intern/cycles/kernel/svm/svm_noisetex.h
+++ b/intern/cycles/kernel/svm/svm_noisetex.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -20,23 +20,22 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline void svm_noise(float3 p, float detail, float distortion, float *fac, float3 *color)
 {
-	NodeNoiseBasis basis = NODE_NOISE_PERLIN;
 	int hard = 0;
 
 	if(distortion != 0.0f) {
 		float3 r, offset = make_float3(13.5f, 13.5f, 13.5f);
 
-		r.x = noise_basis(p + offset, basis) * distortion;
-		r.y = noise_basis(p, basis) * distortion;
-		r.z = noise_basis(p - offset, basis) * distortion;
+		r.x = noise(p + offset) * distortion;
+		r.y = noise(p) * distortion;
+		r.z = noise(p - offset) * distortion;
 
 		p += r;
 	}
 
-	*fac = noise_turbulence(p, basis, detail, hard);
+	*fac = noise_turbulence(p, detail, hard);
 	*color = make_float3(*fac,
-		noise_turbulence(make_float3(p.y, p.x, p.z), basis, detail, hard),
-		noise_turbulence(make_float3(p.y, p.z, p.x), basis, detail, hard));
+		noise_turbulence(make_float3(p.y, p.x, p.z), detail, hard),
+		noise_turbulence(make_float3(p.y, p.z, p.x), detail, hard));
 }
 
 ccl_device void svm_node_tex_noise(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
diff --git a/intern/cycles/kernel/svm/svm_normal.h b/intern/cycles/kernel/svm/svm_normal.h
index 8695031b8b9..53abef71012 100644
--- a/intern/cycles/kernel/svm/svm_normal.h
+++ b/intern/cycles/kernel/svm/svm_normal.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -28,10 +28,10 @@ ccl_device void svm_node_normal(KernelGlobals *kg, ShaderData *sd, float *stack,
 	direction.z = __int_as_float(node1.z);
 	direction = normalize(direction);
 
-	if (stack_valid(out_normal_offset))
+	if(stack_valid(out_normal_offset))
 		stack_store_float3(stack, out_normal_offset, direction);
 
-	if (stack_valid(out_dot_offset))
+	if(stack_valid(out_dot_offset))
 		stack_store_float(stack, out_dot_offset, dot(direction, normalize(normal)));
 }
 
diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h
index 55eee3d24c3..062ab013b1f 100644
--- a/intern/cycles/kernel/svm/svm_ramp.h
+++ b/intern/cycles/kernel/svm/svm_ramp.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __SVM_RAMP_H__
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device float4 rgb_ramp_lookup(KernelGlobals *kg, int offset, float f, bool interpolate)
 {
-	f = clamp(f, 0.0f, 1.0f)*(RAMP_TABLE_SIZE-1);
+	f = saturate(f)*(RAMP_TABLE_SIZE-1);
 
 	/* clamp int as well in case of NaN */
 	int i = clamp(float_to_int(f), 0, RAMP_TABLE_SIZE-1);
diff --git a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
index 111d5d47988..6f51b163756 100644
--- a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
+++ b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -28,7 +28,7 @@ ccl_device void svm_node_combine_hsv(KernelGlobals *kg, ShaderData *sd, float *s
 	/* Combine, and convert back to RGB */
 	float3 color = hsv_to_rgb(make_float3(hue, saturation, value));
 
-	if (stack_valid(color_out))
+	if(stack_valid(color_out))
 		stack_store_float3(stack, color_out, color);
 }
 
@@ -42,11 +42,11 @@ ccl_device void svm_node_separate_hsv(KernelGlobals *kg, ShaderData *sd, float *
 	/* Convert to HSV */
 	color = rgb_to_hsv(color);
 
-	if (stack_valid(hue_out))
+	if(stack_valid(hue_out))
 		stack_store_float(stack, hue_out, color.x);
-	if (stack_valid(saturation_out))
+	if(stack_valid(saturation_out))
 		stack_store_float(stack, saturation_out, color.y);
-	if (stack_valid(value_out))
+	if(stack_valid(value_out))
 		stack_store_float(stack, value_out, color.z);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_sepcomb_vector.h b/intern/cycles/kernel/svm/svm_sepcomb_vector.h
index c8e7e34f87d..63570dd6942 100644
--- a/intern/cycles/kernel/svm/svm_sepcomb_vector.h
+++ b/intern/cycles/kernel/svm/svm_sepcomb_vector.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -22,7 +22,7 @@ ccl_device void svm_node_combine_vector(ShaderData *sd, float *stack, uint in_of
 {
 	float vector = stack_load_float(stack, in_offset);
 
-	if (stack_valid(out_offset))
+	if(stack_valid(out_offset))
 		stack_store_float(stack, out_offset+vector_index, vector);
 }
 
@@ -30,10 +30,10 @@ ccl_device void svm_node_separate_vector(ShaderData *sd, float *stack, uint ivec
 {
 	float3 vector = stack_load_float3(stack, ivector_offset);
 
-	if (stack_valid(out_offset)) {
-		if (vector_index == 0)
+	if(stack_valid(out_offset)) {
+		if(vector_index == 0)
 			stack_store_float(stack, out_offset, vector.x);
-		else if (vector_index == 1)
+		else if(vector_index == 1)
 			stack_store_float(stack, out_offset, vector.y);
 		else
 			stack_store_float(stack, out_offset, vector.z);
diff --git a/intern/cycles/kernel/svm/svm_sky.h b/intern/cycles/kernel/svm/svm_sky.h
index 500b5146931..4c8e3a32271 100644
--- a/intern/cycles/kernel/svm/svm_sky.h
+++ b/intern/cycles/kernel/svm/svm_sky.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h
index a17e4a25efe..eebd9bee420 100644
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -11,67 +11,85 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
 
 /* Texture Coordinate Node */
 
-ccl_device void svm_node_tex_coord(KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint type, uint out_offset)
+ccl_device void svm_node_tex_coord(KernelGlobals *kg,
+                                   ShaderData *sd,
+                                   int path_flag,
+                                   float *stack,
+                                   uint4 node,
+                                   int *offset)
 {
 	float3 data;
+	uint type = node.y;
+	uint out_offset = node.z;
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = sd->P;
-			if(sd->object != OBJECT_NONE)
-				object_inverse_position_transform(kg, sd, &data);
+			data = ccl_fetch(sd, P);
+			if(node.w == 0) {
+				if(ccl_fetch(sd, object) != OBJECT_NONE) {
+					object_inverse_position_transform(kg, sd, &data);
+				}
+			}
+			else {
+				Transform tfm;
+				tfm.x = read_node_float(kg, offset);
+				tfm.y = read_node_float(kg, offset);
+				tfm.z = read_node_float(kg, offset);
+				tfm.w = read_node_float(kg, offset);
+				data = transform_point(&tfm, data);
+			}
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = sd->N;
-			if(sd->object != OBJECT_NONE)
+			data = ccl_fetch(sd, N);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(sd->object != OBJECT_NONE)
-				data = transform_point(&tfm, sd->P);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = transform_point(&tfm, ccl_fetch(sd, P));
 			else
-				data = transform_point(&tfm, sd->P + camera_position(kg));
+				data = transform_point(&tfm, ccl_fetch(sd, P) + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, sd->ray_P);
+			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P));
 			else
-				data = camera_world_to_ndc(kg, sd, sd->P);
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P));
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(sd->object != OBJECT_NONE)
-				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
 			else
-				data = sd->I;
+				data = ccl_fetch(sd, I);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, sd->object);
+			data = object_dupli_generated(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, sd->object);
+			data = object_dupli_uv(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = sd->P;
+			data = ccl_fetch(sd, P);
 
 #ifdef __VOLUME__
-			if(sd->object != OBJECT_NONE)
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -81,61 +99,79 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, ShaderData *sd, int path_f
 	stack_store_float3(stack, out_offset, data);
 }
 
-ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint type, uint out_offset)
+ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
+                                           ShaderData *sd,
+                                           int path_flag,
+                                           float *stack,
+                                           uint4 node,
+                                           int *offset)
 {
 #ifdef __RAY_DIFFERENTIALS__
 	float3 data;
+	uint type = node.y;
+	uint out_offset = node.z;
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = sd->P + sd->dP.dx;
-			if(sd->object != OBJECT_NONE)
-				object_inverse_position_transform(kg, sd, &data);
+			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;
+			if(node.w == 0) {
+				if(ccl_fetch(sd, object) != OBJECT_NONE) {
+					object_inverse_position_transform(kg, sd, &data);
+				}
+			}
+			else {
+				Transform tfm;
+				tfm.x = read_node_float(kg, offset);
+				tfm.y = read_node_float(kg, offset);
+				tfm.z = read_node_float(kg, offset);
+				tfm.w = read_node_float(kg, offset);
+				data = transform_point(&tfm, data);
+			}
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = sd->N;
-			if(sd->object != OBJECT_NONE)
+			data = ccl_fetch(sd, N);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(sd->object != OBJECT_NONE)
-				data = transform_point(&tfm, sd->P + sd->dP.dx);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
 			else
-				data = transform_point(&tfm, sd->P + sd->dP.dx + camera_position(kg));
+				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx);
+			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dx);
 			else
-				data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx);
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(sd->object != OBJECT_NONE)
-				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
 			else
-				data = sd->I;
+				data = ccl_fetch(sd, I);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, sd->object);
+			data = object_dupli_generated(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, sd->object);
+			data = object_dupli_uv(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = sd->P + sd->dP.dx;
+			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;
 
 #ifdef __VOLUME__
-			if(sd->object != OBJECT_NONE)
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -144,65 +180,83 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, ShaderData *sd, in
 
 	stack_store_float3(stack, out_offset, data);
 #else
-	svm_node_tex_coord(kg, sd, stack, type, out_offset);
+	svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
 #endif
 }
 
-ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint type, uint out_offset)
+ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
+                                           ShaderData *sd,
+                                           int path_flag,
+                                           float *stack,
+                                           uint4 node,
+                                           int *offset)
 {
 #ifdef __RAY_DIFFERENTIALS__
 	float3 data;
+	uint type = node.y;
+	uint out_offset = node.z;
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = sd->P + sd->dP.dy;
-			if(sd->object != OBJECT_NONE)
-				object_inverse_position_transform(kg, sd, &data);
+			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;
+			if(node.w == 0) {
+				if(ccl_fetch(sd, object) != OBJECT_NONE) {
+					object_inverse_position_transform(kg, sd, &data);
+				}
+			}
+			else {
+				Transform tfm;
+				tfm.x = read_node_float(kg, offset);
+				tfm.y = read_node_float(kg, offset);
+				tfm.z = read_node_float(kg, offset);
+				tfm.w = read_node_float(kg, offset);
+				data = transform_point(&tfm, data);
+			}
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = sd->N;
-			if(sd->object != OBJECT_NONE)
+			data = ccl_fetch(sd, N);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(sd->object != OBJECT_NONE)
-				data = transform_point(&tfm, sd->P + sd->dP.dy);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
 			else
-				data = transform_point(&tfm, sd->P + sd->dP.dy + camera_position(kg));
+				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy);
+			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dy);
 			else
-				data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy);
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(sd->object != OBJECT_NONE)
-				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
 			else
-				data = sd->I;
+				data = ccl_fetch(sd, I);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, sd->object);
+			data = object_dupli_generated(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, sd->object);
+			data = object_dupli_uv(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = sd->P + sd->dP.dy;
+			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;
 
 #ifdef __VOLUME__
-			if(sd->object != OBJECT_NONE)
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -211,7 +265,7 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, ShaderData *sd, in
 
 	stack_store_float3(stack, out_offset, data);
 #else
-	svm_node_tex_coord(kg, sd, stack, type, out_offset);
+	svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
 #endif
 }
 
@@ -227,7 +281,7 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 
 	if(space == NODE_NORMAL_MAP_TANGENT) {
 		/* tangent space */
-		if(sd->object == OBJECT_NONE) {
+		if(ccl_fetch(sd, object) == OBJECT_NONE) {
 			stack_store_float3(stack, normal_offset, make_float3(0.0f, 0.0f, 0.0f));
 			return;
 		}
@@ -248,11 +302,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 		float sign = primitive_attribute_float(kg, sd, attr_sign_elem, attr_sign_offset, NULL, NULL);
 		float3 normal;
 
-		if(sd->shader & SHADER_SMOOTH_NORMAL) {
+		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
 			normal = primitive_attribute_float3(kg, sd, attr_normal_elem, attr_normal_offset, NULL, NULL);
 		}
 		else {
-			normal = sd->Ng;
+			normal = ccl_fetch(sd, Ng);
 			object_inverse_normal_transform(kg, sd, &normal);
 		}
 
@@ -283,7 +337,7 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 
 	if(strength != 1.0f) {
 		strength = max(strength, 0.0f);
-		N = normalize(sd->N + (N - sd->N)*strength);
+		N = normalize(ccl_fetch(sd, N) + (N - ccl_fetch(sd, N))*strength);
 	}
 
 	stack_store_float3(stack, normal_offset, N);
@@ -313,7 +367,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
 		float3 generated;
 
 		if(attr_offset == ATTR_STD_NOT_FOUND)
-			generated = sd->P;
+			generated = ccl_fetch(sd, P);
 		else
 			generated = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL);
 
@@ -326,7 +380,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
 	}
 
 	object_normal_transform(kg, sd, &tangent);
-	tangent = cross(sd->N, normalize(cross(tangent, sd->N)));
+	tangent = cross(ccl_fetch(sd, N), normalize(cross(tangent, ccl_fetch(sd, N))));
 	stack_store_float3(stack, tangent_offset, tangent);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_texture.h b/intern/cycles/kernel/svm/svm_texture.h
index d97c85db36a..dcb00f7dd55 100644
--- a/intern/cycles/kernel/svm/svm_texture.h
+++ b/intern/cycles/kernel/svm/svm_texture.h
@@ -11,266 +11,14 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
 
-/* Voronoi Distances */
-
-#if 0
-ccl_device float voronoi_distance(NodeDistanceMetric distance_metric, float3 d, float e)
-{
-#if 0
-	if(distance_metric == NODE_VORONOI_DISTANCE_SQUARED)
-#endif
-		return dot(d, d);
-#if 0
-	if(distance_metric == NODE_VORONOI_ACTUAL_DISTANCE)
-		return len(d);
-	if(distance_metric == NODE_VORONOI_MANHATTAN)
-		return fabsf(d.x) + fabsf(d.y) + fabsf(d.z);
-	if(distance_metric == NODE_VORONOI_CHEBYCHEV)
-		return fmaxf(fabsf(d.x), fmaxf(fabsf(d.y), fabsf(d.z)));
-	if(distance_metric == NODE_VORONOI_MINKOVSKY_H)
-		return sqrtf(fabsf(d.x)) + sqrtf(fabsf(d.y)) + sqrtf(fabsf(d.y));
-	if(distance_metric == NODE_VORONOI_MINKOVSKY_4)
-		return sqrtf(sqrtf(dot(d*d, d*d)));
-	if(distance_metric == NODE_VORONOI_MINKOVSKY)
-		return powf(powf(fabsf(d.x), e) + powf(fabsf(d.y), e) + powf(fabsf(d.z), e), 1.0f/e);
-	
-	return 0.0f;
-#endif
-}
-
-/* Voronoi / Worley like */
-ccl_device_inline float4 voronoi_Fn(float3 p, float e, int n1, int n2)
-{
-	float da[4];
-	float3 pa[4];
-	NodeDistanceMetric distance_metric = NODE_VORONOI_DISTANCE_SQUARED;
-
-	/* returns distances in da and point coords in pa */
-	int xx, yy, zz, xi, yi, zi;
-
-	xi = floor_to_int(p.x);
-	yi = floor_to_int(p.y);
-	zi = floor_to_int(p.z);
-
-	da[0] = 1e10f;
-	da[1] = 1e10f;
-	da[2] = 1e10f;
-	da[3] = 1e10f;
-
-	pa[0] = make_float3(0.0f, 0.0f, 0.0f);
-	pa[1] = make_float3(0.0f, 0.0f, 0.0f);
-	pa[2] = make_float3(0.0f, 0.0f, 0.0f);
-	pa[3] = make_float3(0.0f, 0.0f, 0.0f);
-
-	for(xx = xi-1; xx <= xi+1; xx++) {
-		for(yy = yi-1; yy <= yi+1; yy++) {
-			for(zz = zi-1; zz <= zi+1; zz++) {
-				float3 ip = make_float3((float)xx, (float)yy, (float)zz);
-				float3 vp = cellnoise_color(ip);
-				float3 pd = p - (vp + ip);
-				float d = voronoi_distance(distance_metric, pd, e);
-
-				vp += ip;
-
-				if(d < da[0]) {
-					da[3] = da[2];
-					da[2] = da[1];
-					da[1] = da[0];
-					da[0] = d;
-
-					pa[3] = pa[2];
-					pa[2] = pa[1];
-					pa[1] = pa[0];
-					pa[0] = vp;
-				}
-				else if(d < da[1]) {
-					da[3] = da[2];
-					da[2] = da[1];
-					da[1] = d;
-
-					pa[3] = pa[2];
-					pa[2] = pa[1];
-					pa[1] = vp;
-				}
-				else if(d < da[2]) {
-					da[3] = da[2];
-					da[2] = d;
-
-					pa[3] = pa[2];
-					pa[2] = vp;
-				}
-				else if(d < da[3]) {
-					da[3] = d;
-					pa[3] = vp;
-				}
-			}
-		}
-	}
-
-	float4 result = make_float4(pa[n1].x, pa[n1].y, pa[n1].z, da[n1]);
-
-	if(n2 != -1)
-		result = make_float4(pa[n2].x, pa[n2].y, pa[n2].z, da[n2]) - result;
-
-	return result;
-}
-#endif
-
-ccl_device float voronoi_F1_distance(float3 p)
-{
-	/* returns squared distance in da */
-	float da = 1e10f;
-
-#ifndef __KERNEL_SSE2__
-	int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z);
-
-	for (int xx = -1; xx <= 1; xx++) {
-		for (int yy = -1; yy <= 1; yy++) {
-			for (int zz = -1; zz <= 1; zz++) {
-				float3 ip = make_float3(ix + xx, iy + yy, iz + zz);
-				float3 vp = ip + cellnoise_color(ip);
-				float d = len_squared(p - vp);
-				da = min(d, da);
-			}
-		}
-	}
-#else
-	ssef vec_p = load4f(p);
-	ssei xyzi = quick_floor_sse(vec_p);
-
-	for (int xx = -1; xx <= 1; xx++) {
-		for (int yy = -1; yy <= 1; yy++) {
-			for (int zz = -1; zz <= 1; zz++) {
-				ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0));
-				ssef vp = ip + cellnoise_color(ip);
-				float d = len_squared<1, 1, 1, 0>(vec_p - vp);
-				da = min(d, da);
-			}
-		}
-	}
-#endif
-
-	return da;
-}
-
-ccl_device float3 voronoi_F1_color(float3 p)
-{
-	/* returns color of the nearest point */
-	float da = 1e10f;
-
-#ifndef __KERNEL_SSE2__
-	float3 pa;
-	int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z);
-
-	for (int xx = -1; xx <= 1; xx++) {
-		for (int yy = -1; yy <= 1; yy++) {
-			for (int zz = -1; zz <= 1; zz++) {
-				float3 ip = make_float3(ix + xx, iy + yy, iz + zz);
-				float3 vp = ip + cellnoise_color(ip);
-				float d = len_squared(p - vp);
-
-				if(d < da) {
-					da = d;
-					pa = vp;
-				}
-			}
-		}
-	}
-
-	return cellnoise_color(pa);
-#else
-	ssef pa, vec_p = load4f(p);
-	ssei xyzi = quick_floor_sse(vec_p);
-
-	for (int xx = -1; xx <= 1; xx++) {
-		for (int yy = -1; yy <= 1; yy++) {
-			for (int zz = -1; zz <= 1; zz++) {
-				ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0));
-				ssef vp = ip + cellnoise_color(ip);
-				float d = len_squared<1, 1, 1, 0>(vec_p - vp);
-
-				if(d < da) {
-					da = d;
-					pa = vp;
-				}
-			}
-		}
-	}
-
-	ssef color = cellnoise_color(pa);
-	return (float3 &)color;
-#endif
-}
-
-#if 0
-ccl_device float voronoi_F1(float3 p) { return voronoi_Fn(p, 0.0f, 0, -1).w; }
-ccl_device float voronoi_F2(float3 p) { return voronoi_Fn(p, 0.0f, 1, -1).w; }
-ccl_device float voronoi_F3(float3 p) { return voronoi_Fn(p, 0.0f, 2, -1).w; }
-ccl_device float voronoi_F4(float3 p) { return voronoi_Fn(p, 0.0f, 3, -1).w; }
-ccl_device float voronoi_F1F2(float3 p) { return voronoi_Fn(p, 0.0f, 0, 1).w; }
-
-ccl_device float voronoi_Cr(float3 p)
-{
-	/* crackle type pattern, just a scale/clamp of F2-F1 */
-	float t = 10.0f*voronoi_F1F2(p);
-	return (t > 1.0f)? 1.0f: t;
-}
-
-ccl_device float voronoi_F1S(float3 p) { return 2.0f*voronoi_F1(p) - 1.0f; }
-ccl_device float voronoi_F2S(float3 p) { return 2.0f*voronoi_F2(p) - 1.0f; }
-ccl_device float voronoi_F3S(float3 p) { return 2.0f*voronoi_F3(p) - 1.0f; }
-ccl_device float voronoi_F4S(float3 p) { return 2.0f*voronoi_F4(p) - 1.0f; }
-ccl_device float voronoi_F1F2S(float3 p) { return 2.0f*voronoi_F1F2(p) - 1.0f; }
-ccl_device float voronoi_CrS(float3 p) { return 2.0f*voronoi_Cr(p) - 1.0f; }
-#endif
-
-/* Noise Bases */
-
-ccl_device float noise_basis(float3 p, NodeNoiseBasis basis)
-{
-	/* Only Perlin enabled for now, others break CUDA compile by making kernel
-	 * too big, with compile using > 4GB, due to everything being inlined. */
-
-#if 0
-	if(basis == NODE_NOISE_PERLIN)
-#endif
-		return noise(p);
-#if 0
-	if(basis == NODE_NOISE_VORONOI_F1)
-		return voronoi_F1S(p);
-	if(basis == NODE_NOISE_VORONOI_F2)
-		return voronoi_F2S(p);
-	if(basis == NODE_NOISE_VORONOI_F3)
-		return voronoi_F3S(p);
-	if(basis == NODE_NOISE_VORONOI_F4)
-		return voronoi_F4S(p);
-	if(basis == NODE_NOISE_VORONOI_F2_F1)
-		return voronoi_F1F2S(p);
-	if(basis == NODE_NOISE_VORONOI_CRACKLE)
-		return voronoi_CrS(p);
-	if(basis == NODE_NOISE_CELL_NOISE)
-		return cellnoise(p);
-	
-	return 0.0f;
-#endif
-}
-
-/* Soft/Hard Noise */
-
-ccl_device float noise_basis_hard(float3 p, NodeNoiseBasis basis, int hard)
-{
-	float t = noise_basis(p, basis);
-	return (hard)? fabsf(2.0f*t - 1.0f): t;
-}
-
 /* Turbulence */
 
-ccl_device_noinline float noise_turbulence(float3 p, NodeNoiseBasis basis, float octaves, int hard)
+ccl_device_noinline float noise_turbulence(float3 p, float octaves, int hard)
 {
 	float fscale = 1.0f;
 	float amp = 1.0f;
@@ -281,7 +29,7 @@ ccl_device_noinline float noise_turbulence(float3 p, NodeNoiseBasis basis, float
 	n = float_to_int(octaves);
 
 	for(i = 0; i <= n; i++) {
-		float t = noise_basis(fscale*p, basis);
+		float t = noise(fscale*p);
 
 		if(hard)
 			t = fabsf(2.0f*t - 1.0f);
@@ -294,7 +42,7 @@ ccl_device_noinline float noise_turbulence(float3 p, NodeNoiseBasis basis, float
 	float rmd = octaves - floorf(octaves);
 
 	if(rmd != 0.0f) {
-		float t = noise_basis(fscale*p, basis);
+		float t = noise(fscale*p);
 
 		if(hard)
 			t = fabsf(2.0f*t - 1.0f);
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index cd38ce4ba9b..33aa5e7c51c 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __SVM_TYPES_H__
@@ -28,6 +28,29 @@ CCL_NAMESPACE_BEGIN
 
 /* Nodes */
 
+/* Known frequencies of used nodes, used for selective nodes compilation
+ * in the kernel. Currently only affects split OpenCL kernel.
+ *
+ * Keep as defines so it's easy to check which nodes are to be compiled
+ * from preprocessor.
+ *
+ * The lower the group number, the more often the node is used.
+ */
+#define NODE_GROUP_LEVEL_0    0
+#define NODE_GROUP_LEVEL_1    1
+#define NODE_GROUP_LEVEL_2    2
+#define NODE_GROUP_LEVEL_3    3
+#define NODE_GROUP_LEVEL_MAX  NODE_GROUP_LEVEL_3
+
+#define NODE_FEATURE_VOLUME     (1 << 0)
+#define NODE_FEATURE_HAIR       (1 << 1)
+#define NODE_FEATURE_BUMP       (1 << 2)
+/* TODO(sergey): Consider using something like ((uint)(-1)).
+ * Need to check carefully operand types around usage of this
+ * define first.
+ */
+#define NODE_FEATURE_ALL        (NODE_FEATURE_VOLUME|NODE_FEATURE_HAIR|NODE_FEATURE_BUMP)
+
 typedef enum NodeType {
 	NODE_END = 0,
 	NODE_CLOSURE_BSDF,
@@ -103,8 +126,8 @@ typedef enum NodeType {
 	NODE_NORMAL_MAP,
 	NODE_HAIR_INFO,
 	NODE_UVMAP,
+	NODE_TEX_VOXEL,
 
-	/* Camera ray nodes. */
 	NODE_CAMERA_PATH_ATTRIBUTE,
 	NODE_CAMERA_SAMPLE_PERSPECTIVE,
 	NODE_CAMERA_RAY_OUTPUT,
@@ -262,27 +285,6 @@ typedef enum NodeConvert {
 	NODE_CONVERT_IV
 } NodeConvert;
 
-typedef enum NodeDistanceMetric {
-	NODE_VORONOI_DISTANCE_SQUARED,
-	NODE_VORONOI_ACTUAL_DISTANCE,
-	NODE_VORONOI_MANHATTAN,
-	NODE_VORONOI_CHEBYCHEV,
-	NODE_VORONOI_MINKOVSKY_H,
-	NODE_VORONOI_MINKOVSKY_4,
-	NODE_VORONOI_MINKOVSKY
-} NodeDistanceMetric;
-
-typedef enum NodeNoiseBasis {
-	NODE_NOISE_PERLIN,
-	NODE_NOISE_VORONOI_F1,
-	NODE_NOISE_VORONOI_F2,
-	NODE_NOISE_VORONOI_F3,
-	NODE_NOISE_VORONOI_F4,
-	NODE_NOISE_VORONOI_F2_F1,
-	NODE_NOISE_VORONOI_CRACKLE,
-	NODE_NOISE_CELL_NOISE
-} NodeNoiseBasis;
-
 typedef enum NodeMusgraveType {
 	NODE_MUSGRAVE_MULTIFRACTAL,
 	NODE_MUSGRAVE_FBM,
@@ -340,6 +342,24 @@ typedef enum NodeNormalMapSpace {
 	NODE_NORMAL_MAP_BLENDER_WORLD,
 } NodeNormalMapSpace;
 
+typedef enum NodeImageProjection {
+	NODE_IMAGE_PROJ_FLAT   = 0,
+	NODE_IMAGE_PROJ_BOX    = 1,
+	NODE_IMAGE_PROJ_SPHERE = 2,
+	NODE_IMAGE_PROJ_TUBE   = 3,
+} NodeImageProjection;
+
+typedef enum NodeBumpOffset {
+	NODE_BUMP_OFFSET_CENTER,
+	NODE_BUMP_OFFSET_DX,
+	NODE_BUMP_OFFSET_DY,
+} NodeBumpOffset;
+
+typedef enum NodeTexVoxelSpace {
+	NODE_TEX_VOXEL_SPACE_OBJECT = 0,
+	NODE_TEX_VOXEL_SPACE_WORLD  = 1,
+} NodeTexVoxelSpace;
+
 typedef enum ShaderType {
 	SHADER_TYPE_SURFACE,
 	SHADER_TYPE_VOLUME,
@@ -355,7 +375,6 @@ typedef enum ClosureType {
 	/* Diffuse */
 	CLOSURE_BSDF_DIFFUSE_ID,
 	CLOSURE_BSDF_OREN_NAYAR_ID,
-	CLOSURE_BSDF_WESTIN_SHEEN_ID,
 	CLOSURE_BSDF_DIFFUSE_RAMP_ID,
 	CLOSURE_BSDF_DIFFUSE_TOON_ID,
 
@@ -369,7 +388,6 @@ typedef enum ClosureType {
 	CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID,
 	CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID,
 	CLOSURE_BSDF_ASHIKHMIN_VELVET_ID,
-	CLOSURE_BSDF_WESTIN_BACKSCATTER_ID,
 	CLOSURE_BSDF_PHONG_RAMP_ID,
 	CLOSURE_BSDF_GLOSSY_TOON_ID,
 	CLOSURE_BSDF_HAIR_REFLECTION_ID,
@@ -428,6 +446,7 @@ typedef enum NodePathAttribute {
 #define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID)
 #define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID)
 #define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
+#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
 
 #define CLOSURE_WEIGHT_CUTOFF 1e-5f
 
diff --git a/intern/cycles/kernel/svm/svm_value.h b/intern/cycles/kernel/svm/svm_value.h
index 7beed065288..c1c2b539df3 100644
--- a/intern/cycles/kernel/svm/svm_value.h
+++ b/intern/cycles/kernel/svm/svm_value.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h
index 61d33aeb8cf..4c32130d06d 100644
--- a/intern/cycles/kernel/svm/svm_vector_transform.h
+++ b/intern/cycles/kernel/svm/svm_vector_transform.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -33,7 +33,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo
 	NodeVectorTransformConvertSpace to = (NodeVectorTransformConvertSpace)ito;
 	
 	Transform tfm;
-	bool is_object = (sd->object != OBJECT_NONE);
+	bool is_object = (ccl_fetch(sd, object) != OBJECT_NONE);
 	bool is_direction = (type == NODE_VECTOR_TRANSFORM_TYPE_VECTOR || type == NODE_VECTOR_TRANSFORM_TYPE_NORMAL);
 	
 	/* From world */
@@ -45,7 +45,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo
 			else
 				in = transform_point(&tfm, in);
 		}
-		else if (to == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_OBJECT && is_object) {
+		else if(to == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_OBJECT && is_object) {
 			if(is_direction)
 				object_inverse_dir_transform(kg, sd, &in);
 			else
@@ -54,7 +54,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo
 	}
 	
 	/* From camera */
-	else if (from == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_CAMERA) {
+	else if(from == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_CAMERA) {
 		if(to == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_WORLD || to == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_OBJECT) {
 			tfm = kernel_data.cam.cameratoworld;
 			if(is_direction)
diff --git a/intern/cycles/kernel/svm/svm_voronoi.h b/intern/cycles/kernel/svm/svm_voronoi.h
index 083a2f30e06..d612d7e973f 100644
--- a/intern/cycles/kernel/svm/svm_voronoi.h
+++ b/intern/cycles/kernel/svm/svm_voronoi.h
@@ -11,13 +11,99 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
 
 /* Voronoi */
 
+ccl_device float voronoi_F1_distance(float3 p)
+{
+	/* Squared distance from p to the nearest cell point (ip + cellnoise_color(ip)) over the 3x3x3 cells around the integer cell of p. */
+	float da = 1e10f;
+
+#ifndef __KERNEL_SSE2__
+	int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z);
+
+	for(int xx = -1; xx <= 1; xx++) {
+		for(int yy = -1; yy <= 1; yy++) {
+			for(int zz = -1; zz <= 1; zz++) {
+				float3 ip = make_float3(ix + xx, iy + yy, iz + zz);
+				float3 vp = ip + cellnoise_color(ip);
+				float d = len_squared(p - vp);
+				da = min(d, da);
+			}
+		}
+	}
+#else
+	ssef vec_p = load4f(p);
+	ssei xyzi = quick_floor_sse(vec_p);
+
+	for(int xx = -1; xx <= 1; xx++) {
+		for(int yy = -1; yy <= 1; yy++) {
+			for(int zz = -1; zz <= 1; zz++) {
+				ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0));
+				ssef vp = ip + cellnoise_color(ip);
+				float d = len_squared<1, 1, 1, 0>(vec_p - vp);
+				da = min(d, da);
+			}
+		}
+	}
+#endif
+
+	return da;
+}
+
+ccl_device float3 voronoi_F1_color(float3 p)
+{
+	/* Returns cellnoise_color() of the nearest cell point (ip + cellnoise_color(ip)) over the 3x3x3 cells around the integer cell of p. */
+	float da = 1e10f;
+
+#ifndef __KERNEL_SSE2__
+	float3 pa;
+	int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z);
+
+	for(int xx = -1; xx <= 1; xx++) {
+		for(int yy = -1; yy <= 1; yy++) {
+			for(int zz = -1; zz <= 1; zz++) {
+				float3 ip = make_float3(ix + xx, iy + yy, iz + zz);
+				float3 vp = ip + cellnoise_color(ip);
+				float d = len_squared(p - vp);
+
+				if(d < da) {
+					da = d;
+					pa = vp;
+				}
+			}
+		}
+	}
+
+	return cellnoise_color(pa);
+#else
+	ssef pa, vec_p = load4f(p);
+	ssei xyzi = quick_floor_sse(vec_p);
+
+	for(int xx = -1; xx <= 1; xx++) {
+		for(int yy = -1; yy <= 1; yy++) {
+			for(int zz = -1; zz <= 1; zz++) {
+				ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0));
+				ssef vp = ip + cellnoise_color(ip);
+				float d = len_squared<1, 1, 1, 0>(vec_p - vp);
+
+				if(d < da) {
+					da = d;
+					pa = vp;
+				}
+			}
+		}
+	}
+
+	ssef color = cellnoise_color(pa);
+	return (float3 &)color;
+#endif
+}
+
 ccl_device_noinline float4 svm_voronoi(NodeVoronoiColoring coloring, float3 p)
 {
 	if(coloring == NODE_VORONOI_INTENSITY) {
diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h
new file mode 100644
index 00000000000..bbb687dfce5
--- /dev/null
+++ b/intern/cycles/kernel/svm/svm_voxel.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#if !defined(__KERNEL_GPU__)
+
+/* Voxel texture node: samples 3D image node.y at the stack coordinate and writes density (w) and
+ * color (xyz) to the valid outputs, or zeros outside [0, 1]. World space consumes four extra node
+ * values for the transform. TODO(sergey): Think of making it more generic volume-type attribute sampler. */
+ccl_device void svm_node_tex_voxel(KernelGlobals *kg,
+                                   ShaderData *sd,
+                                   float *stack,
+                                   uint4 node,
+                                   int *offset)
+{
+	uint co_offset, density_out_offset, color_out_offset, space;
+	decode_node_uchar4(node.z, &co_offset, &density_out_offset, &color_out_offset, &space);
+	int id = node.y;
+	float3 co = stack_load_float3(stack, co_offset);
+	if(space == NODE_TEX_VOXEL_SPACE_OBJECT) {
+		co = volume_normalized_position(kg, sd, co);
+	}
+	else {
+		kernel_assert(space == NODE_TEX_VOXEL_SPACE_WORLD);
+		Transform tfm;  /* Read before the bounds check so *offset always advances past it. */
+		tfm.x = read_node_float(kg, offset);
+		tfm.y = read_node_float(kg, offset);
+		tfm.z = read_node_float(kg, offset);
+		tfm.w = read_node_float(kg, offset);
+		co = transform_point(&tfm, co);
+	}
+	if(co.x < 0.0f || co.y < 0.0f || co.z < 0.0f ||
+	   co.x > 1.0f || co.y > 1.0f || co.z > 1.0f)
+	{  /* Outside the unit cube: store zero density and black, skip the lookup. */
+		if(stack_valid(density_out_offset))
+			stack_store_float(stack, density_out_offset, 0.0f);
+		if(stack_valid(color_out_offset))
+			stack_store_float3(stack, color_out_offset, make_float3(0.0f, 0.0f, 0.0f));
+		return;
+	}
+	float4 r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z);
+	if(stack_valid(density_out_offset))
+		stack_store_float(stack, density_out_offset, r.w);
+	if(stack_valid(color_out_offset))
+		stack_store_float3(stack, color_out_offset, make_float3(r.x, r.y, r.z));
+}
+
+#endif  /* !defined(__KERNEL_GPU__) */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_wave.h b/intern/cycles/kernel/svm/svm_wave.h
index 7f9081539a4..6eaddaf301c 100644
--- a/intern/cycles/kernel/svm/svm_wave.h
+++ b/intern/cycles/kernel/svm/svm_wave.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -28,7 +28,7 @@ ccl_device_noinline float svm_wave(NodeWaveType type, float3 p, float detail, fl
 		n = len(p) * 20.0f;
 	
 	if(distortion != 0.0f)
-		n += distortion * noise_turbulence(p*dscale, NODE_NOISE_PERLIN, detail, 0);
+		n += distortion * noise_turbulence(p*dscale, detail, 0);
 
 	return 0.5f + 0.5f * sinf(n);
 }
diff --git a/intern/cycles/kernel/svm/svm_wavelength.h b/intern/cycles/kernel/svm/svm_wavelength.h
index 9e57c470c0f..57030f3979d 100644
--- a/intern/cycles/kernel/svm/svm_wavelength.h
+++ b/intern/cycles/kernel/svm/svm_wavelength.h
@@ -77,7 +77,7 @@ ccl_device void svm_node_wavelength(ShaderData *sd, float *stack, uint wavelengt
 	int i = float_to_int(ii);
 	float3 color;
 	
-	if (i < 0 || i >= 80) {
+	if(i < 0 || i >= 80) {
 		color = make_float3(0.0f, 0.0f, 0.0f);
 	}
 	else {
diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h
index 660e6e2ca47..30ccd523add 100644
--- a/intern/cycles/kernel/svm/svm_wireframe.h
+++ b/intern/cycles/kernel/svm/svm_wireframe.h
@@ -34,20 +34,16 @@ CCL_NAMESPACE_BEGIN
 
 /* Wireframe Node */
 
-ccl_device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_size, uint out_fac, uint use_pixel_size)
+ccl_device float wireframe(KernelGlobals *kg,
+                           ShaderData *sd,
+                           float size,
+                           int pixel_size,
+                           float3 *P)
 {
-	/* Input Data */
-	float size = stack_load_float(stack, in_size);
-	int pixel_size = (int)use_pixel_size;
-	
-	/* Output */
-	float f = 0.0f;
-
-	/* Calculate wireframe */
 #ifdef __HAIR__
-	if (sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE)
+	if(ccl_fetch(sd, prim) != PRIM_NONE && ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)
 #else
-	if (sd->prim != PRIM_NONE)
+	if(ccl_fetch(sd, prim) != PRIM_NONE)
 #endif
 	{
 		float3 Co[3];
@@ -55,45 +51,85 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *sta
 
 		/* Triangles */
 		int np = 3;
-		
-		if(sd->type & PRIMITIVE_TRIANGLE)
-			triangle_vertices(kg, sd->prim, Co);
+
+		if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE)
+			triangle_vertices(kg, ccl_fetch(sd, prim), Co);
 		else
-			motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co);
+			motion_triangle_vertices(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), Co);
 
-		if(!(sd->flag & SD_TRANSFORM_APPLIED)) {
+		if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED)) {
 			object_position_transform(kg, sd, &Co[0]);
 			object_position_transform(kg, sd, &Co[1]);
 			object_position_transform(kg, sd, &Co[2]);
 		}
-		
+
 		if(pixel_size) {
 			// Project the derivatives of P to the viewing plane defined
 			// by I so we have a measure of how big is a pixel at this point
-			float pixelwidth_x = len(sd->dP.dx - dot(sd->dP.dx, sd->I) * sd->I);
-			float pixelwidth_y = len(sd->dP.dy - dot(sd->dP.dy, sd->I) * sd->I);
+			float pixelwidth_x = len(ccl_fetch(sd, dP).dx - dot(ccl_fetch(sd, dP).dx, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
+			float pixelwidth_y = len(ccl_fetch(sd, dP).dy - dot(ccl_fetch(sd, dP).dy, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
 			// Take the average of both axis' length
 			pixelwidth = (pixelwidth_x + pixelwidth_y) * 0.5f;
 		}
-		
+
 		// Use half the width as the neighbor face will render the
 		// other half. And take the square for fast comparison
 		pixelwidth *= 0.5f * size;
 		pixelwidth *= pixelwidth;
-		for (int i = 0; i < np; i++) {
+		for(int i = 0; i < np; i++) {
 			int i2 = i ? i - 1 : np - 1;
-			float3 dir = sd->P - Co[i];
+			float3 dir = *P - Co[i];
 			float3 edge = Co[i] - Co[i2];
 			float3 crs = cross(edge, dir);
 			// At this point dot(crs, crs) / dot(edge, edge) is
 			// the square of area / length(edge) == square of the
 			// distance to the edge.
-			if (dot(crs, crs) < (dot(edge, edge) * pixelwidth))
-				f = 1.0f;
+			if(dot(crs, crs) < (dot(edge, edge) * pixelwidth))
+				return 1.0f;
 		}
 	}
-	
-	if (stack_valid(out_fac))
+	return 0.0f;
+}
+
+ccl_device void svm_node_wireframe(KernelGlobals *kg,
+                                   ShaderData *sd,
+                                   float *stack,
+                                   uint4 node)
+{
+	uint in_size = node.y;
+	uint out_fac = node.z;
+	uint use_pixel_size, bump_offset;
+	decode_node_uchar4(node.w, &use_pixel_size, &bump_offset, NULL, NULL);
+
+	/* Input data: wire size is loaded from the stack; node.w packs use_pixel_size and bump_offset. */
+	float size = stack_load_float(stack, in_size);
+	int pixel_size = (int)use_pixel_size;
+
+	/* Calculate wireframe factor at the shading point P. */
+#ifdef __SPLIT_KERNEL__
+	/* TODO(sergey): In the split kernel sd is actually in global space,
+	 * so P is copied to a local before its address is passed to wireframe().
+	 *
+	 * With OpenCL 2.0 it's possible to avoid this change, but until
+	 * then we'll be living with such an exception.
+	 */
+	float3 P = ccl_fetch(sd, P);
+	float f = wireframe(kg, sd, size, pixel_size, &P);
+#else
+	float f = wireframe(kg, sd, size, pixel_size, &ccl_fetch(sd, P));
+#endif
+
+	/* TODO(sergey): Think of faster way to calculate derivatives. NOTE(review): len(dP) is the divisor below; confirm it cannot be zero. */
+	if(bump_offset == NODE_BUMP_OFFSET_DX) {
+		float3 Px = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dx;
+		f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(ccl_fetch(sd, dP).dx);
+	}
+	else if(bump_offset == NODE_BUMP_OFFSET_DY) {
+		float3 Py = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dy;
+		f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(ccl_fetch(sd, dP).dy);
+	}
+
+	if(stack_valid(out_fac))
+		stack_store_float(stack, out_fac, f);
+}
 
diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt
index e4f847a783a..04c7ee724e3 100644
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -7,17 +7,17 @@ set(INC
 	../kernel/osl
 	../bvh
 	../util
+	../../glew-mx
 )
 
 set(INC_SYS
-	${GLEW_INCLUDE_PATH}
+	${GLEW_INCLUDE_DIR}
 )
 
 set(SRC
 	attribute.cpp
 	background.cpp
 	bake.cpp
-	blackbody.cpp
 	buffers.cpp
 	camera.cpp
 	camera_nodes.cpp
@@ -47,7 +47,6 @@ set(SRC_HEADERS
 	attribute.h
 	bake.h
 	background.h
-	blackbody.h
 	buffers.h
 	camera.h
 	camera_nodes.h
@@ -78,5 +77,6 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${RTTI_DISABLE_FLAGS}")
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
-add_library(cycles_render ${SRC} ${SRC_HEADERS})
+add_definitions(${GL_DEFINITIONS})
 
+add_library(cycles_render ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/render/attribute.cpp b/intern/cycles/render/attribute.cpp
index 8abf869a775..6e94459da55 100644
--- a/intern/cycles/render/attribute.cpp
+++ b/intern/cycles/render/attribute.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "image.h"
@@ -52,7 +52,7 @@ void Attribute::set(ustring name_, TypeDesc type_, AttributeElement element_)
 
 void Attribute::reserve(int numverts, int numtris, int numsteps, int numcurves, int numkeys, bool resize)
 {
-	if (resize) {
+	if(resize) {
 		buffer.resize(buffer_size(numverts, numtris, numsteps, numcurves, numkeys), 0);
 	}
 	else {
@@ -230,6 +230,8 @@ const char *Attribute::standard_name(AttributeStandard std)
 			return "heat";
 		case ATTR_STD_VOLUME_VELOCITY:
 			return "velocity";
+		case ATTR_STD_POINTINESS:
+			return "pointiness";
 		case ATTR_STD_NOT_FOUND:
 		case ATTR_STD_NONE:
 		case ATTR_STD_NUM:
@@ -375,6 +377,9 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
 			case ATTR_STD_VOLUME_VELOCITY:
 				attr = add(name, TypeDesc::TypeVector, ATTR_ELEMENT_VOXEL);
 				break;
+			case ATTR_STD_POINTINESS:
+				attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_VERTEX);
+				break;
 			default:
 				assert(0);
 				break;
@@ -395,6 +400,9 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
 			case ATTR_STD_GENERATED_TRANSFORM:
 				attr = add(name, TypeDesc::TypeMatrix, ATTR_ELEMENT_MESH);
 				break;
+			case ATTR_STD_POINTINESS:
+				attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_VERTEX);
+				break;
 			default:
 				assert(0);
 				break;
diff --git a/intern/cycles/render/attribute.h b/intern/cycles/render/attribute.h
index f5227ebde52..bbc6cf7f65f 100644
--- a/intern/cycles/render/attribute.h
+++ b/intern/cycles/render/attribute.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __ATTRIBUTE_H__
diff --git a/intern/cycles/render/background.cpp b/intern/cycles/render/background.cpp
index 3926ecb99d6..5fd7bd8f16f 100644
--- a/intern/cycles/render/background.cpp
+++ b/intern/cycles/render/background.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "background.h"
@@ -72,21 +72,28 @@ void Background::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	else
 		kbackground->volume_shader = SHADER_NONE;
 
-	if(!(visibility & PATH_RAY_DIFFUSE))
-		kbackground->surface_shader |= SHADER_EXCLUDE_DIFFUSE;
-	if(!(visibility & PATH_RAY_GLOSSY))
-		kbackground->surface_shader |= SHADER_EXCLUDE_GLOSSY;
-	if(!(visibility & PATH_RAY_TRANSMIT))
-		kbackground->surface_shader |= SHADER_EXCLUDE_TRANSMIT;
-	if(!(visibility & PATH_RAY_VOLUME_SCATTER))
-		kbackground->surface_shader |= SHADER_EXCLUDE_SCATTER;
-	if(!(visibility & PATH_RAY_CAMERA))
-		kbackground->surface_shader |= SHADER_EXCLUDE_CAMERA;
+	/* No background node, make world shader invisible to all rays, to skip evaluation in kernel. */
+	if(scene->shaders[shader]->graph->nodes.size() <= 1) {
+		kbackground->surface_shader |= SHADER_EXCLUDE_ANY;
+	}
+	/* Background present, check visibilities */
+	else {
+		if(!(visibility & PATH_RAY_DIFFUSE))
+			kbackground->surface_shader |= SHADER_EXCLUDE_DIFFUSE;
+		if(!(visibility & PATH_RAY_GLOSSY))
+			kbackground->surface_shader |= SHADER_EXCLUDE_GLOSSY;
+		if(!(visibility & PATH_RAY_TRANSMIT))
+			kbackground->surface_shader |= SHADER_EXCLUDE_TRANSMIT;
+		if(!(visibility & PATH_RAY_VOLUME_SCATTER))
+			kbackground->surface_shader |= SHADER_EXCLUDE_SCATTER;
+		if(!(visibility & PATH_RAY_CAMERA))
+			kbackground->surface_shader |= SHADER_EXCLUDE_CAMERA;
+	}
 
 	need_update = false;
 }
 
-void Background::device_free(Device *device, DeviceScene *dscene)
+void Background::device_free(Device * /*device*/, DeviceScene * /*dscene*/)
 {
 }
 
diff --git a/intern/cycles/render/background.h b/intern/cycles/render/background.h
index cf627862513..26a727291ee 100644
--- a/intern/cycles/render/background.h
+++ b/intern/cycles/render/background.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __BACKGROUND_H__
diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp
index 5723a22dd84..4bbac0f91d1 100644
--- a/intern/cycles/render/bake.cpp
+++ b/intern/cycles/render/bake.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "bake.h"
@@ -55,6 +55,11 @@ void BakeData::set(int i, int prim, float uv[2], float dudx, float dudy, float d
 	m_dvdy[i] = dvdy;
 }
 
+void BakeData::set_null(int i)
+{
+	m_primitive[i] = -1;  /* NOTE(review): -1 presumably means "no primitive" for entry i -- confirm. */
+}
+
 int BakeData::object()
 {
 	return m_object;
@@ -221,7 +226,10 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
 	return true;
 }
 
-void BakeManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
+void BakeManager::device_update(Device * /*device*/,
+                                DeviceScene * /*dscene*/,
+                                Scene * /*scene*/,
+                                Progress& progress)
 {
 	if(!need_update)
 		return;
@@ -231,7 +239,7 @@ void BakeManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 	need_update = false;
 }
 
-void BakeManager::device_free(Device *device, DeviceScene *dscene)
+void BakeManager::device_free(Device * /*device*/, DeviceScene * /*dscene*/)
 {
 }
 
diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h
index 186fbbeea4d..14d975a4b4e 100644
--- a/intern/cycles/render/bake.h
+++ b/intern/cycles/render/bake.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __BAKE_H__
@@ -31,6 +31,7 @@ public:
 	~BakeData();
 
 	void set(int i, int prim, float uv[2], float dudx, float dudy, float dvdx, float dvdy);
+	void set_null(int i);
 	int object();
 	size_t size();
 	uint4 data(int i);
diff --git a/intern/cycles/render/blackbody.cpp b/intern/cycles/render/blackbody.cpp
deleted file mode 100644
index 89af714e8ec..00000000000
--- a/intern/cycles/render/blackbody.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Adapted from Open Shading Language with this license:
- *
- * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
- * All Rights Reserved.
- *
- * Modifications Copyright 2013, Blender Foundation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- * * Redistributions of source code must retain the above copyright
- *   notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- *   notice, this list of conditions and the following disclaimer in the
- *   documentation and/or other materials provided with the distribution.
- * * Neither the name of Sony Pictures Imageworks nor the names of its
- *   contributors may be used to endorse or promote products derived from
- *   this software without specific prior written permission.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "blackbody.h"
-#include "util_color.h"
-#include "util_math.h"
-
-#include "kernel_types.h"
-
-CCL_NAMESPACE_BEGIN
-
-vector<float> blackbody_table()
-{
-	/* quoted from OSLs opcolor.cpp
-	In order to speed up the blackbody computation, we have a table
-	storing the precomputed BB values for a range of temperatures.  Less
-	than BB_DRAPER always returns 0.  Greater than BB_MAX_TABLE_RANGE
-	does the full computation, we think it'll be rare to inquire higher
-	temperatures.
-
-	Since the bb function is so nonlinear, we actually space the table
-	entries nonlinearly, with the relationship between the table index i
-	and the temperature T as follows:
-	i = ((T-Draper)/spacing)^(1/xpower)
-	T = pow(i, xpower) * spacing + Draper
-	And furthermore, we store in the table the true value raised ^(1/5).
-	I tuned this a bit, and with the current values we can have all
-	blackbody results accurate to within 0.1% with a table size of 317
-	(about 5 KB of data).
-	*/
-
-	const float cie_colour_match[81][3] = {
-		{0.0014f,0.0000f,0.0065f}, {0.0022f,0.0001f,0.0105f}, {0.0042f,0.0001f,0.0201f},
-		{0.0076f,0.0002f,0.0362f}, {0.0143f,0.0004f,0.0679f}, {0.0232f,0.0006f,0.1102f},
-		{0.0435f,0.0012f,0.2074f}, {0.0776f,0.0022f,0.3713f}, {0.1344f,0.0040f,0.6456f},
-		{0.2148f,0.0073f,1.0391f}, {0.2839f,0.0116f,1.3856f}, {0.3285f,0.0168f,1.6230f},
-		{0.3483f,0.0230f,1.7471f}, {0.3481f,0.0298f,1.7826f}, {0.3362f,0.0380f,1.7721f},
-		{0.3187f,0.0480f,1.7441f}, {0.2908f,0.0600f,1.6692f}, {0.2511f,0.0739f,1.5281f},
-		{0.1954f,0.0910f,1.2876f}, {0.1421f,0.1126f,1.0419f}, {0.0956f,0.1390f,0.8130f},
-		{0.0580f,0.1693f,0.6162f}, {0.0320f,0.2080f,0.4652f}, {0.0147f,0.2586f,0.3533f},
-		{0.0049f,0.3230f,0.2720f}, {0.0024f,0.4073f,0.2123f}, {0.0093f,0.5030f,0.1582f},
-		{0.0291f,0.6082f,0.1117f}, {0.0633f,0.7100f,0.0782f}, {0.1096f,0.7932f,0.0573f},
-		{0.1655f,0.8620f,0.0422f}, {0.2257f,0.9149f,0.0298f}, {0.2904f,0.9540f,0.0203f},
-		{0.3597f,0.9803f,0.0134f}, {0.4334f,0.9950f,0.0087f}, {0.5121f,1.0000f,0.0057f},
-		{0.5945f,0.9950f,0.0039f}, {0.6784f,0.9786f,0.0027f}, {0.7621f,0.9520f,0.0021f},
-		{0.8425f,0.9154f,0.0018f}, {0.9163f,0.8700f,0.0017f}, {0.9786f,0.8163f,0.0014f},
-		{1.0263f,0.7570f,0.0011f}, {1.0567f,0.6949f,0.0010f}, {1.0622f,0.6310f,0.0008f},
-		{1.0456f,0.5668f,0.0006f}, {1.0026f,0.5030f,0.0003f}, {0.9384f,0.4412f,0.0002f},
-		{0.8544f,0.3810f,0.0002f}, {0.7514f,0.3210f,0.0001f}, {0.6424f,0.2650f,0.0000f},
-		{0.5419f,0.2170f,0.0000f}, {0.4479f,0.1750f,0.0000f}, {0.3608f,0.1382f,0.0000f},
-		{0.2835f,0.1070f,0.0000f}, {0.2187f,0.0816f,0.0000f}, {0.1649f,0.0610f,0.0000f},
-		{0.1212f,0.0446f,0.0000f}, {0.0874f,0.0320f,0.0000f}, {0.0636f,0.0232f,0.0000f},
-		{0.0468f,0.0170f,0.0000f}, {0.0329f,0.0119f,0.0000f}, {0.0227f,0.0082f,0.0000f},
-		{0.0158f,0.0057f,0.0000f}, {0.0114f,0.0041f,0.0000f}, {0.0081f,0.0029f,0.0000f},
-		{0.0058f,0.0021f,0.0000f}, {0.0041f,0.0015f,0.0000f}, {0.0029f,0.0010f,0.0000f},
-		{0.0020f,0.0007f,0.0000f}, {0.0014f,0.0005f,0.0000f}, {0.0010f,0.0004f,0.0000f},
-		{0.0007f,0.0002f,0.0000f}, {0.0005f,0.0002f,0.0000f}, {0.0003f,0.0001f,0.0000f},
-		{0.0002f,0.0001f,0.0000f}, {0.0002f,0.0001f,0.0000f}, {0.0001f,0.0000f,0.0000f},
-		{0.0001f,0.0000f,0.0000f}, {0.0001f,0.0000f,0.0000f}, {0.0000f,0.0000f,0.0000f}
-	};
-
-	const double c1 = 3.74183e-16; // 2*pi*h*c^2, W*m^2
-	const double c2 = 1.4388e-2;   // h*c/k, m*K
-								   // h is Planck's const, k is Boltzmann's
-	const float dlambda = 5.0f * 1e-9f;  // in meters
-
-	/* Blackbody table from 800 to 12k Kelvin (319 entries (317+2 offset) * 3) */
-	vector<float> blackbody_table(956);
-
-	float X, Y, Z;
-
-	/* ToDo: bring this back to what OSL does with the lastTemperature limit ? */
-	for (int i = 0;  i <= 317;  ++i) {
-		double Temperature = pow((double)i, (double)BB_TABLE_XPOWER) * (double)BB_TABLE_SPACING + (double)BB_DRAPPER;
-		X = 0;
-		Y = 0;
-		Z = 0;
-
-		/* from OSL "spectrum_to_XYZ" */
-		for (int n = 0; n < 81; ++n) {
-			float lambda = 380.0f + 5.0f * n;
-			double wlm = lambda * 1e-9f;   // Wavelength in meters
-			// N.B. spec_intens returns result in W/m^2 but it's a differential,
-			// needs to be scaled by dlambda!
-			float spec_intens = float((c1 * pow(wlm, -5.0)) / (exp(c2 / (wlm * Temperature)) -1.0));
-			float Me = spec_intens * dlambda;
-
-			X += Me * cie_colour_match[n][0];
-			Y += Me * cie_colour_match[n][1];
-			Z += Me * cie_colour_match[n][2];
-		}
-		
-		/* Convert from xyz color space */
-		float3 col = xyz_to_rgb(X, Y, Z);
-
-		/* Clamp to zero if values are smaller */
-		col = max(col, make_float3(0.0f, 0.0f, 0.0f));
-
-		col.x = powf(col.x, 1.0f / BB_TABLE_YPOWER);
-		col.y = powf(col.y, 1.0f / BB_TABLE_YPOWER);
-		col.z = powf(col.z, 1.0f / BB_TABLE_YPOWER);
-
-		/* Store in table in RRRGGGBBB format */
-		blackbody_table[i] = col.x;
-		blackbody_table[i+319*1] = col.y;
-		blackbody_table[i+319*2] = col.z;	
-	}
-
-	return blackbody_table;
-}
-CCL_NAMESPACE_END
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index fc65922fc87..fab3f701757 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include <stdlib.h>
@@ -187,9 +187,23 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int
 			else if(type == PASS_MIST) {
 				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
 					float f = *in;
-					pixels[0] = clamp(f*scale_exposure, 0.0f, 1.0f);
+					pixels[0] = saturate(f*scale_exposure);
 				}
 			}
+#ifdef WITH_CYCLES_DEBUG
+			else if(type == PASS_BVH_TRAVERSAL_STEPS) {
+				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
+					float f = *in;
+					pixels[0] = f;
+				}
+			}
+			else if(type == PASS_RAY_BOUNCES) {
+				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
+					float f = *in;
+					pixels[0] = f;
+				}
+			}
+#endif
 			else {
 				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
 					float f = *in;
@@ -290,7 +304,7 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int
 					pixels[2] = f.z*scale_exposure;
 
 					/* clamp since alpha might be > 1.0 due to russian roulette */
-					pixels[3] = clamp(f.w*scale, 0.0f, 1.0f);
+					pixels[3] = saturate(f.w*scale);
 				}
 			}
 		}
@@ -361,13 +375,9 @@ void DisplayBuffer::draw_set(int width, int height)
 void DisplayBuffer::draw(Device *device, const DeviceDrawParams& draw_params)
 {
 	if(draw_width != 0 && draw_height != 0) {
-		glPushMatrix();
-		glTranslatef(params.full_x, params.full_y, 0.0f);
 		device_memory& rgba = rgba_data();
 
-		device->draw_pixels(rgba, 0, draw_width, draw_height, 0, params.width, params.height, transparent, draw_params);
-
-		glPopMatrix();
+		device->draw_pixels(rgba, 0, draw_width, draw_height, params.full_x, params.full_y, params.width, params.height, transparent, draw_params);
 	}
 }
 
diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h
index 27ab20bbafd..4fa1c51d821 100644
--- a/intern/cycles/render/buffers.h
+++ b/intern/cycles/render/buffers.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __BUFFERS_H__
diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp
index 71442d490ee..3b42cfa3c7b 100644
--- a/intern/cycles/render/camera.cpp
+++ b/intern/cycles/render/camera.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "camera.h"
@@ -28,32 +28,6 @@
 
 CCL_NAMESPACE_BEGIN
 
-namespace {
-
-bool object_has_volume(Scene *scene, Object *object)
-{
-	Mesh *mesh = object->mesh;
-	foreach(uint shader, mesh->used_shaders) {
-		if(scene->shaders[shader]->has_volume) {
-			return true;
-		}
-	}
-	return false;
-}
-
-bool scene_has_volume(Scene *scene)
-{
-	for(size_t i = 0; i < scene->objects.size(); ++i) {
-		Object *object = scene->objects[i];
-		if(object_has_volume(scene, object)) {
-			return true;
-		}
-	}
-	return false;
-}
-
-}  // namespace
-
 Camera::Camera()
 {
 	shuttertime = 1.0f;
@@ -68,6 +42,7 @@ Camera::Camera()
 	motion.pre = transform_identity();
 	motion.post = transform_identity();
 	use_motion = false;
+	use_perspective_motion = false;
 
 	aperture_ratio = 1.0f;
 
@@ -75,7 +50,12 @@ Camera::Camera()
 	panorama_type = PANORAMA_EQUIRECTANGULAR;
 	fisheye_fov = M_PI_F;
 	fisheye_lens = 10.5f;
+	latitude_min = -M_PI_2_F;
+	latitude_max = M_PI_2_F;
+	longitude_min = -M_PI_F;
+	longitude_max = M_PI_F;
 	fov = M_PI_4_F;
+	fov_pre = fov_post = fov;
 
 	sensorwidth = 0.036f;
 	sensorheight = 0.024f;
@@ -104,6 +84,7 @@ Camera::Camera()
 
 	need_update = true;
 	need_device_update = true;
+	need_flags_update = true;
 	previous_need_motion = -1;
 
 	graph = NULL;
@@ -116,19 +97,26 @@ Camera::~Camera()
 
 void Camera::compute_auto_viewplane()
 {
-	float aspect = (float)width/(float)height;
-
-	if(width >= height) {
-		viewplane.left = -aspect;
-		viewplane.right = aspect;
-		viewplane.bottom = -1.0f;
+	if(type == CAMERA_PANORAMA) {
+		viewplane.left = 0.0f;
+		viewplane.right = 1.0f;
+		viewplane.bottom = 0.0f;
 		viewplane.top = 1.0f;
 	}
 	else {
-		viewplane.left = -1.0f;
-		viewplane.right = 1.0f;
-		viewplane.bottom = -1.0f/aspect;
-		viewplane.top = 1.0f/aspect;
+		float aspect = (float)width/(float)height;
+		if(width >= height) {
+			viewplane.left = -aspect;
+			viewplane.right = aspect;
+			viewplane.bottom = -1.0f;
+			viewplane.top = 1.0f;
+		}
+		else {
+			viewplane.left = -1.0f;
+			viewplane.right = 1.0f;
+			viewplane.bottom = -1.0f/aspect;
+			viewplane.top = 1.0f/aspect;
+		}
 	}
 }
 
@@ -136,16 +124,17 @@ void Camera::update()
 {
 	if(!need_update)
 		return;
-	
+
+	/* Full viewport to camera border in the viewport. */
+	Transform fulltoborder = transform_from_viewplane(viewport_camera_border);
+	Transform bordertofull = transform_inverse(fulltoborder);
+
 	/* ndc to raster */
 	Transform screentocamera;
-	Transform ndctoraster = transform_scale(width, height, 1.0f);
+	Transform ndctoraster = transform_scale(width, height, 1.0f) * bordertofull;
 
 	/* raster to screen */
-	Transform screentondc = 
-		transform_scale(1.0f/(viewplane.right - viewplane.left),
-		                1.0f/(viewplane.top - viewplane.bottom), 1.0f) *
-		transform_translate(-viewplane.left, -viewplane.bottom, 0.0f);
+	Transform screentondc = fulltoborder * transform_from_viewplane(viewplane);
 
 	Transform screentoraster = ndctoraster * screentondc;
 	Transform rastertoscreen = transform_inverse(screentoraster);
@@ -188,15 +177,33 @@ void Camera::update()
 		     transform_perspective(&rastertocamera, make_float3(0, 0, 0));
 	}
 	else {
-		dx = make_float3(0, 0, 0);
-		dy = make_float3(0, 0, 0);
+		dx = make_float3(0.0f, 0.0f, 0.0f);
+		dy = make_float3(0.0f, 0.0f, 0.0f);
 	}
 
 	dx = transform_direction(&cameratoworld, dx);
 	dy = transform_direction(&cameratoworld, dy);
 
+	/* TODO(sergey): Support other types of camera. */
+	if(type == CAMERA_PERSPECTIVE) {
+		/* TODO(sergey): Move to a utility function and de-duplicate with
+		 * the calculation above.
+		 */
+		Transform screentocamera_pre =
+		        transform_inverse(transform_perspective(fov_pre,
+		                                                nearclip,
+		                                                farclip));
+		Transform screentocamera_post =
+		        transform_inverse(transform_perspective(fov_post,
+		                                                nearclip,
+		                                                farclip));
+		perspective_motion.pre = screentocamera_pre * rastertoscreen;
+		perspective_motion.post = screentocamera_post * rastertoscreen;
+	}
+
 	need_update = false;
 	need_device_update = true;
+	need_flags_update = true;
 }
 
 void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene)
@@ -205,7 +212,7 @@ void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 
 	update();
 
-	if (previous_need_motion != need_motion) {
+	if(previous_need_motion != need_motion) {
 		/* scene's motion model could have been changed since previous device
 		 * camera update this could happen for example in case when one render
 		 * layer has got motion pass and another not */
@@ -229,8 +236,10 @@ void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 
 	/* camera motion */
 	kcam->have_motion = 0;
+	kcam->have_perspective_motion = 0;
 
 	if(need_motion == Scene::MOTION_PASS) {
+		/* TODO(sergey): Support perspective (zoom, fov) motion. */
 		if(type == CAMERA_PANORAMA) {
 			if(use_motion) {
 				kcam->motion.pre = transform_inverse(motion.pre);
@@ -258,6 +267,10 @@ void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 			transform_motion_decompose((DecompMotionTransform*)&kcam->motion, &motion, &matrix);
 			kcam->have_motion = 1;
 		}
+		if(use_perspective_motion) {
+			kcam->perspective_motion = perspective_motion;
+			kcam->have_perspective_motion = 1;
+		}
 	}
 #endif
 
@@ -284,6 +297,8 @@ void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 	kcam->panorama_type = panorama_type;
 	kcam->fisheye_fov = fisheye_fov;
 	kcam->fisheye_lens = fisheye_lens;
+	kcam->equirectangular_range = make_float4(longitude_min - longitude_max, -longitude_min,
+	                                          latitude_min -  latitude_max, -latitude_min + M_PI_2_F);
 
 	/* sensor size */
 	kcam->sensorwidth = sensorwidth;
@@ -302,45 +317,13 @@ void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 	kcam->nearclip = nearclip;
 	kcam->cliplength = (farclip == FLT_MAX)? FLT_MAX: farclip - nearclip;
 
-	/* focal length */
-	kcam->focal_length = focal_length;
+	/* Camera in volume. */
+	kcam->is_inside_volume = 0;
 
-	need_device_update = false;
 	previous_need_motion = need_motion;
 
-	/* Camera in volume. */
-	kcam->is_inside_volume = 0;
-	if(use_camera_in_volume) {
-		if(type == CAMERA_PANORAMA) {
-			/* It's not clear how to do viewplace->object intersection for
-			 * panoramic cameras, for now let's just check for whether there
-			 * are any volumes in the scene.
-			 */
-			kcam->is_inside_volume = scene_has_volume(scene);
-		}
-		else {
-			/* TODO(sergey): Whole bunch of stuff here actually:
-			 * - We do rather stupid check with object AABB to camera viewplane
-			 *   AABB intersection, which is quite fast to perform, but which
-			 *   could give some false-positives checks here, More grained check
-			 *   would help avoiding time wasted n the kernel to initialize the
-			 *   volume stack.
-			 * - We could cache has_volume in the cache, would save quite a few
-			 *   CPU ticks when having loads of instanced meshes.
-			 */
-			BoundBox viewplane_boundbox = viewplane_bounds_get();
-			for(size_t i = 0; i < scene->objects.size(); ++i) {
-				Object *object = scene->objects[i];
-				if(object_has_volume(scene, object)) {
-					if(viewplane_boundbox.intersects(object->bounds)) {
-						/* TODO(sergey): Consider adding more grained check. */
-						kcam->is_inside_volume = 1;
-						break;
-					}
-				}
-			}
-		}
-	}
+	/* focal length */
+	kcam->focal_length = focal_length;
 
 	/* TODO(sergey): Make sure shaders are fully synced at this point. */
 	if(graph != NULL) {
@@ -351,7 +334,30 @@ void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 	}
 }
 
-void Camera::device_free(Device *device, DeviceScene *dscene)
+void Camera::device_update_volume(Device * /*device*/,
+                                  DeviceScene *dscene,
+                                  Scene *scene)
+{
+	if(!need_device_update && !need_flags_update) {
+		return;  /* Neither update flag is set, nothing to do. */
+	}
+	KernelCamera *kcam = &dscene->data.cam;
+	BoundBox viewplane_boundbox = viewplane_bounds_get();
+	for(size_t i = 0; i < scene->objects.size(); ++i) {
+		Object *object = scene->objects[i];
+		if(object->mesh->has_volume &&
+		   viewplane_boundbox.intersects(object->bounds))
+		{
+			/* TODO(sergey): Consider adding a finer-grained check. */
+			kcam->is_inside_volume = 1;
+			break;
+		}
+	}
+	need_device_update = false;
+	need_flags_update = false;
+}
+
+void Camera::device_free(Device * /*device*/, DeviceScene * /*dscene*/)
 {
 	/* nothing to free, only writing to constant memory */
 }
@@ -379,7 +385,11 @@ bool Camera::modified(const Camera& cam)
 		(aperture_ratio == cam.aperture_ratio) &&
 		(panorama_type == cam.panorama_type) &&
 		(fisheye_fov == cam.fisheye_fov) &&
-		(fisheye_lens == cam.fisheye_lens));
+		(fisheye_lens == cam.fisheye_lens) &&
+		(latitude_min == cam.latitude_min) &&
+		(latitude_max == cam.latitude_max) &&
+		(longitude_min == cam.longitude_min) &&
+		(longitude_max == cam.longitude_max));
 }
 
 bool Camera::motion_modified(const Camera& cam)
@@ -399,6 +409,7 @@ float3 Camera::transform_raster_to_world(float raster_x, float raster_y)
 	if(type == CAMERA_PERSPECTIVE) {
 		D = transform_perspective(&rastertocamera,
 		                          make_float3(raster_x, raster_y, 0.0f));
+		float3 Pclip = normalize(D);
 		P = make_float3(0.0f, 0.0f, 0.0f);
 		/* TODO(sergey): Aperture support? */
 		P = transform_point(&cameratoworld, P);
@@ -407,9 +418,9 @@ float3 Camera::transform_raster_to_world(float raster_x, float raster_y)
 		 * be mistakes in here, currently leading to wrong camera-in-volume
 		 * detection.
 		 */
-		P += nearclip * D;
+		P += nearclip * D / Pclip.z;
 	}
-	else if (type == CAMERA_ORTHOGRAPHIC) {
+	else if(type == CAMERA_ORTHOGRAPHIC) {
 		D = make_float3(0.0f, 0.0f, 1.0f);
 		/* TODO(sergey): Aperture support? */
 		P = transform_perspective(&rastertocamera,
@@ -425,21 +436,27 @@ float3 Camera::transform_raster_to_world(float raster_x, float raster_y)
 
 BoundBox Camera::viewplane_bounds_get()
 {
-	assert(type != CAMERA_PANORAMA);
-
 	/* TODO(sergey): This is all rather stupid, but is there a way to perform
 	 * checks we need in a more clear and smart fasion?
 	 */
 	BoundBox bounds = BoundBox::empty;
-	bounds.grow(transform_raster_to_world(0.0f, 0.0f));
-	bounds.grow(transform_raster_to_world(0.0f, (float)height));
-	bounds.grow(transform_raster_to_world((float)width, (float)height));
-	bounds.grow(transform_raster_to_world((float)width, 0.0f));
-	if(type == CAMERA_PERSPECTIVE) {
-		/* Center point has the most distancei in local Z axis,
-		 * use it to construct bounding box/
-		 */
-		bounds.grow(transform_raster_to_world(0.5f*width, 0.5f*height));
+
+	if(type == CAMERA_PANORAMA) {
+		bounds.grow(make_float3(cameratoworld.x.w,
+		                        cameratoworld.y.w,
+		                        cameratoworld.z.w));
+	}
+	else {
+		bounds.grow(transform_raster_to_world(0.0f, 0.0f));
+		bounds.grow(transform_raster_to_world(0.0f, (float)height));
+		bounds.grow(transform_raster_to_world((float)width, (float)height));
+		bounds.grow(transform_raster_to_world((float)width, 0.0f));
+		if(type == CAMERA_PERSPECTIVE) {
+			/* Center point has the most distance in local Z axis,
+			 * use it to construct bounding box.
+			 */
+			bounds.grow(transform_raster_to_world(0.5f*width, 0.5f*height));
+		}
 	}
 	return bounds;
 }
diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h
index ff93420bccc..cca5417740c 100644
--- a/intern/cycles/render/camera.h
+++ b/intern/cycles/render/camera.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __CAMERA_H__
@@ -55,6 +55,10 @@ public:
 	PanoramaType panorama_type;
 	float fisheye_fov;
 	float fisheye_lens;
+	float latitude_min;
+	float latitude_max;
+	float longitude_min;
+	float longitude_max;
 
 	/* anamorphic lens bokeh */
 	float aperture_ratio;
@@ -74,13 +78,16 @@ public:
 
 	/* border */
 	BoundBox2D border;
+	BoundBox2D viewport_camera_border;
 
 	/* transformation */
 	Transform matrix;
 
 	/* motion */
 	MotionTransform motion;
-	bool use_motion;
+	bool use_motion, use_perspective_motion;
+	float fov_pre, fov_post;
+	PerspectiveMotionTransform perspective_motion;
 
 	/* computed camera parameters */
 	Transform screentoworld;
@@ -102,15 +109,9 @@ public:
 	/* update */
 	bool need_update;
 	bool need_device_update;
+	bool need_flags_update;
 	int previous_need_motion;
 
-	/* Camera in volume. */
-	/* TODO(sergey): Get rid of this argument once
-	 * cameras in volume considered fast enough for
-	  * the regular kernel.
-	 */
-	bool use_camera_in_volume;
-
 	/* Camera ray nodes. */
 	CameraNodesGraph *graph;
 
@@ -120,19 +121,24 @@ public:
 	
 	void compute_auto_viewplane();
 
+	void set_graph(CameraNodesGraph *graph);
+
 	void update();
 
 	void device_update(Device *device, DeviceScene *dscene, Scene *scene);
+	void device_update_volume(Device *device, DeviceScene *dscene, Scene *scene);
 	void device_free(Device *device, DeviceScene *dscene);
 
 	bool modified(const Camera& cam);
 	bool motion_modified(const Camera& cam);
 	void tag_update();
 
+	/* Public utility functions. */
 	BoundBox viewplane_bounds_get();
-	float3 transform_raster_to_world(float raster_x, float raster_y);
 
-	void set_graph(CameraNodesGraph *graph);
+private:
+	/* Private utility functions. */
+	float3 transform_raster_to_world(float raster_x, float raster_y);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/curves.cpp b/intern/cycles/render/curves.cpp
index dc7665fe144..f671eb19cae 100644
--- a/intern/cycles/render/curves.cpp
+++ b/intern/cycles/render/curves.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "device.h"
@@ -103,7 +103,10 @@ CurveSystemManager::~CurveSystemManager()
 {
 }
 
-void CurveSystemManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
+void CurveSystemManager::device_update(Device *device,
+                                       DeviceScene *dscene,
+                                       Scene * /*scene*/,
+                                       Progress& progress)
 {
 	if(!need_update)
 		return;
@@ -144,7 +147,8 @@ void CurveSystemManager::device_update(Device *device, DeviceScene *dscene, Scen
 	need_update = false;
 }
 
-void CurveSystemManager::device_free(Device *device, DeviceScene *dscene)
+void CurveSystemManager::device_free(Device * /*device*/,
+                                     DeviceScene * /*dscene*/)
 {
 
 }
@@ -174,7 +178,7 @@ bool CurveSystemManager::modified_mesh(const CurveSystemManager& CurveSystemMana
 		use_curves == CurveSystemManager.use_curves);
 }
 
-void CurveSystemManager::tag_update(Scene *scene)
+void CurveSystemManager::tag_update(Scene * /*scene*/)
 {
 	need_update = true;
 }
diff --git a/intern/cycles/render/curves.h b/intern/cycles/render/curves.h
index 6fd0fff57fb..22ab5d05f8a 100644
--- a/intern/cycles/render/curves.h
+++ b/intern/cycles/render/curves.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __CURVES_H__
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 09973e8bc86..58080289633 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "camera.h"
@@ -144,8 +144,28 @@ void Pass::add(PassType type, vector<Pass>& passes)
 			pass.exposure = false;
 			break;
 		case PASS_LIGHT:
-			/* ignores */
+			/* This isn't a real pass, used by baking to see whether
+			 * light data is needed or not.
+			 *
+			 * Set components to 0 so the pass sort below happens in a
+			 * deterministic way.
+			 */
+			pass.components = 0;
+			break;
+#ifdef WITH_CYCLES_DEBUG
+		case PASS_BVH_TRAVERSAL_STEPS:
+			pass.components = 1;
+			pass.exposure = false;
 			break;
+		case PASS_BVH_TRAVERSED_INSTANCES:
+			pass.components = 1;
+			pass.exposure = false;
+			break;
+		case PASS_RAY_BOUNCES:
+			pass.components = 1;
+			pass.exposure = false;
+			break;
+#endif
 	}
 
 	passes.push_back(pass);
@@ -181,7 +201,7 @@ bool Pass::contains(const vector<Pass>& passes, PassType type)
 
 /* Pixel Filter */
 
-static float filter_func_box(float v, float width)
+static float filter_func_box(float /*v*/, float /*width*/)
 {
 	return 1.0f;
 }
@@ -388,6 +408,19 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 			case PASS_LIGHT:
 				kfilm->use_light_pass = 1;
 				break;
+
+#ifdef WITH_CYCLES_DEBUG
+			case PASS_BVH_TRAVERSAL_STEPS:
+				kfilm->pass_bvh_traversal_steps = kfilm->pass_stride;
+				break;
+			case PASS_BVH_TRAVERSED_INSTANCES:
+				kfilm->pass_bvh_traversed_instances = kfilm->pass_stride;
+				break;
+			case PASS_RAY_BOUNCES:
+				kfilm->pass_ray_bounces = kfilm->pass_stride;
+				break;
+#endif
+
 			case PASS_NONE:
 				break;
 		}
@@ -411,7 +444,9 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 	need_update = false;
 }
 
-void Film::device_free(Device *device, DeviceScene *dscene, Scene *scene)
+void Film::device_free(Device * /*device*/,
+                       DeviceScene * /*dscene*/,
+                       Scene *scene)
 {
 	if(filter_table_offset != TABLE_OFFSET_INVALID) {
 		scene->lookup_tables->remove_table(filter_table_offset);
@@ -446,7 +481,7 @@ void Film::tag_passes_update(Scene *scene, const vector<Pass>& passes_)
 	passes = passes_;
 }
 
-void Film::tag_update(Scene *scene)
+void Film::tag_update(Scene * /*scene*/)
 {
 	need_update = true;
 }
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index cc7183bfd95..e2cd63cc2ed 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __FILM_H__
diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp
index d6c6d019b03..dad1e2bafa0 100644
--- a/intern/cycles/render/graph.cpp
+++ b/intern/cycles/render/graph.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "attribute.h"
@@ -34,7 +34,7 @@ ShaderInput::ShaderInput(ShaderNode *parent_, const char *name_, ShaderSocketTyp
 	name = name_;
 	type = type_;
 	link = NULL;
-	value = make_float3(0, 0, 0);
+	value = make_float3(0.0f, 0.0f, 0.0f);
 	stack_offset = SVM_STACK_INVALID;
 	default_value = NONE;
 	usage = USE_ALL;
@@ -228,6 +228,21 @@ void ShaderGraph::disconnect(ShaderInput *to)
 	from->links.erase(remove(from->links.begin(), from->links.end(), to), from->links.end());
 }
 
+void ShaderGraph::relink(vector<ShaderInput*> inputs, vector<ShaderInput*> outputs, ShaderOutput *output)
+{
+	/* Cut the link feeding each socket in `inputs`, then detach every socket
+	 * in `outputs` and, when `output` is non-NULL, attach it to `output`. */
+	foreach(ShaderInput *in, inputs) {
+		if(in->link != NULL)
+			disconnect(in);
+	}
+	foreach(ShaderInput *out, outputs) {
+		disconnect(out);
+		if(output != NULL)
+			connect(output, out);
+	}
+}
+
 void ShaderGraph::finalize(bool do_bump, bool do_osl)
 {
 	/* before compiling, the shader graph may undergo a number of modifications.
@@ -322,6 +337,8 @@ void ShaderGraph::remove_unneeded_nodes()
 	vector<bool> removed(num_node_ids, false);
 	bool any_node_removed = false;
 
+	ShaderNode *geom = NULL;
+
 	/* find and unlink proxy nodes */
 	foreach(ShaderNode *node, nodes) {
 		if(node->special_type == SHADER_SPECIAL_TYPE_PROXY) {
@@ -375,6 +392,58 @@ void ShaderGraph::remove_unneeded_nodes()
 			removed[proxy->id] = true;
 			any_node_removed = true;
 		}
+		else if(node->special_type == SHADER_SPECIAL_TYPE_BACKGROUND) {
+			BackgroundNode *bg = static_cast<BackgroundNode*>(node);
+
+			if(bg->outputs[0]->links.size()) {
+				/* Black color or zero strength, remove node */
+				if((!bg->inputs[0]->link && bg->inputs[0]->value == make_float3(0.0f, 0.0f, 0.0f)) ||
+				   (!bg->inputs[1]->link && bg->inputs[1]->value.x == 0.0f)) {
+					vector<ShaderInput*> inputs = bg->outputs[0]->links;
+
+					relink(bg->inputs, inputs, NULL);
+					removed[bg->id] = true;
+					any_node_removed = true;
+				}
+			}
+		}
+		else if(node->special_type == SHADER_SPECIAL_TYPE_EMISSION) {
+			EmissionNode *em = static_cast<EmissionNode*>(node);
+
+			if(em->outputs[0]->links.size()) {
+				/* Black color or zero strength, remove node */
+				if((!em->inputs[0]->link && em->inputs[0]->value == make_float3(0.0f, 0.0f, 0.0f)) ||
+				   (!em->inputs[1]->link && em->inputs[1]->value.x == 0.0f)) {
+					vector<ShaderInput*> inputs = em->outputs[0]->links;
+
+					relink(em->inputs, inputs, NULL);
+					removed[em->id] = true;
+					any_node_removed = true;
+				}
+			}
+		}
+		else if(node->special_type == SHADER_SPECIAL_TYPE_BUMP) {
+			BumpNode *bump = static_cast<BumpNode*>(node);
+
+			if(bump->outputs[0]->links.size()) {
+				/* Height input is not connected. */
+				/* TODO(sergey): Ignore bump with zero strength. */
+				if(bump->inputs[0]->link == NULL) {
+					vector<ShaderInput*> inputs = bump->outputs[0]->links;
+					if(bump->inputs[4]->link == NULL) {
+						if(geom == NULL) {
+							geom = new GeometryNode();
+						}
+						relink(bump->inputs, inputs, geom->output("Normal"));
+					}
+					else {
+						relink(bump->inputs, inputs, bump->input("Normal")->link);
+					}
+					removed[bump->id] = true;
+					any_node_removed = true;
+				}
+			}
+		}
 		else if(node->special_type == SHADER_SPECIAL_TYPE_MIX_CLOSURE) {
 			MixClosureNode *mix = static_cast<MixClosureNode*>(node);
 
@@ -383,15 +452,7 @@ void ShaderGraph::remove_unneeded_nodes()
 				ShaderOutput *output = mix->inputs[1]->link;
 				vector<ShaderInput*> inputs = mix->outputs[0]->links;
 
-				foreach(ShaderInput *sock, mix->inputs)
-					if(sock->link)
-						disconnect(sock);
-
-				foreach(ShaderInput *input, inputs) {
-					disconnect(input);
-					if(output)
-						connect(output, input);
-				}
+				relink(mix->inputs, inputs, output);
 				removed[mix->id] = true;
 				any_node_removed = true;
 			}
@@ -404,15 +465,7 @@ void ShaderGraph::remove_unneeded_nodes()
 					ShaderOutput *output = mix->inputs[1]->link;
 					vector<ShaderInput*> inputs = mix->outputs[0]->links;
 
-					foreach(ShaderInput *sock, mix->inputs)
-						if(sock->link)
-							disconnect(sock);
-
-					foreach(ShaderInput *input, inputs) {
-						disconnect(input);
-						if(output)
-							connect(output, input);
-					}
+					relink(mix->inputs, inputs, output);
 					removed[mix->id] = true;
 					any_node_removed = true;
 				}
@@ -420,16 +473,8 @@ void ShaderGraph::remove_unneeded_nodes()
 				else if(mix->inputs[0]->value.x == 1.0f) {
 					ShaderOutput *output = mix->inputs[2]->link;
 					vector<ShaderInput*> inputs = mix->outputs[0]->links;
-					
-					foreach(ShaderInput *sock, mix->inputs)
-						if(sock->link)
-							disconnect(sock);
-
-					foreach(ShaderInput *input, inputs) {
-						disconnect(input);
-						if(output)
-							connect(output, input);
-					}
+
+					relink(mix->inputs, inputs, output);
 					removed[mix->id] = true;
 					any_node_removed = true;
 				}
@@ -446,15 +491,7 @@ void ShaderGraph::remove_unneeded_nodes()
 					ShaderOutput *output = mix->inputs[1]->link;
 					vector<ShaderInput*> inputs = mix->outputs[0]->links;
 
-					foreach(ShaderInput *sock, mix->inputs)
-						if(sock->link)
-							disconnect(sock);
-
-					foreach(ShaderInput *input, inputs) {
-						disconnect(input);
-						if(output)
-							connect(output, input);
-					}
+					relink(mix->inputs, inputs, output);
 					removed[mix->id] = true;
 					any_node_removed = true;
 				}
@@ -463,15 +500,7 @@ void ShaderGraph::remove_unneeded_nodes()
 					ShaderOutput *output = mix->inputs[2]->link;
 					vector<ShaderInput*> inputs = mix->outputs[0]->links;
 
-					foreach(ShaderInput *sock, mix->inputs)
-						if(sock->link)
-							disconnect(sock);
-
-					foreach(ShaderInput *input, inputs) {
-						disconnect(input);
-						if(output)
-							connect(output, input);
-					}
+					relink(mix->inputs, inputs, output);
 					removed[mix->id] = true;
 					any_node_removed = true;
 				}
@@ -492,6 +521,10 @@ void ShaderGraph::remove_unneeded_nodes()
 
 		nodes = newnodes;
 	}
+
+	if(geom != NULL) {
+		add(geom);
+	}
 }
 
 void ShaderGraph::break_cycles(ShaderNode *node, vector<bool>& visited, vector<bool>& on_stack)
@@ -520,7 +553,7 @@ void ShaderGraph::break_cycles(ShaderNode *node, vector<bool>& visited, vector<b
 
 void ShaderGraph::clean()
 {
-	/* remove proxy and unnecessary mix nodes */
+	/* remove proxy and unnecessary nodes */
 	remove_unneeded_nodes();
 
 	/* we do two things here: find cycles and break them, and remove unused
@@ -556,7 +589,7 @@ void ShaderGraph::clean()
 		else
 			delete node;
 	}
-	
+
 	nodes = newnodes;
 }
 
@@ -685,7 +718,7 @@ void ShaderGraph::bump_from_displacement()
 	 * different shifted coordinates.
 	 *
 	 * these 3 displacement values are then fed into the bump node, which will
-	 * output the the perturbed normal. */
+	 * output the perturbed normal. */
 
 	ShaderInput *displacement_in = output()->input("Displacement");
 
@@ -738,10 +771,18 @@ void ShaderGraph::bump_from_displacement()
 	/* connect bump output to normal input nodes that aren't set yet. actually
 	 * this will only set the normal input to the geometry node that we created
 	 * and connected to all other normal inputs already. */
-	foreach(ShaderNode *node, nodes)
-		foreach(ShaderInput *input, node->inputs)
+	foreach(ShaderNode *node, nodes) {
+		/* Don't connect normal to the bump node we're coming from,
+		 * otherwise it'll be a cycle in graph.
+		 */
+		if(node == bump) {
+			continue;
+		}
+		foreach(ShaderInput *input, node->inputs) {
 			if(!input->link && input->default_value == ShaderInput::NORMAL)
 				connect(set_normal->output("Normal"), input);
+		}
+	}
 
 	/* for displacement bump, clear the normal input in case the above loop
 	 * connected the setnormal out to the bump normalin */
@@ -836,6 +877,26 @@ void ShaderGraph::transform_multi_closure(ShaderNode *node, ShaderOutput *weight
 	}
 }
 
+int ShaderGraph::get_num_closures()
+{
+	/* Weighted count of closure nodes: BSSRDF counts as 3, glass as 2,
+	 * any other closure as 1.
+	 */
+	int total = 0;
+	foreach(ShaderNode *node, nodes) {
+		if(node->special_type != SHADER_SPECIAL_TYPE_CLOSURE)
+			continue;
+		/* TODO(sergey): Make it more generic approach, maybe some utility
+		 * macros like CLOSURE_IS_FOO()?
+		 */
+		BsdfNode *bsdf = static_cast<BsdfNode*>(node);
+		total += CLOSURE_IS_BSSRDF(bsdf->closure) ? 3
+		       : CLOSURE_IS_GLASS(bsdf->closure) ? 2
+		       : 1;
+	}
+	return total;
+}
+
 void ShaderGraph::dump_graph(const char *filename)
 {
 	FILE *fd = fopen(filename, "w");
@@ -845,31 +906,61 @@ void ShaderGraph::dump_graph(const char *filename)
 		return;
 	}
 
-	fprintf(fd, "digraph dependencygraph {\n");
+	fprintf(fd, "digraph shader_graph {\n");
 	fprintf(fd, "ranksep=1.5\n");
+	fprintf(fd, "rankdir=LR\n");
 	fprintf(fd, "splines=false\n");
 
 	foreach(ShaderNode *node, nodes) {
 		fprintf(fd, "// NODE: %p\n", node);
-		fprintf(fd,
-		        "\"%p\" [shape=record,label=\"%s\"]\n",
-		        node,
-		        node->name.c_str());
+		fprintf(fd, "\"%p\" [shape=record,label=\"{", node);
+		if(node->inputs.size()) {
+			fprintf(fd, "{");
+			foreach(ShaderInput *socket, node->inputs) {
+				if(socket != node->inputs[0]) {
+					fprintf(fd, "|");
+				}
+				fprintf(fd, "<IN_%p>%s", socket, socket->name);
+			}
+			fprintf(fd, "}|");
+		}
+		fprintf(fd, "%s", node->name.c_str());
+		if(node->bump == SHADER_BUMP_CENTER) {
+			fprintf(fd, " (bump:center)");
+		}
+		else if(node->bump == SHADER_BUMP_DX) {
+			fprintf(fd, " (bump:dx)");
+		}
+		else if(node->bump == SHADER_BUMP_DY) {
+			fprintf(fd, " (bump:dy)");
+		}
+		if(node->outputs.size()) {
+			fprintf(fd, "|{");
+			foreach(ShaderOutput *socket, node->outputs) {
+				if(socket != node->outputs[0]) {
+					fprintf(fd, "|");
+				}
+				fprintf(fd, "<OUT_%p>%s", socket, socket->name);
+			}
+			fprintf(fd, "}");
+		}
+		fprintf(fd, "}\"]");
 	}
 
 	foreach(ShaderNode *node, nodes) {
 		foreach(ShaderOutput *output, node->outputs) {
 			foreach(ShaderInput *input, output->links) {
 				fprintf(fd,
-				        "// CONNECTION: %p->%p (%s:%s)\n",
+				        "// CONNECTION: OUT_%p->IN_%p (%s:%s)\n",
 				        output,
 				        input,
 				        output->name, input->name);
 				fprintf(fd,
-				        "\"%p\":s -> \"%p\":n [label=\"%s:%s\"]\n",
+				        "\"%p\":\"OUT_%p\":e -> \"%p\":\"IN_%p\":w [label=\"\"]\n",
 				        output->parent,
+				        output,
 				        input->parent,
-				        output->name, input->name);
+				        input);
 			}
 		}
 	}
diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h
index 29fa1403597..e61c20c416a 100644
--- a/intern/cycles/render/graph.h
+++ b/intern/cycles/render/graph.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __GRAPH_H__
@@ -80,7 +80,12 @@ enum ShaderNodeSpecialType {
 	SHADER_SPECIAL_TYPE_MIX_RGB, /* Only Mix subtype */
 	SHADER_SPECIAL_TYPE_AUTOCONVERT,
 	SHADER_SPECIAL_TYPE_GEOMETRY,
-	SHADER_SPECIAL_TYPE_SCRIPT
+	SHADER_SPECIAL_TYPE_SCRIPT,
+	SHADER_SPECIAL_TYPE_BACKGROUND,
+	SHADER_SPECIAL_TYPE_IMAGE_SLOT,
+	SHADER_SPECIAL_TYPE_CLOSURE,
+	SHADER_SPECIAL_TYPE_EMISSION,
+	SHADER_SPECIAL_TYPE_BUMP,
 };
 
 /* Enum
@@ -193,9 +198,9 @@ public:
 	virtual bool has_surface_emission() { return false; }
 	virtual bool has_surface_transparent() { return false; }
 	virtual bool has_surface_bssrdf() { return false; }
-	virtual bool has_converter_blackbody() { return false; }
 	virtual bool has_bssrdf_bump() { return false; }
 	virtual bool has_spatial_varying() { return false; }
+	virtual bool has_object_dependency() { return false; }
 
 	vector<ShaderInput*> inputs;
 	vector<ShaderOutput*> outputs;
@@ -205,6 +210,24 @@ public:
 	ShaderBump bump; /* for bump mapping utility */
 	
 	ShaderNodeSpecialType special_type;	/* special node type */
+
+	/* ** Selective nodes compilation ** */
+
+	/* TODO(sergey): More explicitly mention in the function names
+	 * that those functions are for selective compilation only?
+	 */
+
+	/* Nodes are split into several groups: the level 0 group contains
+	 * the most commonly used nodes, and each further level extends the
+	 * previous one with less commonly used nodes.
+	 */
+	virtual int get_group() { return NODE_GROUP_LEVEL_0; }
+
+	/* Node features are used to disable huge nodes inside a group,
+	 * so it's possible to disable huge nodes even within a required
+	 * node group.
+	 */
+	virtual int get_feature() { return bump == SHADER_BUMP_NONE ? 0 : NODE_FEATURE_BUMP; }
 };
 
 
@@ -247,10 +270,13 @@ public:
 
 	void connect(ShaderOutput *from, ShaderInput *to);
 	void disconnect(ShaderInput *to);
+	void relink(vector<ShaderInput*> inputs, vector<ShaderInput*> outputs, ShaderOutput *output);
 
 	void remove_unneeded_nodes();
 	void finalize(bool do_bump = false, bool do_osl = false);
 
+	int get_num_closures();
+
 	void dump_graph(const char *filename);
 
 protected:
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp
index 076cc3d8b63..7bceb8a8bfa 100644
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "device.h"
@@ -151,15 +151,27 @@ bool ImageManager::is_float_image(const string& filename, void *builtin_data, bo
 	return is_float;
 }
 
-static bool image_equals(ImageManager::Image *image, const string& filename, void *builtin_data, InterpolationType interpolation)
+static bool image_equals(ImageManager::Image *image,
+                         const string& filename,
+                         void *builtin_data,
+                         InterpolationType interpolation,
+                         ExtensionType extension)
 {
 	return image->filename == filename &&
 	       image->builtin_data == builtin_data &&
-	       image->interpolation == interpolation;
+	       image->interpolation == interpolation &&
+	       image->extension == extension;
 }
 
-int ImageManager::add_image(const string& filename, void *builtin_data, bool animated, float frame,
-	bool& is_float, bool& is_linear, InterpolationType interpolation, bool use_alpha)
+int ImageManager::add_image(const string& filename,
+                            void *builtin_data,
+                            bool animated,
+                            float frame,
+                            bool& is_float,
+                            bool& is_linear,
+                            InterpolationType interpolation,
+                            ExtensionType extension,
+                            bool use_alpha)
 {
 	Image *img;
 	size_t slot;
@@ -171,11 +183,20 @@ int ImageManager::add_image(const string& filename, void *builtin_data, bool ani
 		/* find existing image */
 		for(slot = 0; slot < float_images.size(); slot++) {
 			img = float_images[slot];
-			if(img && image_equals(img, filename, builtin_data, interpolation)) {
+			if(img && image_equals(img,
+			                       filename,
+			                       builtin_data,
+			                       interpolation,
+			                       extension))
+			{
 				if(img->frame != frame) {
 					img->frame = frame;
 					img->need_load = true;
 				}
+				if(img->use_alpha != use_alpha) {
+					img->use_alpha = use_alpha;
+					img->need_load = true;
+				}
 				img->users++;
 				return slot;
 			}
@@ -206,6 +227,7 @@ int ImageManager::add_image(const string& filename, void *builtin_data, bool ani
 		img->animated = animated;
 		img->frame = frame;
 		img->interpolation = interpolation;
+		img->extension = extension;
 		img->users = 1;
 		img->use_alpha = use_alpha;
 
@@ -214,11 +236,20 @@ int ImageManager::add_image(const string& filename, void *builtin_data, bool ani
 	else {
 		for(slot = 0; slot < images.size(); slot++) {
 			img = images[slot];
-			if(img && image_equals(img, filename, builtin_data, interpolation)) {
+			if(img && image_equals(img,
+			                       filename,
+			                       builtin_data,
+			                       interpolation,
+			                       extension))
+			{
 				if(img->frame != frame) {
 					img->frame = frame;
 					img->need_load = true;
 				}
+				if(img->use_alpha != use_alpha) {
+					img->use_alpha = use_alpha;
+					img->need_load = true;
+				}
 				img->users++;
 				return slot+tex_image_byte_start;
 			}
@@ -249,6 +280,7 @@ int ImageManager::add_image(const string& filename, void *builtin_data, bool ani
 		img->animated = animated;
 		img->frame = frame;
 		img->interpolation = interpolation;
+		img->extension = extension;
 		img->users = 1;
 		img->use_alpha = use_alpha;
 
@@ -292,12 +324,20 @@ void ImageManager::remove_image(int slot)
 	}
 }
 
-void ImageManager::remove_image(const string& filename, void *builtin_data, InterpolationType interpolation)
+void ImageManager::remove_image(const string& filename,
+                                void *builtin_data,
+                                InterpolationType interpolation,
+                                ExtensionType extension)
 {
 	size_t slot;
 
 	for(slot = 0; slot < images.size(); slot++) {
-		if(images[slot] && image_equals(images[slot], filename, builtin_data, interpolation)) {
+		if(images[slot] && image_equals(images[slot],
+		                                filename,
+		                                builtin_data,
+		                                interpolation,
+		                                extension))
+		{
 			remove_image(slot+tex_image_byte_start);
 			break;
 		}
@@ -306,7 +346,11 @@ void ImageManager::remove_image(const string& filename, void *builtin_data, Inte
 	if(slot == images.size()) {
 		/* see if it's in a float texture slot */
 		for(slot = 0; slot < float_images.size(); slot++) {
-			if(float_images[slot] && image_equals(float_images[slot], filename, builtin_data, interpolation)) {
+			if(float_images[slot] && image_equals(float_images[slot],
+			                                      filename,
+			                                      builtin_data,
+			                                      interpolation,
+			                                      extension)) {
 				remove_image(slot);
 				break;
 			}
@@ -318,12 +362,19 @@ void ImageManager::remove_image(const string& filename, void *builtin_data, Inte
  * without bunch of arguments passing around making code readability even
  * more cluttered.
  */
-void ImageManager::tag_reload_image(const string& filename, void *builtin_data, InterpolationType interpolation)
+void ImageManager::tag_reload_image(const string& filename,
+                                    void *builtin_data,
+                                    InterpolationType interpolation,
+                                    ExtensionType extension)
 {
 	size_t slot;
 
 	for(slot = 0; slot < images.size(); slot++) {
-		if(images[slot] && image_equals(images[slot], filename, builtin_data, interpolation)) {
+		if(images[slot] && image_equals(images[slot],
+		                                filename,
+		                                builtin_data,
+		                                interpolation,
+		                                extension)) {
 			images[slot]->need_load = true;
 			break;
 		}
@@ -332,7 +383,11 @@ void ImageManager::tag_reload_image(const string& filename, void *builtin_data,
 	if(slot == images.size()) {
 		/* see if it's in a float texture slot */
 		for(slot = 0; slot < float_images.size(); slot++) {
-			if(float_images[slot] && image_equals(float_images[slot], filename, builtin_data, interpolation)) {
+			if(float_images[slot] && image_equals(float_images[slot],
+			                                      filename,
+			                                      builtin_data,
+			                                      interpolation,
+			                                      extension)) {
 				float_images[slot]->need_load = true;
 				break;
 			}
@@ -399,7 +454,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 			int scanlinesize = width*components*sizeof(uchar);
 
 			in->read_image(TypeDesc::UINT8,
-				(uchar*)pixels + (height-1)*scanlinesize,
+				(uchar*)pixels + (((size_t)height)-1)*scanlinesize,
 				AutoStride,
 				-scanlinesize,
 				AutoStride);
@@ -417,9 +472,10 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 		builtin_image_pixels_cb(img->filename, img->builtin_data, pixels);
 	}
 
+	size_t num_pixels = ((size_t)width) * height * depth;
 	if(cmyk) {
 		/* CMYK */
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+2] = (pixels[i*4+2]*pixels[i*4+3])/255;
 			pixels[i*4+1] = (pixels[i*4+1]*pixels[i*4+3])/255;
 			pixels[i*4+0] = (pixels[i*4+0]*pixels[i*4+3])/255;
@@ -428,7 +484,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 	}
 	else if(components == 2) {
 		/* grayscale + alpha */
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+3] = pixels[i*2+1];
 			pixels[i*4+2] = pixels[i*2+0];
 			pixels[i*4+1] = pixels[i*2+0];
@@ -437,7 +493,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 	}
 	else if(components == 3) {
 		/* RGB */
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+3] = 255;
 			pixels[i*4+2] = pixels[i*3+2];
 			pixels[i*4+1] = pixels[i*3+1];
@@ -446,7 +502,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 	}
 	else if(components == 1) {
 		/* grayscale */
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+3] = 255;
 			pixels[i*4+2] = pixels[i];
 			pixels[i*4+1] = pixels[i];
@@ -455,7 +511,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 	}
 
 	if(img->use_alpha == false) {
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+3] = 255;
 		}
 	}
@@ -521,7 +577,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 		vector<float> tmppixels;
 
 		if(components > 4) {
-			tmppixels.resize(width*height*components);
+			tmppixels.resize(((size_t)width)*height*components);
 			readpixels = &tmppixels[0];
 		}
 
@@ -539,7 +595,8 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 		}
 
 		if(components > 4) {
-			for(int i = width*height-1; i >= 0; i--) {
+			size_t dimensions = ((size_t)width)*height;
+			for(size_t i = dimensions-1, pixel = 0; pixel < dimensions; pixel++, i--) {
 				pixels[i*4+3] = tmppixels[i*components+3];
 				pixels[i*4+2] = tmppixels[i*components+2];
 				pixels[i*4+1] = tmppixels[i*components+1];
@@ -558,9 +615,10 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 		builtin_image_float_pixels_cb(img->filename, img->builtin_data, pixels);
 	}
 
+	size_t num_pixels = ((size_t)width) * height * depth;
 	if(cmyk) {
 		/* CMYK */
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+3] = 255;
 			pixels[i*4+2] = (pixels[i*4+2]*pixels[i*4+3])/255;
 			pixels[i*4+1] = (pixels[i*4+1]*pixels[i*4+3])/255;
@@ -569,7 +627,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 	}
 	else if(components == 2) {
 		/* grayscale + alpha */
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+3] = pixels[i*2+1];
 			pixels[i*4+2] = pixels[i*2+0];
 			pixels[i*4+1] = pixels[i*2+0];
@@ -578,7 +636,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 	}
 	else if(components == 3) {
 		/* RGB */
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+3] = 1.0f;
 			pixels[i*4+2] = pixels[i*3+2];
 			pixels[i*4+1] = pixels[i*3+1];
@@ -587,7 +645,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 	}
 	else if(components == 1) {
 		/* grayscale */
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+3] = 1.0f;
 			pixels[i*4+2] = pixels[i];
 			pixels[i*4+1] = pixels[i];
@@ -596,7 +654,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 	}
 
 	if(img->use_alpha == false) {
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+3] = 1.0f;
 		}
 	}
@@ -653,7 +711,10 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, int sl
 
 		if(!pack_images) {
 			thread_scoped_lock device_lock(device_mutex);
-			device->tex_alloc(name.c_str(), tex_img, img->interpolation, true);
+			device->tex_alloc(name.c_str(),
+			                  tex_img,
+			                  img->interpolation,
+			                  img->extension);
 		}
 	}
 	else {
@@ -685,7 +746,10 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, int sl
 
 		if(!pack_images) {
 			thread_scoped_lock device_lock(device_mutex);
-			device->tex_alloc(name.c_str(), tex_img, img->interpolation, true);
+			device->tex_alloc(name.c_str(),
+			                  tex_img,
+			                  img->interpolation,
+			                  img->extension);
 		}
 	}
 
@@ -783,7 +847,36 @@ void ImageManager::device_update(Device *device, DeviceScene *dscene, Progress&
 	need_update = false;
 }
 
-void ImageManager::device_pack_images(Device *device, DeviceScene *dscene, Progress& progess)
+void ImageManager::device_update_slot(Device *device,
+                                      DeviceScene *dscene,
+                                      int slot,
+                                      Progress *progress)
+{
+	/* Update one slot: free the image when it has no users, (re)load it when
+	 * need_load is set. Slots from tex_image_byte_start on index images. */
+	Image *image;
+	if(slot >= tex_image_byte_start) {
+		int byte_slot = slot - tex_image_byte_start;
+		assert(images[byte_slot] != NULL);
+		image = images[byte_slot];
+	}
+	else {
+		assert(float_images[slot] != NULL);
+		image = float_images[slot];
+	}
+	if(image->users == 0) {
+		device_free_image(device, dscene, slot);
+	}
+	else if(image->need_load) {
+		/* Test the image resolved above; float_images[slot] is wrong for byte slots. */
+		if(!osl_texture_system || image->builtin_data)
+			device_load_image(device, dscene, slot, progress);
+	}
+}
+
+void ImageManager::device_pack_images(Device *device,
+                                      DeviceScene *dscene,
+                                      Progress& /*progess*/)
 {
 	/* for OpenCL, we pack all image textures inside a single big texture, and
 	 * will do our own interpolation in the kernel */
diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h
index 535f0ff156d..bcc58ae951b 100644
--- a/intern/cycles/render/image.h
+++ b/intern/cycles/render/image.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __IMAGE_H__
@@ -55,14 +55,28 @@ public:
 	ImageManager();
 	~ImageManager();
 
-	int add_image(const string& filename, void *builtin_data, bool animated, float frame,
-		bool& is_float, bool& is_linear, InterpolationType interpolation, bool use_alpha);
+	int add_image(const string& filename,
+	              void *builtin_data,
+	              bool animated,
+	              float frame,
+	              bool& is_float,
+	              bool& is_linear,
+	              InterpolationType interpolation,
+	              ExtensionType extension,
+	              bool use_alpha);
 	void remove_image(int slot);
-	void remove_image(const string& filename, void *builtin_data, InterpolationType interpolation);
-	void tag_reload_image(const string& filename, void *builtin_data, InterpolationType interpolation);
+	void remove_image(const string& filename,
+	                  void *builtin_data,
+	                  InterpolationType interpolation,
+	                  ExtensionType extension);
+	void tag_reload_image(const string& filename,
+	                      void *builtin_data,
+	                      InterpolationType interpolation,
+	                      ExtensionType extension);
 	bool is_float_image(const string& filename, void *builtin_data, bool& is_linear);
 
 	void device_update(Device *device, DeviceScene *dscene, Progress& progress);
+	void device_update_slot(Device *device, DeviceScene *dscene, int slot, Progress *progress);
 	void device_free(Device *device, DeviceScene *dscene);
 	void device_free_builtin(Device *device, DeviceScene *dscene);
 
@@ -73,9 +87,9 @@ public:
 
 	bool need_update;
 
-	boost::function<void(const string &filename, void *data, bool &is_float, int &width, int &height, int &depth, int &channels)> builtin_image_info_cb;
-	boost::function<bool(const string &filename, void *data, unsigned char *pixels)> builtin_image_pixels_cb;
-	boost::function<bool(const string &filename, void *data, float *pixels)> builtin_image_float_pixels_cb;
+	function<void(const string &filename, void *data, bool &is_float, int &width, int &height, int &depth, int &channels)> builtin_image_info_cb;
+	function<bool(const string &filename, void *data, unsigned char *pixels)> builtin_image_pixels_cb;
+	function<bool(const string &filename, void *data, float *pixels)> builtin_image_float_pixels_cb;
 
 	struct Image {
 		string filename;
@@ -86,6 +100,7 @@ public:
 		bool animated;
 		float frame;
 		InterpolationType interpolation;
+		ExtensionType extension;
 
 		int users;
 	};
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index 03a8cd5d2d3..9f8d5b50ccd 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "device.h"
@@ -39,7 +39,6 @@ Integrator::Integrator()
 	transparent_max_bounce = max_bounce;
 	transparent_shadows = false;
 
-	volume_homogeneous_sampling = 0;
 	volume_max_steps = 1024;
 	volume_step_size = 0.1f;
 
@@ -60,6 +59,10 @@ Integrator::Integrator()
 	mesh_light_samples = 1;
 	subsurface_samples = 1;
 	volume_samples = 1;
+
+	sample_all_lights_direct = true;
+	sample_all_lights_indirect = true;
+
 	method = PATH;
 
 	sampling_pattern = SAMPLING_PATTERN_SOBOL;
@@ -97,6 +100,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	 * transparent shaders in the scene. Otherwise we can disable it
 	 * to improve performance a bit. */
 	if(transparent_shadows) {
+		kintegrator->transparent_shadows = false;
 		foreach(Shader *shader, scene->shaders) {
 			/* keep this in sync with SD_HAS_TRANSPARENT_SHADOW in shader.cpp */
 			if((shader->has_surface_transparent && shader->use_transparent_shadow) || shader->has_volume) {
@@ -189,7 +193,6 @@ bool Integrator::modified(const Integrator& integrator)
 		transparent_min_bounce == integrator.transparent_min_bounce &&
 		transparent_max_bounce == integrator.transparent_max_bounce &&
 		transparent_shadows == integrator.transparent_shadows &&
-		volume_homogeneous_sampling == integrator.volume_homogeneous_sampling &&
 		volume_max_steps == integrator.volume_max_steps &&
 		volume_step_size == integrator.volume_step_size &&
 		caustics_reflective == integrator.caustics_reflective &&
@@ -214,7 +217,7 @@ bool Integrator::modified(const Integrator& integrator)
 		sample_all_lights_indirect == integrator.sample_all_lights_indirect);
 }
 
-void Integrator::tag_update(Scene *scene)
+void Integrator::tag_update(Scene * /*scene*/)
 {
 	need_update = true;
 }
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 13c10e8ca94..fd2ad14488d 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __INTEGRATOR_H__
@@ -39,7 +39,6 @@ public:
 	int transparent_max_bounce;
 	bool transparent_shadows;
 
-	int volume_homogeneous_sampling;
 	int volume_max_steps;
 	float volume_step_size;
 
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 1f006637e67..4e962616263 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "background.h"
@@ -26,6 +26,7 @@
 
 #include "util_foreach.h"
 #include "util_progress.h"
+#include "util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -82,6 +83,8 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res
 	device->mem_free(d_input);
 	device->mem_free(d_output);
 
+	d_input.clear();
+
 	float4 *d_output_data = reinterpret_cast<float4*>(d_output.data_pointer);
 
 	pixels.resize(width*height);
@@ -125,6 +128,9 @@ Light::Light()
 
 	shader = 0;
 	samples = 1;
+	max_bounces = 1024;
+
+	is_portal = false;
 }
 
 void Light::tag_update(Scene *scene)
@@ -132,6 +138,17 @@ void Light::tag_update(Scene *scene)
 	scene->light_manager->need_update = true;
 }
 
+bool Light::has_contribution(Scene *scene)
+{
+	/* A portal never contributes by itself. Of the remaining lights the
+	 * background always does, and any other light only does when the shader
+	 * assigned to it has surface emission. Short-circuit evaluation keeps the
+	 * shader lookup from running for portals and background lights, exactly
+	 * as early returns would. */
+	return !is_portal &&
+	       (type == LIGHT_BACKGROUND || scene->shaders[shader]->has_surface_emission);
+}
+
 /* Light Manager */
 
 LightManager::LightManager()
@@ -149,10 +166,17 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 	progress.set_status("Updating Lights", "Computing distribution");
 
 	/* count */
-	size_t num_lights = scene->lights.size();
+	size_t num_lights = 0;
 	size_t num_background_lights = 0;
 	size_t num_triangles = 0;
 
+	bool background_mis = false;
+
+	foreach(Light *light, scene->lights) {
+		if(light->has_contribution(scene))
+			num_lights++;
+	}
+
 	foreach(Object *object, scene->objects) {
 		Mesh *mesh = object->mesh;
 		bool have_emission = false;
@@ -283,22 +307,29 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 	float trianglearea = totarea;
 
 	/* point lights */
-	float lightarea = (totarea > 0.0f)? totarea/scene->lights.size(): 1.0f;
+	float lightarea = (totarea > 0.0f) ? totarea / num_lights : 1.0f;
 	bool use_lamp_mis = false;
 
-	for(int i = 0; i < scene->lights.size(); i++, offset++) {
-		Light *light = scene->lights[i];
+	int light_index = 0;
+	foreach(Light *light, scene->lights) {
+		if(!light->has_contribution(scene))
+			continue;
 
 		distribution[offset].x = totarea;
-		distribution[offset].y = __int_as_float(~(int)i);
+		distribution[offset].y = __int_as_float(~light_index);
 		distribution[offset].z = 1.0f;
 		distribution[offset].w = light->size;
 		totarea += lightarea;
 
 		if(light->size > 0.0f && light->use_mis)
 			use_lamp_mis = true;
-		if(light->type == LIGHT_BACKGROUND)
+		if(light->type == LIGHT_BACKGROUND) {
 			num_background_lights++;
+			background_mis = light->use_mis;
+		}
+
+		light_index++;
+		offset++;
 	}
 
 	/* normalize cumulative distribution functions */
@@ -360,6 +391,18 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 
 		/* CDF */
 		device->tex_alloc("__light_distribution", dscene->light_distribution);
+
+		/* Portals are counted explicitly. Lights that neither contribute nor
+		 * act as a portal are skipped when light data is packed, so deriving
+		 * the count from scene->lights.size() - light_index over-counts. */
+		int num_portals = 0;
+		foreach(Light *light, scene->lights)
+			num_portals += light->is_portal ? 1 : 0;
+
+		const bool use_portals = (num_background_lights > 0 && num_portals > 0);
+		kintegrator->portal_offset = use_portals ? light_index : 0;
+		kintegrator->num_portals = use_portals ? num_portals : 0;
+		kintegrator->portal_pdf = use_portals ? (background_mis ? 0.5f : 1.0f) : 0.0f;
 	}
 	else {
 		dscene->light_distribution.clear();
@@ -370,10 +413,53 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 		kintegrator->pdf_lights = 0.0f;
 		kintegrator->inv_pdf_lights = 0.0f;
 		kintegrator->use_lamp_mis = false;
+		kintegrator->num_portals = 0;
+		kintegrator->portal_offset = 0;
+		kintegrator->portal_pdf = 0.0f;
+
 		kfilm->pass_shadow_scale = 1.0f;
 	}
 }
 
+static void background_cdf(int start,
+                           int end,
+                           int res,
+                           int cdf_count,
+                           const vector<float3> *pixels,
+                           float2 *cond_cdf)
+{
+	/* Conditional CDFs (rows, U direction), built for rows [start, end). */
+	for(int i = start; i < end; i++) {
+		float sin_theta = sinf(M_PI_F * (i + 0.5f) / res);
+		float3 env_color = (*pixels)[i * res];
+		float ave_luminance = average(env_color);
+
+		cond_cdf[i * cdf_count].x = ave_luminance * sin_theta;
+		cond_cdf[i * cdf_count].y = 0.0f;
+
+		for(int j = 1; j < res; j++) {
+			env_color = (*pixels)[i * res + j];
+			ave_luminance = average(env_color);
+			cond_cdf[i * cdf_count + j].x = ave_luminance * sin_theta;
+			cond_cdf[i * cdf_count + j].y = cond_cdf[i * cdf_count + j - 1].y + cond_cdf[i * cdf_count + j - 1].x / res;
+		}
+
+		float cdf_total = cond_cdf[i * cdf_count + res - 1].y + cond_cdf[i * cdf_count + res - 1].x / res;
+		/* stuff the total into the brightness value for the last entry, because
+		 * we are going to normalize the CDFs to 0.0 to 1.0 afterwards */
+		cond_cdf[i * cdf_count + res].x = cdf_total;
+
+		/* Only take the reciprocal of a non-zero total (an all-black row sums to 0). */
+		if(cdf_total > 0.0f) {
+			const float cdf_total_inv = 1.0f / cdf_total;
+			for(int j = 1; j < res; j++)
+				cond_cdf[i * cdf_count + j].y *= cdf_total_inv;
+		}
+
+		cond_cdf[i * cdf_count + res].y = 1.0f;
+	}
+}
+
 void LightManager::device_update_background(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
 	KernelIntegrator *kintegrator = &dscene->data.integrator;
@@ -414,34 +500,30 @@ void LightManager::device_update_background(Device *device, DeviceScene *dscene,
 	float2 *marg_cdf = dscene->light_background_marginal_cdf.resize(cdf_count);
 	float2 *cond_cdf = dscene->light_background_conditional_cdf.resize(cdf_count * cdf_count);
 
-	/* conditional CDFs (rows, U direction) */
-	for(int i = 0; i < res; i++) {
-		float sin_theta = sinf(M_PI_F * (i + 0.5f) / res);
-		float3 env_color = pixels[i * res];
-		float ave_luminamce = average(env_color);
-
-		cond_cdf[i * cdf_count].x = ave_luminamce * sin_theta;
-		cond_cdf[i * cdf_count].y = 0.0f;
-
-		for(int j = 1; j < res; j++) {
-			env_color = pixels[i * res + j];
-			ave_luminamce = average(env_color);
-
-			cond_cdf[i * cdf_count + j].x = ave_luminamce * sin_theta;
-			cond_cdf[i * cdf_count + j].y = cond_cdf[i * cdf_count + j - 1].y + cond_cdf[i * cdf_count + j - 1].x / res;
+	double time_start = time_dt();
+	if(res < 512) {
+		/* Small enough resolution, faster to do single-threaded. */
+		background_cdf(0, res, res, cdf_count, &pixels, cond_cdf);
+	}
+	else {
+		/* Threaded evaluation for large resolution. */
+		const int num_blocks = TaskScheduler::num_threads();
+		const int chunk_size = res / num_blocks;
+		int start_row = 0;
+		TaskPool pool;
+		for(int i = 0; i < num_blocks; ++i) {
+			const int current_chunk_size =
+			    (i != num_blocks - 1) ? chunk_size
+			                          : (res - i * chunk_size);
+			pool.push(function_bind(&background_cdf,
+			                        start_row, start_row + current_chunk_size,
+			                        res,
+			                        cdf_count,
+			                        &pixels,
+			                        cond_cdf));
+			start_row += current_chunk_size;
 		}
-
-		float cdf_total = cond_cdf[i * cdf_count + res - 1].y + cond_cdf[i * cdf_count + res - 1].x / res;
-
-		/* stuff the total into the brightness value for the last entry, because
-		 * we are going to normalize the CDFs to 0.0 to 1.0 afterwards */
-		cond_cdf[i * cdf_count + res].x = cdf_total;
-
-		if(cdf_total > 0.0f)
-			for(int j = 1; j < res; j++)
-				cond_cdf[i * cdf_count + j].y /= cdf_total;
-
-		cond_cdf[i * cdf_count + res].y = 1.0f;
+		pool.wait_work();
 	}
 
 	/* marginal CDFs (column, V direction, sum of rows) */
@@ -462,6 +544,8 @@ void LightManager::device_update_background(Device *device, DeviceScene *dscene,
 
 	marg_cdf[res].y = 1.0f;
 
+	VLOG(2) << "Background MIS build time " << time_dt() - time_start << "\n";
+
 	/* update device */
 	device->tex_alloc("__light_background_marginal_cdf", dscene->light_background_marginal_cdf);
 	device->tex_alloc("__light_background_conditional_cdf", dscene->light_background_conditional_cdf);
@@ -472,10 +556,8 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 	if(scene->lights.size() == 0)
 		return;
 
-	float4 *light_data = dscene->light_data.resize(scene->lights.size()*LIGHT_SIZE);
-
-	if(!device->info.advanced_shading) {
-		/* remove unsupported light */
+	/* remove background light? */
+	if(!(device->info.advanced_shading)) {
 		foreach(Light *light, scene->lights) {
 			if(light->type == LIGHT_BACKGROUND) {
 				scene->lights.erase(std::remove(scene->lights.begin(), scene->lights.end(), light), scene->lights.end());
@@ -484,11 +566,18 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 		}
 	}
 
-	for(size_t i = 0; i < scene->lights.size(); i++) {
-		Light *light = scene->lights[i];
+	float4 *light_data = dscene->light_data.resize(scene->lights.size()*LIGHT_SIZE);
+	int light_index = 0;
+
+	foreach(Light *light, scene->lights) {
+		if(!light->has_contribution(scene)) {
+			continue;
+		}
+
 		float3 co = light->co;
-		int shader_id = scene->shader_manager->get_shader_id(scene->lights[i]->shader);
+		int shader_id = scene->shader_manager->get_shader_id(light->shader);
 		float samples = __int_as_float(light->samples);
+		float max_bounces = __int_as_float(light->max_bounces);
 
 		if(!light->cast_shadow)
 			shader_id &= ~SHADER_CAST_SHADOW;
@@ -519,10 +608,11 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 			if(light->use_mis && radius > 0.0f)
 				shader_id |= SHADER_USE_MIS;
 
-			light_data[i*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z);
-			light_data[i*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, invarea, 0.0f);
-			light_data[i*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			light_data[i*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z);
+			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, invarea, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
 		else if(light->type == LIGHT_DISTANT) {
 			shader_id &= ~SHADER_AREA_LIGHT;
@@ -533,17 +623,17 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 			float area = M_PI_F*radius*radius;
 			float invarea = (area > 0.0f)? 1.0f/area: 1.0f;
 			float3 dir = light->dir;
-			
-			if(len(dir) > 0.0f)
-				dir = normalize(dir);
+
+			dir = safe_normalize(dir);
 
 			if(light->use_mis && area > 0.0f)
 				shader_id |= SHADER_USE_MIS;
 
-			light_data[i*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), dir.x, dir.y, dir.z);
-			light_data[i*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, cosangle, invarea);
-			light_data[i*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			light_data[i*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), dir.x, dir.y, dir.z);
+			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, cosangle, invarea);
+			light_data[light_index*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
 		else if(light->type == LIGHT_BACKGROUND) {
 			uint visibility = scene->background->visibility;
@@ -568,10 +658,11 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 				use_light_visibility = true;
 			}
 
-			light_data[i*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), 0.0f, 0.0f, 0.0f);
-			light_data[i*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), 0.0f, 0.0f, 0.0f);
-			light_data[i*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			light_data[i*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
 		else if(light->type == LIGHT_AREA) {
 			float3 axisu = light->axisu*(light->sizeu*light->size);
@@ -580,16 +671,16 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 			float invarea = (area > 0.0f)? 1.0f/area: 1.0f;
 			float3 dir = light->dir;
 			
-			if(len(dir) > 0.0f)
-				dir = normalize(dir);
+			dir = safe_normalize(dir);
 
 			if(light->use_mis && area > 0.0f)
 				shader_id |= SHADER_USE_MIS;
 
-			light_data[i*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z);
-			light_data[i*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), axisu.x, axisu.y, axisu.z);
-			light_data[i*LIGHT_SIZE + 2] = make_float4(invarea, axisv.x, axisv.y, axisv.z);
-			light_data[i*LIGHT_SIZE + 3] = make_float4(samples, dir.x, dir.y, dir.z);
+			light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z);
+			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), axisu.x, axisu.y, axisu.z);
+			light_data[light_index*LIGHT_SIZE + 2] = make_float4(invarea, axisv.x, axisv.y, axisv.z);
+			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, dir.x, dir.y, dir.z);
+			light_data[light_index*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
 		else if(light->type == LIGHT_SPOT) {
 			shader_id &= ~SHADER_AREA_LIGHT;
@@ -600,24 +691,57 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 			float spot_smooth = (1.0f - spot_angle)*light->spot_smooth;
 			float3 dir = light->dir;
 			
-			if(len(dir) > 0.0f)
-				dir = normalize(dir);
+			dir = safe_normalize(dir);
 
 			if(light->use_mis && radius > 0.0f)
 				shader_id |= SHADER_USE_MIS;
 
-			light_data[i*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z);
-			light_data[i*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, invarea, spot_angle);
-			light_data[i*LIGHT_SIZE + 2] = make_float4(spot_smooth, dir.x, dir.y, dir.z);
-			light_data[i*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z);
+			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, invarea, spot_angle);
+			light_data[light_index*LIGHT_SIZE + 2] = make_float4(spot_smooth, dir.x, dir.y, dir.z);
+			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
+
+		light_index++;
 	}
-	
+
+	/* TODO(sergey): Consider moving portals update to their own function
+	 * keeping this one more manageable.
+	 */
+	foreach(Light *light, scene->lights) {
+		if(!light->is_portal)
+			continue;
+		assert(light->type == LIGHT_AREA);
+
+		float3 co = light->co;
+		float3 axisu = light->axisu*(light->sizeu*light->size);
+		float3 axisv = light->axisv*(light->sizev*light->size);
+		float area = len(axisu)*len(axisv);
+		float invarea = (area > 0.0f) ? 1.0f / area : 1.0f;
+		float3 dir = light->dir;
+
+		dir = safe_normalize(dir);
+
+		light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z);
+		light_data[light_index*LIGHT_SIZE + 1] = make_float4(area, axisu.x, axisu.y, axisu.z);
+		light_data[light_index*LIGHT_SIZE + 2] = make_float4(invarea, axisv.x, axisv.y, axisv.z);
+		light_data[light_index*LIGHT_SIZE + 3] = make_float4(-1, dir.x, dir.y, dir.z);
+		light_data[light_index*LIGHT_SIZE + 4] = make_float4(-1, 0.0f, 0.0f, 0.0f);
+
+		light_index++;
+	}
+
+	VLOG(1) << "Number of lights without contribution: "
+	        << scene->lights.size() - light_index;
+
 	device->tex_alloc("__light_data", dscene->light_data);
 }
 
 void LightManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
+	VLOG(1) << "Total " << scene->lights.size() << " lights.";
+
 	if(!need_update)
 		return;
 
@@ -655,7 +779,7 @@ void LightManager::device_free(Device *device, DeviceScene *dscene)
 	dscene->light_background_conditional_cdf.clear();
 }
 
-void LightManager::tag_update(Scene *scene)
+void LightManager::tag_update(Scene * /*scene*/)
 {
 	need_update = true;
 }
diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h
index 89091bb5f9e..afec3628dda 100644
--- a/intern/cycles/render/light.h
+++ b/intern/cycles/render/light.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __LIGHT_H__
@@ -56,10 +56,16 @@ public:
 	bool use_transmission;
 	bool use_scatter;
 
+	bool is_portal;
+
 	int shader;
 	int samples;
+	int max_bounces;
 
 	void tag_update(Scene *scene);
+
+	/* Check whether the light has contribution to the scene. */
+	bool has_contribution(Scene *scene);
 };
 
 class LightManager {
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index 5602609c030..57f194651f8 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "bvh.h"
@@ -20,9 +20,11 @@
 #include "camera.h"
 #include "curves.h"
 #include "device.h"
+#include "graph.h"
 #include "shader.h"
 #include "light.h"
 #include "mesh.h"
+#include "nodes.h"
 #include "object.h"
 #include "scene.h"
 
@@ -30,6 +32,7 @@
 
 #include "util_cache.h"
 #include "util_foreach.h"
+#include "util_logging.h"
 #include "util_progress.h"
 #include "util_set.h"
 
@@ -93,6 +96,8 @@ Mesh::Mesh()
 
 	attributes.triangle_mesh = this;
 	curve_attributes.curve_mesh = this;
+
+	has_volume = false;
 }
 
 Mesh::~Mesh()
@@ -132,7 +137,7 @@ void Mesh::clear()
 	transform_applied = false;
 	transform_negative_scaled = false;
 	transform_normal = transform_identity();
-	geometry_synced = false;
+	geometry_flags = GEOMETRY_NONE;
 }
 
 int Mesh::split_vertex(int vertex)
@@ -207,11 +212,11 @@ void Mesh::compute_bounds()
 			bnds.grow(float4_to_float3(curve_keys[i]), curve_keys[i].w);
 
 		Attribute *attr = attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-		if (use_motion_blur && attr) {
+		if(use_motion_blur && attr) {
 			size_t steps_size = verts.size() * (motion_steps - 1);
 			float3 *vert_steps = attr->data_float3();
 	
-			for (size_t i = 0; i < steps_size; i++)
+			for(size_t i = 0; i < steps_size; i++)
 				bnds.grow(vert_steps[i]);
 		}
 
@@ -220,7 +225,7 @@ void Mesh::compute_bounds()
 			size_t steps_size = curve_keys.size() * (motion_steps - 1);
 			float3 *key_steps = curve_attr->data_float3();
 	
-			for (size_t i = 0; i < steps_size; i++)
+			for(size_t i = 0; i < steps_size; i++)
 				bnds.grow(key_steps[i]);
 		}
 
@@ -234,19 +239,19 @@ void Mesh::compute_bounds()
 			for(size_t i = 0; i < curve_keys_size; i++)
 				bnds.grow_safe(float4_to_float3(curve_keys[i]), curve_keys[i].w);
 			
-			if (use_motion_blur && attr) {
+			if(use_motion_blur && attr) {
 				size_t steps_size = verts.size() * (motion_steps - 1);
 				float3 *vert_steps = attr->data_float3();
 		
-				for (size_t i = 0; i < steps_size; i++)
+				for(size_t i = 0; i < steps_size; i++)
 					bnds.grow_safe(vert_steps[i]);
 			}
 
-			if (use_motion_blur && curve_attr) {
+			if(use_motion_blur && curve_attr) {
 				size_t steps_size = curve_keys.size() * (motion_steps - 1);
 				float3 *key_steps = curve_attr->data_float3();
 		
-				for (size_t i = 0; i < steps_size; i++)
+				for(size_t i = 0; i < steps_size; i++)
 					bnds.grow_safe(key_steps[i]);
 			}
 		}
@@ -508,7 +513,6 @@ void Mesh::compute_bvh(SceneParams *params, Progress *progress, int n, int total
 			progress->set_status(msg, "Building BVH");
 
 			BVHParams bparams;
-			bparams.use_cache = params->use_bvh_cache;
 			bparams.use_spatial_split = params->use_bvh_spatial_split;
 			bparams.use_qbvh = params->use_qbvh;
 
@@ -553,6 +557,7 @@ MeshManager::MeshManager()
 {
 	bvh = NULL;
 	need_update = true;
+	need_flags_update = true;
 }
 
 MeshManager::~MeshManager()
@@ -649,6 +654,10 @@ void MeshManager::update_osl_attributes(Device *device, Scene *scene, vector<Att
 			}
 		}
 	}
+#else
+	(void)device;
+	(void)scene;
+	(void)mesh_attributes;
 #endif
 }
 
@@ -746,8 +755,49 @@ void MeshManager::update_svm_attributes(Device *device, DeviceScene *dscene, Sce
 	device->tex_alloc("__attributes_map", dscene->attributes_map);
 }
 
-static void update_attribute_element_offset(Mesh *mesh, vector<float>& attr_float, vector<float4>& attr_float3, vector<uchar4>& attr_uchar4,
-	Attribute *mattr, TypeDesc& type, int& offset, AttributeElement& element)
+static void update_attribute_element_size(Mesh *mesh,
+                                          Attribute *mattr,
+                                          size_t *attr_float_size,
+                                          size_t *attr_float3_size,
+                                          size_t *attr_uchar4_size)
+{
+	/* Add the storage one attribute needs to the size counter matching its
+	 * element/type. A NULL attribute adds nothing. */
+	if(mattr == NULL) {
+		return;
+	}
+
+	const size_t num_elements = mattr->element_size(mesh->verts.size(),
+	                                                mesh->triangles.size(),
+	                                                mesh->motion_steps,
+	                                                mesh->curves.size(),
+	                                                mesh->curve_keys.size());
+
+	if(mattr->element == ATTR_ELEMENT_VOXEL) {
+		/* Voxel attributes are not counted in any of the three sizes. */
+		return;
+	}
+	if(mattr->element == ATTR_ELEMENT_CORNER_BYTE)
+		*attr_uchar4_size += num_elements;
+	else if(mattr->type == TypeDesc::TypeFloat)
+		*attr_float_size += num_elements;
+	else if(mattr->type == TypeDesc::TypeMatrix)
+		*attr_float3_size += num_elements * 4;
+	else
+		*attr_float3_size += num_elements;
+}
+
+static void update_attribute_element_offset(Mesh *mesh,
+                                            vector<float>& attr_float,
+                                            size_t& attr_float_offset,
+                                            vector<float4>& attr_float3,
+                                            size_t& attr_float3_offset,
+                                            vector<uchar4>& attr_uchar4,
+                                            size_t& attr_uchar4_offset,
+                                            Attribute *mattr,
+                                            TypeDesc& type,
+                                            int& offset,
+                                            AttributeElement& element)
 {
 	if(mattr) {
 		/* store element and type */
@@ -769,39 +819,43 @@ static void update_attribute_element_offset(Mesh *mesh, vector<float>& attr_floa
 		}
 		else if(mattr->element == ATTR_ELEMENT_CORNER_BYTE) {
 			uchar4 *data = mattr->data_uchar4();
-			offset = attr_uchar4.size();
-
-			attr_uchar4.resize(attr_uchar4.size() + size);
+			offset = attr_uchar4_offset;
 
-			for(size_t k = 0; k < size; k++)
+			assert(attr_uchar4.capacity() >= offset + size);
+			for(size_t k = 0; k < size; k++) {
 				attr_uchar4[offset+k] = data[k];
+			}
+			attr_uchar4_offset += size;
 		}
 		else if(mattr->type == TypeDesc::TypeFloat) {
 			float *data = mattr->data_float();
-			offset = attr_float.size();
+			offset = attr_float_offset;
 
-			attr_float.resize(attr_float.size() + size);
-
-			for(size_t k = 0; k < size; k++)
+			assert(attr_float.capacity() >= offset + size);
+			for(size_t k = 0; k < size; k++) {
 				attr_float[offset+k] = data[k];
+			}
+			attr_float_offset += size;
 		}
 		else if(mattr->type == TypeDesc::TypeMatrix) {
 			Transform *tfm = mattr->data_transform();
-			offset = attr_float3.size();
-
-			attr_float3.resize(attr_float3.size() + size*4);
+			offset = attr_float3_offset;
 
-			for(size_t k = 0; k < size*4; k++)
+			assert(attr_float3.capacity() >= offset + size * 4);
+			for(size_t k = 0; k < size*4; k++) {
 				attr_float3[offset+k] = (&tfm->x)[k];
+			}
+			attr_float3_offset += size * 4;
 		}
 		else {
 			float4 *data = mattr->data_float4();
-			offset = attr_float3.size();
-
-			attr_float3.resize(attr_float3.size() + size);
+			offset = attr_float3_offset;
 
-			for(size_t k = 0; k < size; k++)
+			assert(attr_float3.capacity() >= offset + size);
+			for(size_t k = 0; k < size; k++) {
 				attr_float3[offset+k] = data[k];
+			}
+			attr_float3_offset += size;
 		}
 
 		/* mesh vertex/curve index is global, not per object, so we sneak
@@ -851,16 +905,16 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene,
 	/* mesh attribute are stored in a single array per data type. here we fill
 	 * those arrays, and set the offset and element type to create attribute
 	 * maps next */
-	vector<float> attr_float;
-	vector<float4> attr_float3;
-	vector<uchar4> attr_uchar4;
 
+	/* Pre-allocate the attribute arrays to avoid re-allocating them while
+	 * filling, which would take 2x of the overall attribute memory usage.
+	 */
+	size_t attr_float_size = 0;
+	size_t attr_float3_size = 0;
+	size_t attr_uchar4_size = 0;
 	for(size_t i = 0; i < scene->meshes.size(); i++) {
 		Mesh *mesh = scene->meshes[i];
 		AttributeRequestSet& attributes = mesh_attributes[i];
-
-		/* todo: we now store std and name attributes from requests even if
-		 * they actually refer to the same mesh attributes, optimize */
 		foreach(AttributeRequest& req, attributes.requests) {
 			Attribute *triangle_mattr = mesh->attributes.find(req);
 			Attribute *curve_mattr = mesh->curve_attributes.find(req);
@@ -874,12 +928,56 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene,
 					memcpy(triangle_mattr->data_float3(), &mesh->verts[0], sizeof(float3)*mesh->verts.size());
 			}
 
-			update_attribute_element_offset(mesh, attr_float, attr_float3, attr_uchar4, triangle_mattr,
-				req.triangle_type, req.triangle_offset, req.triangle_element);
+			update_attribute_element_size(mesh,
+			                              triangle_mattr,
+			                              &attr_float_size,
+			                              &attr_float3_size,
+			                              &attr_uchar4_size);
+			update_attribute_element_size(mesh,
+			                              curve_mattr,
+			                              &attr_float_size,
+			                              &attr_float3_size,
+			                              &attr_uchar4_size);
+		}
+	}
+
+	vector<float> attr_float(attr_float_size);
+	vector<float4> attr_float3(attr_float3_size);
+	vector<uchar4> attr_uchar4(attr_uchar4_size);
+
+	size_t attr_float_offset = 0;
+	size_t attr_float3_offset = 0;
+	size_t attr_uchar4_offset = 0;
+
+	/* Fill in attributes. */
+	for(size_t i = 0; i < scene->meshes.size(); i++) {
+		Mesh *mesh = scene->meshes[i];
+		AttributeRequestSet& attributes = mesh_attributes[i];
+
+		/* todo: we now store std and name attributes from requests even if
+		 * they actually refer to the same mesh attributes, optimize */
+		foreach(AttributeRequest& req, attributes.requests) {
+			Attribute *triangle_mattr = mesh->attributes.find(req);
+			Attribute *curve_mattr = mesh->curve_attributes.find(req);
+
+			update_attribute_element_offset(mesh,
+			                                attr_float, attr_float_offset,
+			                                attr_float3, attr_float3_offset,
+			                                attr_uchar4, attr_uchar4_offset,
+			                                triangle_mattr,
+			                                req.triangle_type,
+			                                req.triangle_offset,
+			                                req.triangle_element);
+
+			update_attribute_element_offset(mesh,
+			                                attr_float, attr_float_offset,
+			                                attr_float3, attr_float3_offset,
+			                                attr_uchar4, attr_uchar4_offset,
+			                                curve_mattr,
+			                                req.curve_type,
+			                                req.curve_offset,
+			                                req.curve_element);
 
-			update_attribute_element_offset(mesh, attr_float, attr_float3, attr_uchar4, curve_mattr,
-				req.curve_type, req.curve_offset, req.curve_element);
-	
 			if(progress.get_cancel()) return;
 		}
 	}
@@ -978,11 +1076,13 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene *
 	/* bvh build */
 	progress.set_status("Updating Scene BVH", "Building");
 
+	VLOG(1) << (scene->params.use_qbvh ? "Using QBVH optimization structure"
+	                                   : "Using regular BVH optimization structure");
+
 	BVHParams bparams;
 	bparams.top_level = true;
 	bparams.use_qbvh = scene->params.use_qbvh;
 	bparams.use_spatial_split = scene->params.use_bvh_spatial_split;
-	bparams.use_cache = scene->params.use_bvh_cache;
 
 	delete bvh;
 	bvh = BVH::create(bparams, scene->objects);
@@ -999,6 +1099,10 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene *
 		dscene->bvh_nodes.reference((float4*)&pack.nodes[0], pack.nodes.size());
 		device->tex_alloc("__bvh_nodes", dscene->bvh_nodes);
 	}
+	if(pack.leaf_nodes.size()) {
+		dscene->bvh_leaf_nodes.reference((float4*)&pack.leaf_nodes[0], pack.leaf_nodes.size());
+		device->tex_alloc("__bvh_leaf_nodes", dscene->bvh_leaf_nodes);
+	}
 	if(pack.object_node.size()) {
 		dscene->object_node.reference((uint*)&pack.object_node[0], pack.object_node.size());
 		device->tex_alloc("__object_node", dscene->object_node);
@@ -1025,18 +1129,91 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene *
 	}
 
 	dscene->data.bvh.root = pack.root_index;
+	dscene->data.bvh.use_qbvh = scene->params.use_qbvh;
+}
+
+/* Recompute Mesh::has_volume for every mesh from the shaders it uses. */
+void MeshManager::device_update_flags(Device * /*device*/,
+                                      DeviceScene * /*dscene*/,
+                                      Scene * scene,
+                                      Progress& /*progress*/)
+{
+	if(need_update || need_flags_update) {
+		foreach(Mesh *mesh, scene->meshes) {
+			bool any_volume = false;
+			foreach(uint shader_index, mesh->used_shaders) {
+				if(scene->shaders[shader_index]->has_volume) {
+					any_volume = true;
+				}
+			}
+			mesh->has_volume = any_volume;
+		}
+		need_flags_update = false;
+	}
+}
+
+/* Update the device images used by the bump graphs of meshes tagged for update. */
+void MeshManager::device_update_displacement_images(Device *device,
+                                                    DeviceScene *dscene,
+                                                    Scene *scene,
+                                                    Progress& progress)
+{
+	progress.set_status("Updating Displacement Images");
+	TaskPool pool;
+	ImageManager *image_manager = scene->image_manager;
+	set<int> bump_images;
+	foreach(Mesh *mesh, scene->meshes) {
+		if(!mesh->need_update) {
+			continue;
+		}
+		foreach(uint shader_index, mesh->used_shaders) {
+			Shader *shader = scene->shaders[shader_index];
+			if(shader->graph_bump == NULL) {
+				continue;
+			}
+			foreach(ShaderNode* node, shader->graph_bump->nodes) {
+				if(node->special_type != SHADER_SPECIAL_TYPE_IMAGE_SLOT) {
+					continue;
+				}
+				if(device->info.pack_images) {
+					/* If device requires packed images we need to update all
+					 * images now, even if they're not used for displacement.
+					 */
+					image_manager->device_update(device, dscene, progress);
+					return;
+				}
+				const int slot = static_cast<ImageSlotNode*>(node)->slot;
+				if(slot != -1) {
+					bump_images.insert(slot);
+				}
+			}
+		}
+	}
+	/* Each collected slot is pushed to the pool as a separate task. */
+	foreach(int slot, bump_images) {
+		pool.push(function_bind(&ImageManager::device_update_slot,
+		                        image_manager,
+		                        device,
+		                        dscene,
+		                        slot,
+		                        &progress));
+	}
+	pool.wait_work();
 }
 
 void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
+	VLOG(1) << "Total " << scene->meshes.size() << " meshes.";
+
 	if(!need_update)
 		return;
 
 	/* update normals */
 	foreach(Mesh *mesh, scene->meshes) {
-		foreach(uint shader, mesh->used_shaders)
+		foreach(uint shader, mesh->used_shaders) {
 			if(scene->shaders[shader]->need_update_attributes)
 				mesh->need_update = true;
+		}
 
 		if(mesh->need_update) {
 			mesh->add_face_normals();
@@ -1046,6 +1223,28 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 		}
 	}
 
+	/* Update images needed for true displacement. */
+	bool need_displacement_images = false;
+	bool old_need_object_flags_update = false;
+	foreach(Mesh *mesh, scene->meshes) {
+		if(mesh->need_update &&
+		   mesh->displacement_method != Mesh::DISPLACE_BUMP)
+		{
+			need_displacement_images = true;
+			break;
+		}
+	}
+	if(need_displacement_images) {
+		VLOG(1) << "Updating images used for true displacement.";
+		device_update_displacement_images(device, dscene, scene, progress);
+		old_need_object_flags_update = scene->object_manager->need_flags_update;
+		scene->object_manager->device_update_flags(device,
+		                                           dscene,
+		                                           scene,
+		                                           progress,
+		                                           false);
+	}
+
 	/* device update */
 	device_free(device, dscene);
 
@@ -1087,13 +1286,19 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 
 	foreach(Mesh *mesh, scene->meshes) {
 		if(mesh->need_update) {
-			pool.push(function_bind(&Mesh::compute_bvh, mesh, &scene->params, &progress, i, num_bvh));
-			i++;
+			pool.push(function_bind(&Mesh::compute_bvh,
+			                        mesh,
+			                        &scene->params,
+			                        &progress,
+			                        i,
+			                        num_bvh));
+			if(!mesh->transform_applied) {
+				i++;
+			}
 		}
 	}
 
 	pool.wait_work();
-	
 	foreach(Shader *shader, scene->shaders)
 		shader->need_update_attributes = false;
 
@@ -1104,6 +1309,8 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 	bool motion_blur = false;
 #endif
 
+	/* update objects */
+	vector<Object *> volume_objects;
 	foreach(Object *object, scene->objects)
 		object->compute_bounds(motion_blur);
 
@@ -1112,11 +1319,22 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 	device_update_bvh(device, dscene, scene, progress);
 
 	need_update = false;
+
+	if(need_displacement_images) {
+		/* Re-tag flags for update, so they're re-evaluated
+		 * for meshes with correct bounding boxes.
+		 *
+		 * This wouldn't cause wrong results, just true
+		 * displacement might be less optimal to calculate.
+		 */
+		scene->object_manager->need_flags_update = old_need_object_flags_update;
+	}
 }
 
 void MeshManager::device_free(Device *device, DeviceScene *dscene)
 {
 	device->tex_free(dscene->bvh_nodes);
+	device->tex_free(dscene->bvh_leaf_nodes);
 	device->tex_free(dscene->object_node);
 	device->tex_free(dscene->tri_woop);
 	device->tex_free(dscene->prim_type);
diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h
index 28cee5745ea..76c186a3feb 100644
--- a/intern/cycles/render/mesh.h
+++ b/intern/cycles/render/mesh.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __MESH_H__
@@ -71,14 +71,21 @@ public:
 	ustring name;
 
 	/* Mesh Data */
-	bool geometry_synced;  /* used to distinguish meshes with no verts
-	                          and meshed for which geometry is not created */
+	enum GeometryFlags {
+		GEOMETRY_NONE      = 0,
+		GEOMETRY_TRIANGLES = (1 << 0),
+		GEOMETRY_CURVES    = (1 << 1),
+	};
+	int geometry_flags;  /* used to distinguish meshes with no verts
+	                        and meshes for which geometry is not created */
 
 	vector<float3> verts;
 	vector<Triangle> triangles;
 	vector<uint> shader;
 	vector<bool> smooth;
 
+	bool has_volume;  /* Set in the device_update_flags(). */
+
 	vector<float4> curve_keys; /* co + radius */
 	vector<Curve> curves;
 
@@ -143,6 +150,7 @@ public:
 	BVH *bvh;
 
 	bool need_update;
+	bool need_flags_update;
 
 	MeshManager();
 	~MeshManager();
@@ -158,6 +166,8 @@ public:
 	void device_update_mesh(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
 	void device_update_attributes(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
 	void device_update_bvh(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
+	void device_update_flags(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
+	void device_update_displacement_images(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
 	void device_free(Device *device, DeviceScene *dscene);
 
 	void tag_update(Scene *scene);
diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp
index 4c0ee76299c..1ba0c7f7291 100644
--- a/intern/cycles/render/mesh_displace.cpp
+++ b/intern/cycles/render/mesh_displace.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "device.h"
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index e8476bfac4c..7ed07ab6453 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -11,12 +11,13 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "image.h"
 #include "nodes.h"
 #include "svm.h"
+#include "svm_math_util.h"
 #include "osl.h"
 #include "sky_model.h"
 
@@ -173,8 +174,10 @@ static ShaderEnum image_projection_init()
 {
 	ShaderEnum enm;
 
-	enm.insert("Flat", 0);
-	enm.insert("Box", 1);
+	enm.insert("Flat", NODE_IMAGE_PROJ_FLAT);
+	enm.insert("Box", NODE_IMAGE_PROJ_BOX);
+	enm.insert("Sphere", NODE_IMAGE_PROJ_SPHERE);
+	enm.insert("Tube", NODE_IMAGE_PROJ_TUBE);
 
 	return enm;
 }
@@ -183,7 +186,7 @@ ShaderEnum ImageTextureNode::color_space_enum = color_space_init();
 ShaderEnum ImageTextureNode::projection_enum = image_projection_init();
 
 ImageTextureNode::ImageTextureNode()
-: TextureNode("image_texture")
+: ImageSlotTextureNode("image_texture")
 {
 	image_manager = NULL;
 	slot = -1;
@@ -195,6 +198,7 @@ ImageTextureNode::ImageTextureNode()
 	color_space = ustring("Color");
 	projection = ustring("Flat");
 	interpolation = INTERPOLATION_LINEAR;
+	extension = EXTENSION_REPEAT;
 	projection_blend = 0.0f;
 	animated = false;
 
@@ -205,8 +209,12 @@ ImageTextureNode::ImageTextureNode()
 
 ImageTextureNode::~ImageTextureNode()
 {
-	if(image_manager)
-		image_manager->remove_image(filename, builtin_data, interpolation);
+	if(image_manager) {
+		image_manager->remove_image(filename,
+		                            builtin_data,
+		                            interpolation,
+		                            extension);
+	}
 }
 
 ShaderNode *ImageTextureNode::clone() const
@@ -224,7 +232,7 @@ void ImageTextureNode::attributes(Shader *shader, AttributeRequestSet *attribute
 #ifdef WITH_PTEX
 	/* todo: avoid loading other texture coordinates when using ptex,
 	 * and hide texture coordinate socket in the UI */
-	if (shader->has_surface && string_endswith(filename, ".ptx")) {
+	if(shader->has_surface && string_endswith(filename, ".ptx")) {
 		/* ptex */
 		attributes->add(ATTR_STD_PTEX_FACE_ID);
 		attributes->add(ATTR_STD_PTEX_UV);
@@ -243,9 +251,15 @@ void ImageTextureNode::compile(SVMCompiler& compiler)
 	image_manager = compiler.image_manager;
 	if(is_float == -1) {
 		bool is_float_bool;
-		slot = image_manager->add_image(filename, builtin_data,
-		                                animated, 0, is_float_bool, is_linear,
-		                                interpolation, use_alpha);
+		slot = image_manager->add_image(filename,
+		                                builtin_data,
+		                                animated,
+		                                0,
+		                                is_float_bool,
+		                                is_linear,
+		                                interpolation,
+		                                extension,
+		                                use_alpha);
 		is_float = (int)is_float_bool;
 	}
 
@@ -265,14 +279,15 @@ void ImageTextureNode::compile(SVMCompiler& compiler)
 			tex_mapping.compile(compiler, vector_in->stack_offset, vector_offset);
 		}
 
-		if(projection == "Flat") {
+		if(projection != "Box") {
 			compiler.add_node(NODE_TEX_IMAGE,
 				slot,
 				compiler.encode_uchar4(
 					vector_offset,
 					color_out->stack_offset,
 					alpha_out->stack_offset,
-					srgb));
+					srgb),
+				projection_enum[projection]);
 		}
 		else {
 			compiler.add_node(NODE_TEX_IMAGE_BOX,
@@ -284,7 +299,7 @@ void ImageTextureNode::compile(SVMCompiler& compiler)
 					srgb),
 				__float_as_int(projection_blend));
 		}
-	
+
 		if(vector_offset != vector_in->stack_offset)
 			compiler.stack_clear_offset(vector_in->type, vector_offset);
 	}
@@ -314,9 +329,15 @@ void ImageTextureNode::compile(OSLCompiler& compiler)
 		}
 		else {
 			bool is_float_bool;
-			slot = image_manager->add_image(filename, builtin_data,
-			                                animated, 0, is_float_bool, is_linear,
-			                                interpolation, use_alpha);
+			slot = image_manager->add_image(filename,
+			                                builtin_data,
+			                                animated,
+			                                0,
+			                                is_float_bool,
+			                                is_linear,
+			                                interpolation,
+			                                extension,
+			                                use_alpha);
 			is_float = (int)is_float_bool;
 		}
 	}
@@ -357,6 +378,20 @@ void ImageTextureNode::compile(OSLCompiler& compiler)
 			compiler.parameter("interpolation", "linear");
 			break;
 	}
+
+	switch(extension) {
+		case EXTENSION_EXTEND:
+			compiler.parameter("wrap", "clamp");
+			break;
+		case EXTENSION_CLIP:
+			compiler.parameter("wrap", "black");
+			break;
+		case EXTENSION_REPEAT:
+		default:
+			compiler.parameter("wrap", "periodic");
+			break;
+	}
+
 	compiler.add(this, "node_image_texture");
 }
 
@@ -376,7 +411,7 @@ ShaderEnum EnvironmentTextureNode::color_space_enum = color_space_init();
 ShaderEnum EnvironmentTextureNode::projection_enum = env_projection_init();
 
 EnvironmentTextureNode::EnvironmentTextureNode()
-: TextureNode("environment_texture")
+: ImageSlotTextureNode("environment_texture")
 {
 	image_manager = NULL;
 	slot = -1;
@@ -396,8 +431,12 @@ EnvironmentTextureNode::EnvironmentTextureNode()
 
 EnvironmentTextureNode::~EnvironmentTextureNode()
 {
-	if(image_manager)
-		image_manager->remove_image(filename, builtin_data, INTERPOLATION_LINEAR);
+	if(image_manager) {
+		image_manager->remove_image(filename,
+		                            builtin_data,
+		                            INTERPOLATION_LINEAR,
+		                            EXTENSION_REPEAT);
+	}
 }
 
 ShaderNode *EnvironmentTextureNode::clone() const
@@ -413,7 +452,7 @@ ShaderNode *EnvironmentTextureNode::clone() const
 void EnvironmentTextureNode::attributes(Shader *shader, AttributeRequestSet *attributes)
 {
 #ifdef WITH_PTEX
-	if (shader->has_surface && string_endswith(filename, ".ptx")) {
+	if(shader->has_surface && string_endswith(filename, ".ptx")) {
 		/* ptex */
 		attributes->add(ATTR_STD_PTEX_FACE_ID);
 		attributes->add(ATTR_STD_PTEX_UV);
@@ -432,9 +471,15 @@ void EnvironmentTextureNode::compile(SVMCompiler& compiler)
 	image_manager = compiler.image_manager;
 	if(slot == -1) {
 		bool is_float_bool;
-		slot = image_manager->add_image(filename, builtin_data,
-		                                animated, 0, is_float_bool, is_linear,
-		                                INTERPOLATION_LINEAR, use_alpha);
+		slot = image_manager->add_image(filename,
+		                                builtin_data,
+		                                animated,
+		                                0,
+		                                is_float_bool,
+		                                is_linear,
+		                                INTERPOLATION_LINEAR,
+		                                EXTENSION_REPEAT,
+		                                use_alpha);
 		is_float = (int)is_float_bool;
 	}
 
@@ -495,9 +540,15 @@ void EnvironmentTextureNode::compile(OSLCompiler& compiler)
 		}
 		else {
 			bool is_float_bool;
-			slot = image_manager->add_image(filename, builtin_data,
-			                                animated, 0, is_float_bool, is_linear,
-			                                INTERPOLATION_LINEAR, use_alpha);
+			slot = image_manager->add_image(filename,
+			                                builtin_data,
+			                                animated,
+			                                0,
+			                                is_float_bool,
+			                                is_linear,
+			                                INTERPOLATION_LINEAR,
+			                                EXTENSION_REPEAT,
+			                                use_alpha);
 			is_float = (int)is_float_bool;
 		}
 	}
@@ -628,7 +679,7 @@ static void sky_texture_precompute_new(SunSky *sunsky, float3 dir, float turbidi
 	sky_state = arhosek_xyz_skymodelstate_alloc_init(turbidity, ground_albedo, solarElevation);
 
 	/* Copy values from sky_state to SunSky */
-	for (int i = 0; i < 9; ++i) {
+	for(int i = 0; i < 9; ++i) {
 		sunsky->config_x[i] = (float)sky_state->configs[0][i];
 		sunsky->config_y[i] = (float)sky_state->configs[1][i];
 		sunsky->config_z[i] = (float)sky_state->configs[2][i];
@@ -1293,6 +1344,163 @@ void BrickTextureNode::compile(OSLCompiler& compiler)
 	compiler.add(this, "node_brick_texture");
 }
 
+/* Point Density Texture */
+
+/* Build the name -> value table stored in PointDensityTextureNode::space_enum. */
+static ShaderEnum point_density_space_init()
+{
+	ShaderEnum space_names;
+	space_names.insert("Object", NODE_TEX_VOXEL_SPACE_OBJECT);
+	space_names.insert("World", NODE_TEX_VOXEL_SPACE_WORLD);
+
+	return space_names;
+}
+
+ShaderEnum PointDensityTextureNode::space_enum = point_density_space_init();
+
+PointDensityTextureNode::PointDensityTextureNode()
+: ShaderNode("point_density")
+{
+	/* No image manager or slot yet; compile() fills these in. */
+	image_manager = NULL;
+	builtin_data = NULL;
+	slot = -1;
+	filename = "";
+	/* Default lookup settings. */
+	interpolation = INTERPOLATION_LINEAR;
+	space = ustring("Object");
+	tfm = transform_identity();
+	add_input("Vector", SHADER_SOCKET_POINT, ShaderInput::POSITION);
+	add_output("Density", SHADER_SOCKET_FLOAT);
+	add_output("Color", SHADER_SOCKET_COLOR);
+}
+
+/* Remove this node's image from the manager it was compiled with, if any. */
+PointDensityTextureNode::~PointDensityTextureNode()
+{
+	if(image_manager == NULL) {
+		return;
+	}
+	image_manager->remove_image(filename, builtin_data,
+	                            interpolation, EXTENSION_REPEAT);
+}
+
+ShaderNode *PointDensityTextureNode::clone() const
+{
+	PointDensityTextureNode *copy = new PointDensityTextureNode(*this);
+	copy->slot = -1;  /* same values the constructor starts with */
+	copy->image_manager = NULL;
+	return copy;
+}
+
+/* Shaders with a volume additionally request ATTR_STD_GENERATED_TRANSFORM. */
+void PointDensityTextureNode::attributes(Shader *shader, AttributeRequestSet *attributes)
+{
+	if(shader->has_volume) {
+		attributes->add(ATTR_STD_GENERATED_TRANSFORM);
+	}
+	ShaderNode::attributes(shader, attributes);
+}
+
+/* Emit SVM nodes sampling the point density voxel image. When no image slot
+ * is available, constants are emitted instead, only for the linked outputs. */
+void PointDensityTextureNode::compile(SVMCompiler& compiler)
+{
+	ShaderInput *vector_in = input("Vector");
+	ShaderOutput *density_out = output("Density");
+	ShaderOutput *color_out = output("Color");
+
+	const bool use_density = !density_out->links.empty();
+	const bool use_color = !color_out->links.empty();
+
+	image_manager = compiler.image_manager;
+
+	if(use_density || use_color) {
+		if(use_density)
+			compiler.stack_assign(density_out);
+		if(use_color)
+			compiler.stack_assign(color_out);
+
+		if(slot == -1) {
+			bool is_float, is_linear;
+			slot = image_manager->add_image(filename, builtin_data,
+			                                false, 0,
+			                                is_float, is_linear,
+			                                interpolation, EXTENSION_REPEAT, true);
+		}
+
+		if(slot != -1) {
+			compiler.stack_assign(vector_in);
+			compiler.add_node(NODE_TEX_VOXEL,
+			                  slot,
+			                  compiler.encode_uchar4(vector_in->stack_offset,
+			                                         density_out->stack_offset,
+			                                         color_out->stack_offset,
+			                                         space_enum[space]));
+			if(space == "World") {
+				compiler.add_node(tfm.x);
+				compiler.add_node(tfm.y);
+				compiler.add_node(tfm.z);
+				compiler.add_node(tfm.w);
+			}
+		}
+		else {
+			/* Never write through the offset of an output that was not assigned. */
+			if(use_density)
+				compiler.add_node(NODE_VALUE_F, __float_as_int(0.0f), density_out->stack_offset);
+			if(use_color) {
+				compiler.add_node(NODE_VALUE_V, color_out->stack_offset);
+				compiler.add_node(NODE_VALUE_V, make_float3(TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B));
+			}
+		}
+	}
+}
+
+/* OSL counterpart of the SVM compile: set the node parameters and add the
+ * "node_voxel_texture" shader, but only when an output is actually linked. */
+void PointDensityTextureNode::compile(OSLCompiler& compiler)
+{
+	const bool density_linked = !output("Density")->links.empty();
+	const bool color_linked = !output("Color")->links.empty();
+
+	image_manager = compiler.image_manager;
+
+	if(!density_linked && !color_linked) {
+		return;
+	}
+
+	/* Request an image slot if none is held yet. */
+	if(slot == -1) {
+		bool is_float, is_linear;
+		slot = image_manager->add_image(filename, builtin_data,
+		                                false, 0,
+		                                is_float, is_linear,
+		                                interpolation,
+		                                EXTENSION_REPEAT,
+		                                true);
+	}
+	if(slot != -1) {
+		compiler.parameter("filename", string_printf("@%d", slot).c_str());
+	}
+
+	if(space == "World") {
+		compiler.parameter("mapping", transform_transpose(tfm));
+		compiler.parameter("use_mapping", 1);
+	}
+
+	/* Anything other than closest/cubic is passed on as linear. */
+	const char *interpolation_name = "linear";
+	if(interpolation == INTERPOLATION_CLOSEST) {
+		interpolation_name = "closest";
+	}
+	else if(interpolation == INTERPOLATION_CUBIC) {
+		interpolation_name = "cubic";
+	}
+	compiler.parameter("interpolation", interpolation_name);
+
+	compiler.add(this, "node_voxel_texture");
+}
+
 /* Normal */
 
 NormalNode::NormalNode()
@@ -1503,11 +1711,11 @@ ProxyNode::ProxyNode(ShaderSocketType type_)
 	add_output("Output", type);
 }
 
-void ProxyNode::compile(SVMCompiler& compiler)
+void ProxyNode::compile(SVMCompiler& /*compiler*/)
 {
 }
 
-void ProxyNode::compile(OSLCompiler& compiler)
+void ProxyNode::compile(OSLCompiler& /*compiler*/)
 {
 }
 
@@ -1516,6 +1724,8 @@ void ProxyNode::compile(OSLCompiler& compiler)
 BsdfNode::BsdfNode(bool scattering_)
 : ShaderNode("bsdf"), scattering(scattering_)
 {
+	special_type = SHADER_SPECIAL_TYPE_CLOSURE;
+
 	add_input("Color", SHADER_SOCKET_COLOR, make_float3(0.8f, 0.8f, 0.8f));
 	add_input("Normal", SHADER_SOCKET_NORMAL, ShaderInput::NORMAL);
 	add_input("SurfaceMixWeight", SHADER_SOCKET_FLOAT, 0.0f, ShaderInput::USE_SVM);
@@ -1583,7 +1793,7 @@ void BsdfNode::compile(SVMCompiler& compiler)
 	compile(compiler, NULL, NULL);
 }
 
-void BsdfNode::compile(OSLCompiler& compiler)
+void BsdfNode::compile(OSLCompiler& /*compiler*/)
 {
 	assert(0);
 }
@@ -1605,6 +1815,7 @@ ShaderEnum AnisotropicBsdfNode::distribution_enum = aniso_distribution_init();
 
 AnisotropicBsdfNode::AnisotropicBsdfNode()
 {
+	closure = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID;
 	distribution = ustring("GGX");
 
 	add_input("Tangent", SHADER_SOCKET_VECTOR, ShaderInput::TANGENT);
@@ -1657,6 +1868,7 @@ ShaderEnum GlossyBsdfNode::distribution_enum = glossy_distribution_init();
 
 GlossyBsdfNode::GlossyBsdfNode()
 {
+	closure = CLOSURE_BSDF_MICROFACET_GGX_ID;
 	distribution = ustring("GGX");
 
 	add_input("Roughness", SHADER_SOCKET_FLOAT, 0.2f);
@@ -1695,6 +1907,7 @@ ShaderEnum GlassBsdfNode::distribution_enum = glass_distribution_init();
 
 GlassBsdfNode::GlassBsdfNode()
 {
+	closure = CLOSURE_BSDF_SHARP_GLASS_ID;
 	distribution = ustring("Sharp");
 
 	add_input("Roughness", SHADER_SOCKET_FLOAT, 0.0f);
@@ -1734,6 +1947,7 @@ ShaderEnum RefractionBsdfNode::distribution_enum = refraction_distribution_init(
 
 RefractionBsdfNode::RefractionBsdfNode()
 {
+	closure = CLOSURE_BSDF_REFRACTION_ID;
 	distribution = ustring("Sharp");
 
 	add_input("Roughness", SHADER_SOCKET_FLOAT, 0.0f);
@@ -1772,6 +1986,7 @@ ShaderEnum ToonBsdfNode::component_enum = toon_component_init();
 
 ToonBsdfNode::ToonBsdfNode()
 {
+	closure = CLOSURE_BSDF_DIFFUSE_TOON_ID;
 	component = ustring("Diffuse");
 
 	add_input("Size", SHADER_SOCKET_FLOAT, 0.5f);
@@ -1912,6 +2127,8 @@ bool SubsurfaceScatteringNode::has_bssrdf_bump()
 EmissionNode::EmissionNode()
 : ShaderNode("emission")
 {
+	special_type = SHADER_SPECIAL_TYPE_EMISSION;
+
 	add_input("Color", SHADER_SOCKET_COLOR, make_float3(0.8f, 0.8f, 0.8f));
 	add_input("Strength", SHADER_SOCKET_FLOAT, 10.0f);
 	add_input("SurfaceMixWeight", SHADER_SOCKET_FLOAT, 0.0f, ShaderInput::USE_SVM);
@@ -1945,6 +2162,8 @@ void EmissionNode::compile(OSLCompiler& compiler)
 BackgroundNode::BackgroundNode()
 : ShaderNode("background")
 {
+	special_type = SHADER_SPECIAL_TYPE_BACKGROUND;
+
 	add_input("Color", SHADER_SOCKET_COLOR, make_float3(0.8f, 0.8f, 0.8f));
 	add_input("Strength", SHADER_SOCKET_FLOAT, 1.0f);
 	add_input("SurfaceMixWeight", SHADER_SOCKET_FLOAT, 0.0f, ShaderInput::USE_SVM);
@@ -2072,7 +2291,7 @@ void VolumeNode::compile(SVMCompiler& compiler)
 	compile(compiler, NULL, NULL);
 }
 
-void VolumeNode::compile(OSLCompiler& compiler)
+void VolumeNode::compile(OSLCompiler& /*compiler*/)
 {
 	assert(0);
 }
@@ -2129,6 +2348,7 @@ ShaderEnum HairBsdfNode::component_enum = hair_component_init();
 
 HairBsdfNode::HairBsdfNode()
 {
+	closure = CLOSURE_BSDF_HAIR_REFLECTION_ID;
 	component = ustring("Reflection");
 
 	add_input("Offset", SHADER_SOCKET_FLOAT);
@@ -2165,13 +2385,18 @@ GeometryNode::GeometryNode()
 	add_output("Incoming", SHADER_SOCKET_VECTOR);
 	add_output("Parametric", SHADER_SOCKET_POINT);
 	add_output("Backfacing", SHADER_SOCKET_FLOAT);
+	add_output("Pointiness", SHADER_SOCKET_FLOAT);
 }
 
 void GeometryNode::attributes(Shader *shader, AttributeRequestSet *attributes)
 {
 	if(shader->has_surface) {
-		if(!output("Tangent")->links.empty())
+		if(!output("Tangent")->links.empty()) {
 			attributes->add(ATTR_STD_GENERATED);
+		}
+		if(!output("Pointiness")->links.empty()) {
+			attributes->add(ATTR_STD_POINTINESS);
+		}
 	}
 
 	ShaderNode::attributes(shader, attributes);
@@ -2181,11 +2406,16 @@ void GeometryNode::compile(SVMCompiler& compiler)
 {
 	ShaderOutput *out;
 	NodeType geom_node = NODE_GEOMETRY;
+	NodeType attr_node = NODE_ATTR;
 
-	if(bump == SHADER_BUMP_DX)
+	if(bump == SHADER_BUMP_DX) {
 		geom_node = NODE_GEOMETRY_BUMP_DX;
-	else if(bump == SHADER_BUMP_DY)
+		attr_node = NODE_ATTR_BUMP_DX;
+	}
+	else if(bump == SHADER_BUMP_DY) {
 		geom_node = NODE_GEOMETRY_BUMP_DY;
+		attr_node = NODE_ATTR_BUMP_DY;
+	}
 	
 	out = output("Position");
 	if(!out->links.empty()) {
@@ -2228,6 +2458,20 @@ void GeometryNode::compile(SVMCompiler& compiler)
 		compiler.stack_assign(out);
 		compiler.add_node(NODE_LIGHT_PATH, NODE_LP_backfacing, out->stack_offset);
 	}
+
+	out = output("Pointiness");
+	if(!out->links.empty()) {
+		compiler.stack_assign(out);
+		if(compiler.output_type() != SHADER_TYPE_VOLUME) {
+			compiler.add_node(attr_node,
+			                  ATTR_STD_POINTINESS,
+			                  out->stack_offset,
+			                  NODE_ATTR_FLOAT);
+		}
+		else {
+			compiler.add_node(NODE_VALUE_F, __float_as_int(0.0f), out->stack_offset);
+		}
+	}
 }
 
 void GeometryNode::compile(OSLCompiler& compiler)
@@ -2257,6 +2501,8 @@ TextureCoordinateNode::TextureCoordinateNode()
 	add_output("Reflection", SHADER_SOCKET_NORMAL);
 
 	from_dupli = false;
+	use_transform = false;
+	ob_tfm = transform_identity();
 }
 
 void TextureCoordinateNode::attributes(Shader *shader, AttributeRequestSet *attributes)
@@ -2344,7 +2590,14 @@ void TextureCoordinateNode::compile(SVMCompiler& compiler)
 	out = output("Object");
 	if(!out->links.empty()) {
 		compiler.stack_assign(out);
-		compiler.add_node(texco_node, NODE_TEXCO_OBJECT, out->stack_offset);
+		compiler.add_node(texco_node, NODE_TEXCO_OBJECT, out->stack_offset, use_transform);
+		if(use_transform) {
+			Transform ob_itfm = transform_inverse(ob_tfm);
+			compiler.add_node(ob_itfm.x);
+			compiler.add_node(ob_itfm.y);
+			compiler.add_node(ob_itfm.z);
+			compiler.add_node(ob_itfm.w);
+		}
 	}
 
 	out = output("Camera");
@@ -2385,7 +2638,10 @@ void TextureCoordinateNode::compile(OSLCompiler& compiler)
 		compiler.parameter("is_background", true);
 	if(compiler.output_type() == SHADER_TYPE_VOLUME)
 		compiler.parameter("is_volume", true);
-	
+	compiler.parameter("use_transform", use_transform);
+	Transform ob_itfm = transform_transpose(transform_inverse(ob_tfm));
+	compiler.parameter("object_itfm", ob_itfm);
+
 	compiler.parameter("from_dupli", from_dupli);
 
 	compiler.add(this, "node_texture_coordinate");
@@ -2405,7 +2661,7 @@ void UVMapNode::attributes(Shader *shader, AttributeRequestSet *attributes)
 	if(shader->has_surface) {
 		if(!from_dupli) {
 			if(!output("UV")->links.empty()) {
-				if (attribute != "")
+				if(attribute != "")
 					attributes->add(attribute);
 				else
 					attributes->add(ATTR_STD_UV);
@@ -2438,7 +2694,7 @@ void UVMapNode::compile(SVMCompiler& compiler)
 			compiler.add_node(texco_node, NODE_TEXCO_DUPLI_UV, out->stack_offset);
 		}
 		else {
-			if (attribute != "")
+			if(attribute != "")
 				attr = compiler.attribute(attribute);
 			else
 				attr = compiler.attribute(ATTR_STD_UV);
@@ -2879,7 +3135,7 @@ AddClosureNode::AddClosureNode()
 	add_output("Closure",  SHADER_SOCKET_CLOSURE);
 }
 
-void AddClosureNode::compile(SVMCompiler& compiler)
+void AddClosureNode::compile(SVMCompiler& /*compiler*/)
 {
 	/* handled in the SVM compiler */
 }
@@ -2902,7 +3158,7 @@ MixClosureNode::MixClosureNode()
 	add_output("Closure",  SHADER_SOCKET_CLOSURE);
 }
 
-void MixClosureNode::compile(SVMCompiler& compiler)
+void MixClosureNode::compile(SVMCompiler& /*compiler*/)
 {
 	/* handled in the SVM compiler */
 }
@@ -2940,7 +3196,7 @@ void MixClosureWeightNode::compile(SVMCompiler& compiler)
 			weight1_out->stack_offset, weight2_out->stack_offset));
 }
 
-void MixClosureWeightNode::compile(OSLCompiler& compiler)
+void MixClosureWeightNode::compile(OSLCompiler& /*compiler*/)
 {
 	assert(0);
 }
@@ -3540,14 +3796,34 @@ void WireframeNode::compile(SVMCompiler& compiler)
 {
 	ShaderInput *size_in = input("Size");
 	ShaderOutput *fac_out = output("Fac");
-
+	NodeBumpOffset bump_offset = NODE_BUMP_OFFSET_CENTER;
+	if(bump == SHADER_BUMP_DX) {
+		bump_offset = NODE_BUMP_OFFSET_DX;
+	}
+	else if(bump == SHADER_BUMP_DY) {
+		bump_offset = NODE_BUMP_OFFSET_DY;
+	}
 	compiler.stack_assign(size_in);
 	compiler.stack_assign(fac_out);
-	compiler.add_node(NODE_WIREFRAME, size_in->stack_offset, fac_out->stack_offset, use_pixel_size);
+	compiler.add_node(NODE_WIREFRAME,
+	                  size_in->stack_offset,
+	                  fac_out->stack_offset,
+	                  compiler.encode_uchar4(use_pixel_size,
+	                                         bump_offset,
+	                                         0, 0));
 }
 
 void WireframeNode::compile(OSLCompiler& compiler)
 {
+	if(bump == SHADER_BUMP_DX) {
+		compiler.parameter("bump_offset", "dx");
+	}
+	else if(bump == SHADER_BUMP_DY) {
+		compiler.parameter("bump_offset", "dy");
+	}
+	else {
+		compiler.parameter("bump_offset", "center");
+	}
 	compiler.parameter("use_pixel_size", use_pixel_size);
 	compiler.add(this, "node_wireframe");
 }
@@ -3590,9 +3866,17 @@ void BlackbodyNode::compile(SVMCompiler& compiler)
 	ShaderInput *temperature_in = input("Temperature");
 	ShaderOutput *color_out = output("Color");
 
-	compiler.stack_assign(temperature_in);
 	compiler.stack_assign(color_out);
-	compiler.add_node(NODE_BLACKBODY, temperature_in->stack_offset, color_out->stack_offset);
+
+	if(temperature_in->link == NULL) {
+		float3 color = svm_math_blackbody_color(temperature_in->value.x);
+		compiler.add_node(NODE_VALUE_V, color_out->stack_offset);
+		compiler.add_node(NODE_VALUE_V, color);
+	}
+	else {
+		compiler.stack_assign(temperature_in);
+		compiler.add_node(NODE_BLACKBODY, temperature_in->stack_offset, color_out->stack_offset);
+	}
 }
 
 void BlackbodyNode::compile(OSLCompiler& compiler)
@@ -3669,7 +3953,7 @@ static ShaderEnum math_type_init()
 	enm.insert("Less Than", NODE_MATH_LESS_THAN);
 	enm.insert("Greater Than", NODE_MATH_GREATER_THAN);
 	enm.insert("Modulo", NODE_MATH_MODULO);
-    enm.insert("Absolute", NODE_MATH_ABSOLUTE);
+	enm.insert("Absolute", NODE_MATH_ABSOLUTE);
 
 	return enm;
 }
@@ -3682,9 +3966,24 @@ void MathNode::compile(SVMCompiler& compiler)
 	ShaderInput *value2_in = input("Value2");
 	ShaderOutput *value_out = output("Value");
 
+	compiler.stack_assign(value_out);
+
+	/* Optimize math node without links to a single value node. */
+	if(value1_in->link == NULL && value2_in->link == NULL) {
+		float optimized_value = svm_math((NodeMath)type_enum[type],
+		                                 value1_in->value.x,
+		                                 value2_in->value.x);
+		if(use_clamp) {
+			optimized_value = saturate(optimized_value);
+		}
+		compiler.add_node(NODE_VALUE_F,
+		                  __float_as_int(optimized_value),
+		                  value_out->stack_offset);
+		return;
+	}
+
 	compiler.stack_assign(value1_in);
 	compiler.stack_assign(value2_in);
-	compiler.stack_assign(value_out);
 
 	compiler.add_node(NODE_MATH, type_enum[type], value1_in->stack_offset, value2_in->stack_offset);
 	compiler.add_node(NODE_MATH, value_out->stack_offset);
@@ -3738,11 +4037,31 @@ void VectorMathNode::compile(SVMCompiler& compiler)
 	ShaderOutput *value_out = output("Value");
 	ShaderOutput *vector_out = output("Vector");
 
-	compiler.stack_assign(vector1_in);
-	compiler.stack_assign(vector2_in);
 	compiler.stack_assign(value_out);
 	compiler.stack_assign(vector_out);
 
+	/* Optimize vector math node without links to a single value node. */
+	if(vector1_in->link == NULL && vector2_in->link == NULL) {
+		float optimized_value;
+		float3 optimized_vector;
+		svm_vector_math(&optimized_value,
+		                &optimized_vector,
+		                (NodeVectorMath)type_enum[type],
+		                vector1_in->value,
+		                vector2_in->value);
+
+		compiler.add_node(NODE_VALUE_F,
+		                  __float_as_int(optimized_value),
+		                  value_out->stack_offset);
+
+		compiler.add_node(NODE_VALUE_V, vector_out->stack_offset);
+		compiler.add_node(NODE_VALUE_V, optimized_vector);
+		return;
+	}
+
+	compiler.stack_assign(vector1_in);
+	compiler.stack_assign(vector2_in);
+
 	compiler.add_node(NODE_VECTOR_MATH, type_enum[type], vector1_in->stack_offset, vector2_in->stack_offset);
 	compiler.add_node(NODE_VECTOR_MATH, value_out->stack_offset, vector_out->stack_offset);
 }
@@ -3819,6 +4138,8 @@ BumpNode::BumpNode()
 {
 	invert = false;
 
+	special_type = SHADER_SPECIAL_TYPE_BUMP;
+
 	/* this input is used by the user, but after graph transform it is no longer
 	 * used and moved to sampler center/x/y instead */
 	add_input("Height", SHADER_SOCKET_FLOAT);
@@ -3895,7 +4216,7 @@ void RGBCurvesNode::compile(OSLCompiler& compiler)
 {
 	float ramp[RAMP_TABLE_SIZE][3];
 
-	for (int i = 0; i < RAMP_TABLE_SIZE; ++i) {
+	for(int i = 0; i < RAMP_TABLE_SIZE; ++i) {
 		ramp[i][0] = curves[i].x;
 		ramp[i][1] = curves[i].y;
 		ramp[i][2] = curves[i].z;
@@ -3933,7 +4254,7 @@ void VectorCurvesNode::compile(OSLCompiler& compiler)
 {
 	float ramp[RAMP_TABLE_SIZE][3];
 
-	for (int i = 0; i < RAMP_TABLE_SIZE; ++i) {
+	for(int i = 0; i < RAMP_TABLE_SIZE; ++i) {
 		ramp[i][0] = curves[i].x;
 		ramp[i][1] = curves[i].y;
 		ramp[i][2] = curves[i].z;
@@ -3983,7 +4304,7 @@ void RGBRampNode::compile(OSLCompiler& compiler)
 	float ramp_color[RAMP_TABLE_SIZE][3];
 	float ramp_alpha[RAMP_TABLE_SIZE];
 
-	for (int i = 0; i < RAMP_TABLE_SIZE; ++i) {
+	for(int i = 0; i < RAMP_TABLE_SIZE; ++i) {
 		ramp_color[i][0] = ramp[i].x;
 		ramp_color[i][1] = ramp[i].y;
 		ramp_color[i][2] = ramp[i].z;
@@ -4030,7 +4351,7 @@ OSLScriptNode::OSLScriptNode()
 	special_type = SHADER_SPECIAL_TYPE_SCRIPT;
 }
 
-void OSLScriptNode::compile(SVMCompiler& compiler)
+void OSLScriptNode::compile(SVMCompiler& /*compiler*/)
 {
 	/* doesn't work for SVM, obviously ... */
 }
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index 31b6f4e50c4..5065e68345a 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __NODES_H__
@@ -55,13 +55,28 @@ public:
 
 /* Nodes */
 
+/* Any node which uses image manager's slot should be a subclass of this one. */
+class ImageSlotNode : public ShaderNode {
+public:
+	ImageSlotNode(const char *name_) : ShaderNode(name_) {
+		special_type = SHADER_SPECIAL_TYPE_IMAGE_SLOT;
+	}
+	int slot;
+};
+
 class TextureNode : public ShaderNode {
 public:
 	TextureNode(const char *name_) : ShaderNode(name_) {}
 	TextureMapping tex_mapping;
 };
 
-class ImageTextureNode : public TextureNode {
+class ImageSlotTextureNode : public ImageSlotNode {
+public:
+	ImageSlotTextureNode(const char *name_) : ImageSlotNode(name_) {}
+	TextureMapping tex_mapping;
+};
+
+class ImageTextureNode : public ImageSlotTextureNode {
 public:
 	SHADER_NODE_NO_CLONE_CLASS(ImageTextureNode)
 	~ImageTextureNode();
@@ -69,7 +84,6 @@ public:
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
 
 	ImageManager *image_manager;
-	int slot;
 	int is_float;
 	bool is_linear;
 	bool use_alpha;
@@ -78,6 +92,7 @@ public:
 	ustring color_space;
 	ustring projection;
 	InterpolationType interpolation;
+	ExtensionType extension;
 	float projection_blend;
 	bool animated;
 
@@ -85,15 +100,15 @@ public:
 	static ShaderEnum projection_enum;
 };
 
-class EnvironmentTextureNode : public TextureNode {
+class EnvironmentTextureNode : public ImageSlotTextureNode {
 public:
 	SHADER_NODE_NO_CLONE_CLASS(EnvironmentTextureNode)
 	~EnvironmentTextureNode();
 	ShaderNode *clone() const;
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
 
 	ImageManager *image_manager;
-	int slot;
 	int is_float;
 	bool is_linear;
 	bool use_alpha;
@@ -111,6 +126,8 @@ class SkyTextureNode : public TextureNode {
 public:
 	SHADER_NODE_CLASS(SkyTextureNode)
 
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
+
 	float3 sun_direction;
 	float turbidity;
 	float ground_albedo;
@@ -128,6 +145,8 @@ class GradientTextureNode : public TextureNode {
 public:
 	SHADER_NODE_CLASS(GradientTextureNode)
 
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
+
 	ustring type;
 	static ShaderEnum type_enum;
 };
@@ -141,6 +160,8 @@ class VoronoiTextureNode : public TextureNode {
 public:
 	SHADER_NODE_CLASS(VoronoiTextureNode)
 
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
+
 	ustring coloring;
 
 	static ShaderEnum coloring_enum;
@@ -150,6 +171,8 @@ class MusgraveTextureNode : public TextureNode {
 public:
 	SHADER_NODE_CLASS(MusgraveTextureNode)
 
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
+
 	ustring type;
 
 	static ShaderEnum type_enum;
@@ -159,6 +182,8 @@ class WaveTextureNode : public TextureNode {
 public:
 	SHADER_NODE_CLASS(WaveTextureNode)
 
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
+
 	ustring type;
 	static ShaderEnum type_enum;
 };
@@ -167,12 +192,16 @@ class MagicTextureNode : public TextureNode {
 public:
 	SHADER_NODE_CLASS(MagicTextureNode)
 
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
+
 	int depth;
 };
 
 class CheckerTextureNode : public TextureNode {
 public:
 	SHADER_NODE_CLASS(CheckerTextureNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
 };
 
 class BrickTextureNode : public TextureNode {
@@ -181,11 +210,37 @@ public:
 	
 	float offset, squash;
 	int offset_frequency, squash_frequency;
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
+};
+
+class PointDensityTextureNode : public ShaderNode {
+public:
+	SHADER_NODE_NO_CLONE_CLASS(PointDensityTextureNode)
+
+	~PointDensityTextureNode();
+	ShaderNode *clone() const;
+	void attributes(Shader *shader, AttributeRequestSet *attributes);
+
+	bool has_spatial_varying() { return true; }
+	bool has_object_dependency() { return true; }
+
+	ImageManager *image_manager;
+	int slot;
+	string filename;
+	ustring space;
+	void *builtin_data;
+	InterpolationType interpolation;
+
+	Transform tfm;
+
+	static ShaderEnum space_enum;
 };
 
 class MappingNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(MappingNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
 
 	TextureMapping tex_mapping;
 };
@@ -308,6 +363,7 @@ public:
 class HoldoutNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(HoldoutNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class AmbientOcclusionNode : public ShaderNode {
@@ -315,6 +371,7 @@ public:
 	SHADER_NODE_CLASS(AmbientOcclusionNode)
 
 	bool has_spatial_varying() { return true; }
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class VolumeNode : public ShaderNode {
@@ -322,6 +379,7 @@ public:
 	SHADER_NODE_CLASS(VolumeNode)
 
 	void compile(SVMCompiler& compiler, ShaderInput *param1, ShaderInput *param2);
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 
 	ClosureType closure;
 };
@@ -357,8 +415,11 @@ public:
 	SHADER_NODE_CLASS(TextureCoordinateNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
 	bool has_spatial_varying() { return true; }
-	
+	bool has_object_dependency() { return use_transform; }
+
 	bool from_dupli;
+	bool use_transform;
+	Transform ob_tfm;
 };
 
 class UVMapNode : public ShaderNode {
@@ -366,6 +427,7 @@ public:
 	SHADER_NODE_CLASS(UVMapNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
 	bool has_spatial_varying() { return true; }
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 
 	ustring attribute;
 	bool from_dupli;
@@ -374,6 +436,7 @@ public:
 class LightPathNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(LightPathNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class LightFalloffNode : public ShaderNode {
@@ -385,12 +448,14 @@ public:
 class ObjectInfoNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(ObjectInfoNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class ParticleInfoNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(ParticleInfoNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class HairInfoNode : public ShaderNode {
@@ -399,6 +464,10 @@ public:
 
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
 	bool has_spatial_varying() { return true; }
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
+	virtual int get_feature() {
+		return ShaderNode::get_feature() | NODE_FEATURE_HAIR;
+	}
 };
 
 class ValueNode : public ShaderNode {
@@ -433,12 +502,16 @@ public:
 class InvertNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(InvertNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 };
 
 class MixNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(MixNode)
 
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
+
 	bool use_clamp;
 
 	ustring type;
@@ -448,41 +521,55 @@ public:
 class CombineRGBNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(CombineRGBNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 };
 
 class CombineHSVNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(CombineHSVNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 };
 
 class CombineXYZNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(CombineXYZNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 };
 
 class GammaNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(GammaNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class BrightContrastNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(BrightContrastNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class SeparateRGBNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(SeparateRGBNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 };
 
 class SeparateHSVNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(SeparateHSVNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 };
 
 class SeparateXYZNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(SeparateXYZNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 };
 
 class HSVNode : public ShaderNode {
@@ -509,18 +596,21 @@ class FresnelNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(FresnelNode)
 	bool has_spatial_varying() { return true; }
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class LayerWeightNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(LayerWeightNode)
 	bool has_spatial_varying() { return true; }
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class WireframeNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(WireframeNode)
 	bool has_spatial_varying() { return true; }
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 	
 	bool use_pixel_size;
 };
@@ -528,18 +618,21 @@ public:
 class WavelengthNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(WavelengthNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 };
 
 class BlackbodyNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(BlackbodyNode)
-	
-	bool has_converter_blackbody() { return true; }
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 };
 
 class MathNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(MathNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 
 	bool use_clamp;
 
@@ -550,6 +643,7 @@ public:
 class NormalNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(NormalNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
 
 	float3 direction;
 };
@@ -557,6 +651,7 @@ public:
 class VectorMathNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(VectorMathNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 
 	ustring type;
 	static ShaderEnum type_enum;
@@ -566,6 +661,8 @@ class VectorTransformNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(VectorTransformNode)
 
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
+
 	ustring type;
 	ustring convert_from;
 	ustring convert_to;
@@ -578,6 +675,9 @@ class BumpNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(BumpNode)
 	bool has_spatial_varying() { return true; }
+	virtual int get_feature() {
+		return NODE_FEATURE_BUMP;
+	}
 
 	bool invert;
 };
@@ -585,12 +685,18 @@ public:
 class RGBCurvesNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(RGBCurvesNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
+
 	float4 curves[RAMP_TABLE_SIZE];
 };
 
 class VectorCurvesNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(VectorCurvesNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
+
 	float4 curves[RAMP_TABLE_SIZE];
 };
 
@@ -599,6 +705,7 @@ public:
 	SHADER_NODE_CLASS(RGBRampNode)
 	float4 ramp[RAMP_TABLE_SIZE];
 	bool interpolate;
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class SetNormalNode : public ShaderNode {
@@ -627,6 +734,7 @@ public:
 	SHADER_NODE_CLASS(NormalMapNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
 	bool has_spatial_varying() { return true; }
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 
 	ustring space;
 	static ShaderEnum space_enum;
@@ -639,6 +747,7 @@ public:
 	SHADER_NODE_CLASS(TangentNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
 	bool has_spatial_varying() { return true; }
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 
 	ustring direction_type;
 	static ShaderEnum direction_type_enum;
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index 1f148d34ea6..ec85aa8f80b 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -11,9 +11,10 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
+#include "camera.h"
 #include "device.h"
 #include "light.h"
 #include "mesh.h"
@@ -23,6 +24,7 @@
 #include "scene.h"
 
 #include "util_foreach.h"
+#include "util_logging.h"
 #include "util_map.h"
 #include "util_progress.h"
 #include "util_vector.h"
@@ -75,8 +77,14 @@ void Object::compute_bounds(bool motion_blur)
 			bounds.grow(mbounds.transformed(&ttfm));
 		}
 	}
-	else
-		bounds = mbounds.transformed(&tfm);
+	else {
+		if(mesh->transform_applied) {
+			bounds = mbounds;
+		}
+		else {
+			bounds = mbounds.transformed(&tfm);
+		}
+	}
 }
 
 void Object::apply_transform(bool apply_to_motion)
@@ -98,11 +106,11 @@ void Object::apply_transform(bool apply_to_motion)
 		if(apply_to_motion) {
 			Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
 
-			if (attr) {
+			if(attr) {
 				size_t steps_size = mesh->verts.size() * (mesh->motion_steps - 1);
 				float3 *vert_steps = attr->data_float3();
 
-				for (size_t i = 0; i < steps_size; i++)
+				for(size_t i = 0; i < steps_size; i++)
 					vert_steps[i] = transform_point(&tfm, vert_steps[i]);
 			}
 
@@ -113,7 +121,7 @@ void Object::apply_transform(bool apply_to_motion)
 				size_t steps_size = mesh->verts.size() * (mesh->motion_steps - 1);
 				float3 *normal_steps = attr_N->data_float3();
 
-				for (size_t i = 0; i < steps_size; i++)
+				for(size_t i = 0; i < steps_size; i++)
 					normal_steps[i] = normalize(transform_direction(&ntfm, normal_steps[i]));
 			}
 		}
@@ -140,12 +148,12 @@ void Object::apply_transform(bool apply_to_motion)
 		if(apply_to_motion) {
 			Attribute *curve_attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
 
-			if (curve_attr) {
+			if(curve_attr) {
 				/* apply transform to motion curve keys */
 				size_t steps_size = mesh->curve_keys.size() * (mesh->motion_steps - 1);
 				float4 *key_steps = curve_attr->data_float4();
 
-				for (size_t i = 0; i < steps_size; i++) {
+				for(size_t i = 0; i < steps_size; i++) {
 					float3 co = transform_point(&tfm, float4_to_float3(key_steps[i]));
 					float radius = key_steps[i].w * scalar;
 
@@ -185,6 +193,7 @@ void Object::tag_update(Scene *scene)
 		}
 	}
 
+	scene->camera->need_flags_update = true;
 	scene->curve_system_manager->need_update = true;
 	scene->mesh_manager->need_update = true;
 	scene->object_manager->need_update = true;
@@ -215,6 +224,7 @@ vector<float> Object::motion_times()
 ObjectManager::ObjectManager()
 {
 	need_update = true;
+	need_flags_update = true;
 }
 
 ObjectManager::~ObjectManager()
@@ -312,6 +322,9 @@ void ObjectManager::device_update_transforms(Device *device, DeviceScene *dscene
 				mtfm_pre = mtfm_pre * itfm;
 				mtfm_post = mtfm_post * itfm;
 			}
+			else {
+				flag |= SD_OBJECT_HAS_VERTEX_MOTION;
+			}
 
 			memcpy(&objects_vector[i*OBJECT_VECTOR_SIZE+0], &mtfm_pre, sizeof(float4)*3);
 			memcpy(&objects_vector[i*OBJECT_VECTOR_SIZE+3], &mtfm_post, sizeof(float4)*3);
@@ -367,13 +380,13 @@ void ObjectManager::device_update_transforms(Device *device, DeviceScene *dscene
 
 void ObjectManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
+	VLOG(1) << "Total " << scene->objects.size() << " objects.";
+
 	if(!need_update)
 		return;
 	
 	device_free(device, dscene);
 
-	need_update = false;
-
 	if(scene->objects.size() == 0)
 		return;
 
@@ -392,6 +405,65 @@ void ObjectManager::device_update(Device *device, DeviceScene *dscene, Scene *sc
 		progress.set_status("Updating Objects", "Applying Static Transformations");
 		apply_static_transforms(dscene, scene, object_flag, progress);
 	}
+}
+
+void ObjectManager::device_update_flags(Device *device,
+                                        DeviceScene *dscene,
+                                        Scene *scene,
+                                        Progress& /*progress*/,
+                                        bool bounds_valid)
+{
+	if(!need_update && !need_flags_update)
+		return;
+
+	need_update = false;
+	need_flags_update = false;
+
+	if(scene->objects.size() == 0)
+		return;
+
+	/* object info flag */
+	uint *object_flag = dscene->object_flag.get_data();
+
+	vector<Object *> volume_objects;
+	bool has_volume_objects = false;
+	foreach(Object *object, scene->objects) {
+		if(object->mesh->has_volume) {
+			if(bounds_valid) {
+				volume_objects.push_back(object);
+			}
+			has_volume_objects = true;
+		}
+	}
+
+	int object_index = 0;
+	foreach(Object *object, scene->objects) {
+		if(object->mesh->has_volume) {
+			object_flag[object_index] |= SD_OBJECT_HAS_VOLUME;
+		}
+		else {
+			object_flag[object_index] &= ~SD_OBJECT_HAS_VOLUME;
+		}
+
+		if(bounds_valid) {
+			foreach(Object *volume_object, volume_objects) {
+				if(object == volume_object) {
+					continue;
+				}
+				if(object->bounds.intersects(volume_object->bounds)) {
+					object_flag[object_index] |= SD_OBJECT_INTERSECTS_VOLUME;
+					break;
+				}
+			}
+		}
+		else if(has_volume_objects) {
+			/* Not really valid, but can't make more reliable in the case
+			 * of bounds not being up to date.
+			 */
+			object_flag[object_index] |= SD_OBJECT_INTERSECTS_VOLUME;
+		}
+		++object_index;
+	}
 
 	/* allocate object flag */
 	device->tex_alloc("__object_flag", dscene->object_flag);
@@ -422,6 +494,7 @@ void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, u
 	bool apply_to_motion = need_motion != Scene::MOTION_PASS;
 #else
 	bool motion_blur = false;
+	bool apply_to_motion = false;
 #endif
 	int i = 0;
 	bool have_instancing = false;
@@ -439,7 +512,9 @@ void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, u
 
 	/* apply transforms for objects with single user meshes */
 	foreach(Object *object, scene->objects) {
-		if(mesh_users[object->mesh] == 1) {
+		if(mesh_users[object->mesh] == 1 &&
+		   object->mesh->displacement_method == Mesh::DISPLACE_BUMP)
+		{
 			if(!(motion_blur && object->use_motion)) {
 				if(!object->mesh->transform_applied) {
 					object->apply_transform(apply_to_motion);
diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h
index 677526b715f..379d1748cdd 100644
--- a/intern/cycles/render/object.h
+++ b/intern/cycles/render/object.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __OBJECT_H__
@@ -70,12 +70,18 @@ public:
 class ObjectManager {
 public:
 	bool need_update;
+	bool need_flags_update;
 
 	ObjectManager();
 	~ObjectManager();
 
 	void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
 	void device_update_transforms(Device *device, DeviceScene *dscene, Scene *scene, uint *object_flag, Progress& progress);
+	void device_update_flags(Device *device,
+	                         DeviceScene *dscene,
+	                         Scene *scene,
+	                         Progress& progress,
+	                         bool bounds_valid = true);
 	void device_free(Device *device, DeviceScene *dscene);
 
 	void tag_update(Scene *scene);
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index f57e16471a1..a02f91ad2cf 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "device.h"
@@ -30,6 +30,7 @@
 #include "osl_shader.h"
 
 #include "util_foreach.h"
+#include "util_logging.h"
 #include "util_md5.h"
 #include "util_path.h"
 #include "util_progress.h"
@@ -66,7 +67,7 @@ OSLShaderManager::~OSLShaderManager()
 	texture_system_free();
 }
 
-void OSLShaderManager::reset(Scene *scene)
+void OSLShaderManager::reset(Scene * /*scene*/)
 {
 	shading_system_free();
 	shading_system_init();
@@ -74,6 +75,8 @@ void OSLShaderManager::reset(Scene *scene)
 
 void OSLShaderManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
+	VLOG(1) << "Total " << scene->shaders.size() << " shaders.";
+
 	if(!need_update)
 		return;
 
@@ -188,12 +191,14 @@ void OSLShaderManager::shading_system_init()
 	if(ss_shared_users == 0) {
 		services_shared = new OSLRenderServices();
 
-		ss_shared = OSL::ShadingSystem::create(services_shared, ts_shared, &errhandler);
+		ss_shared = new OSL::ShadingSystem(services_shared, ts_shared, &errhandler);
 		ss_shared->attribute("lockgeom", 1);
 		ss_shared->attribute("commonspace", "world");
 		ss_shared->attribute("searchpath:shader", path_get("shader"));
 		//ss_shared->attribute("greedyjit", 1);
 
+		VLOG(1) << "Using shader search path: " << path_get("shader");
+
 		/* our own ray types */
 		static const char *raytypes[] = {
 			"camera",			/* PATH_RAY_CAMERA */
@@ -208,9 +213,9 @@ void OSLShaderManager::shading_system_init()
 
 			"__unused__",
 			"__unused__",
-			"diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */
-			"glossy_ancestor",  /* PATH_RAY_GLOSSY_ANCESTOR */
-			"bssrdf_ancestor",  /* PATH_RAY_BSSRDF_ANCESTOR */
+			"diffuse_ancestor",	/* PATH_RAY_DIFFUSE_ANCESTOR */
+			"__unused__",
+			"__unused__",
 			"__unused__",		/* PATH_RAY_SINGLE_PASS_DONE */
 			"volume_scatter",	/* PATH_RAY_VOLUME_SCATTER */
 		};
@@ -235,7 +240,7 @@ void OSLShaderManager::shading_system_free()
 	ss_shared_users--;
 
 	if(ss_shared_users == 0) {
-		OSL::ShadingSystem::destroy(ss_shared);
+		delete ss_shared;
 		ss_shared = NULL;
 
 		delete services_shared;
@@ -248,11 +253,11 @@ void OSLShaderManager::shading_system_free()
 
 bool OSLShaderManager::osl_compile(const string& inputfile, const string& outputfile)
 {
-#if OSL_LIBRARY_VERSION_CODE < 10500
-	typedef string string_view;
-#endif
-
+#if OSL_LIBRARY_VERSION_CODE < 10602
 	vector<string_view> options;
+#else
+	vector<string> options;
+#endif
 	string stdosl_path;
 	string shader_path = path_get("shader");
 
@@ -261,13 +266,13 @@ bool OSLShaderManager::osl_compile(const string& inputfile, const string& output
 	options.push_back(outputfile);
 
 	/* specify standard include path */
-	options.push_back("-I");
-	options.push_back(shader_path);
+	string include_path_arg = string("-I") + shader_path;
+	options.push_back(include_path_arg);
 
 	stdosl_path = path_get("shader/stdosl.h");
 
 	/* compile */
-	OSL::OSLCompiler *compiler = OSL::OSLCompiler::create();
+	OSL::OSLCompiler *compiler = new OSL::OSLCompiler();
 	bool ok = compiler->compile(string_view(inputfile), options, string_view(stdosl_path));
 	delete compiler;
 
@@ -411,7 +416,7 @@ string OSLCompiler::compatible_name(ShaderNode *node, ShaderInput *input)
 	
 	/* if output exists with the same name, add "In" suffix */
 	foreach(ShaderOutput *output, node->outputs) {
-		if (strcmp(input->name, output->name)==0) {
+		if(strcmp(input->name, output->name)==0) {
 			sname += "In";
 			break;
 		}
@@ -431,7 +436,7 @@ string OSLCompiler::compatible_name(ShaderNode *node, ShaderOutput *output)
 	
 	/* if input exists with the same name, add "Out" suffix */
 	foreach(ShaderInput *input, node->inputs) {
-		if (strcmp(input->name, output->name)==0) {
+		if(strcmp(input->name, output->name)==0) {
 			sname += "Out";
 			break;
 		}
@@ -564,6 +569,10 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath)
 		if(node->has_spatial_varying())
 			current_shader->has_heterogeneous_volume = true;
 	}
+
+	if(node->has_object_dependency()) {
+		current_shader->has_object_dependency = true;
+	}
 }
 
 void OSLCompiler::parameter(const char *name, float f)
@@ -748,11 +757,7 @@ OSL::ShadingAttribStateRef OSLCompiler::compile_type(Shader *shader, ShaderGraph
 
 	current_type = type;
 
-#if OSL_LIBRARY_VERSION_CODE >= 10501
 	OSL::ShadingAttribStateRef group = ss->ShaderGroupBegin(shader->name.c_str());
-#else
-	ss->ShaderGroupBegin(shader->name.c_str());
-#endif
 
 	ShaderNode *output = graph->output();
 	set<ShaderNode*> dependencies;
@@ -780,13 +785,7 @@ OSL::ShadingAttribStateRef OSLCompiler::compile_type(Shader *shader, ShaderGraph
 
 	ss->ShaderGroupEnd();
 
-#if OSL_LIBRARY_VERSION_CODE >= 10501
-	return group;
-#else
-	OSL::ShadingAttribStateRef group = ss->state();
-	ss->clear_state();
 	return group;
-#endif
 }
 
 void OSLCompiler::compile(OSLGlobals *og, Shader *shader)
@@ -815,6 +814,7 @@ void OSLCompiler::compile(OSLGlobals *og, Shader *shader)
 		shader->has_volume = false;
 		shader->has_displacement = false;
 		shader->has_heterogeneous_volume = false;
+		shader->has_object_dependency = false;
 
 		/* generate surface shader */
 		if(shader->used && graph && output->input("Surface")->link) {
@@ -862,75 +862,75 @@ void OSLCompiler::compile(OSLGlobals *og, Shader *shader)
 
 #else
 
-void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath)
+void OSLCompiler::add(ShaderNode * /*node*/, const char * /*name*/, bool /*isfilepath*/)
 {
 }
 
-void OSLCompiler::parameter(const char *name, float f)
+void OSLCompiler::parameter(const char * /*name*/, float /*f*/)
 {
 }
 
-void OSLCompiler::parameter_color(const char *name, float3 f)
+void OSLCompiler::parameter_color(const char * /*name*/, float3 /*f*/)
 {
 }
 
-void OSLCompiler::parameter_vector(const char *name, float3 f)
+void OSLCompiler::parameter_vector(const char * /*name*/, float3 /*f*/)
 {
 }
 
-void OSLCompiler::parameter_point(const char *name, float3 f)
+void OSLCompiler::parameter_point(const char * /*name*/, float3 /*f*/)
 {
 }
 
-void OSLCompiler::parameter_normal(const char *name, float3 f)
+void OSLCompiler::parameter_normal(const char * /*name*/, float3 /*f*/)
 {
 }
 
-void OSLCompiler::parameter(const char *name, int f)
+void OSLCompiler::parameter(const char * /*name*/, int /*f*/)
 {
 }
 
-void OSLCompiler::parameter(const char *name, const char *s)
+void OSLCompiler::parameter(const char * /*name*/, const char * /*s*/)
 {
 }
 
-void OSLCompiler::parameter(const char *name, ustring s)
+void OSLCompiler::parameter(const char * /*name*/, ustring /*s*/)
 {
 }
 
-void OSLCompiler::parameter(const char *name, const Transform& tfm)
+void OSLCompiler::parameter(const char * /*name*/, const Transform& /*tfm*/)
 {
 }
 
-void OSLCompiler::parameter_array(const char *name, const float f[], int arraylen)
+void OSLCompiler::parameter_array(const char * /*name*/, const float /*f*/[], int /*arraylen*/)
 {
 }
 
-void OSLCompiler::parameter_color_array(const char *name, const float f[][3], int arraylen)
+void OSLCompiler::parameter_color_array(const char * /*name*/, const float /*f*/[][3], int /*arraylen*/)
 {
 }
 
-void OSLCompiler::parameter_vector_array(const char *name, const float f[][3], int arraylen)
+void OSLCompiler::parameter_vector_array(const char * /*name*/, const float /*f*/[][3], int /*arraylen*/)
 {
 }
 
-void OSLCompiler::parameter_normal_array(const char *name, const float f[][3], int arraylen)
+void OSLCompiler::parameter_normal_array(const char * /*name*/, const float /*f*/[][3], int /*arraylen*/)
 {
 }
 
-void OSLCompiler::parameter_point_array(const char *name, const float f[][3], int arraylen)
+void OSLCompiler::parameter_point_array(const char * /*name*/, const float /*f*/[][3], int /*arraylen*/)
 {
 }
 
-void OSLCompiler::parameter_array(const char *name, const int f[], int arraylen)
+void OSLCompiler::parameter_array(const char * /*name*/, const int /*f*/[], int /*arraylen*/)
 {
 }
 
-void OSLCompiler::parameter_array(const char *name, const char * const s[], int arraylen)
+void OSLCompiler::parameter_array(const char * /*name*/, const char * const /*s*/[], int /*arraylen*/)
 {
 }
 
-void OSLCompiler::parameter_array(const char *name, const Transform tfm[], int arraylen)
+void OSLCompiler::parameter_array(const char * /*name*/, const Transform /*tfm*/[], int /*arraylen*/)
 {
 }
 
diff --git a/intern/cycles/render/osl.h b/intern/cycles/render/osl.h
index 5824e2ace64..bc6a9d8fbbd 100644
--- a/intern/cycles/render/osl.h
+++ b/intern/cycles/render/osl.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __OSL_H__
diff --git a/intern/cycles/render/particles.cpp b/intern/cycles/render/particles.cpp
index 255effa86dd..8f9e8c6d639 100644
--- a/intern/cycles/render/particles.cpp
+++ b/intern/cycles/render/particles.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "device.h"
@@ -19,6 +19,7 @@
 #include "scene.h"
 
 #include "util_foreach.h"
+#include "util_logging.h"
 #include "util_map.h"
 #include "util_progress.h"
 #include "util_vector.h"
@@ -92,6 +93,9 @@ void ParticleSystemManager::device_update_particles(Device *device, DeviceScene
 
 void ParticleSystemManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
+	VLOG(1) << "Total " << scene->particle_systems.size()
+	        << " particle systems.";
+
 	if(!need_update)
 		return;
 	
@@ -111,7 +115,7 @@ void ParticleSystemManager::device_free(Device *device, DeviceScene *dscene)
 	dscene->particles.clear();
 }
 
-void ParticleSystemManager::tag_update(Scene *scene)
+void ParticleSystemManager::tag_update(Scene * /*scene*/)
 {
 	need_update = true;
 }
diff --git a/intern/cycles/render/particles.h b/intern/cycles/render/particles.h
index a606722f9c3..bf2b6b77015 100644
--- a/intern/cycles/render/particles.h
+++ b/intern/cycles/render/particles.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __PARTICLES_H__
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index d0de8c51300..19d715d834b 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include <stdlib.h>
@@ -36,6 +36,11 @@
 #include "util_foreach.h"
 #include "util_progress.h"
 
+#ifdef WITH_CYCLES_DEBUG
+#  include "util_guarded_allocator.h"
+#  include "util_logging.h"
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 Scene::Scene(const SceneParams& params_, const DeviceInfo& device_info_)
@@ -153,76 +158,98 @@ void Scene::device_update(Device *device_, Progress& progress)
 	progress.set_status("Updating Shaders");
 	shader_manager->device_update(device, &dscene, this, progress);
 
-	if(progress.get_cancel()) return;
-
-	progress.set_status("Updating Images");
-	image_manager->device_update(device, &dscene, progress);
-
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Background");
 	background->device_update(device, &dscene, this);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
+
+	progress.set_status("Updating Camera");
+	camera->device_update(device, &dscene, this);
+
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Objects");
 	object_manager->device_update(device, &dscene, this, progress);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
+
+	progress.set_status("Updating Meshes Flags");
+	mesh_manager->device_update_flags(device, &dscene, this, progress);
+
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Meshes");
 	mesh_manager->device_update(device, &dscene, this, progress);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
+
+	progress.set_status("Updating Objects Flags");
+	object_manager->device_update_flags(device, &dscene, this, progress);
+
+	if(progress.get_cancel() || device->have_error()) return;
+
+	progress.set_status("Updating Images");
+	image_manager->device_update(device, &dscene, progress);
+
+	if(progress.get_cancel() || device->have_error()) return;
+
+	progress.set_status("Updating Camera Volume");
+	camera->device_update_volume(device, &dscene, this);
+
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Hair Systems");
 	curve_system_manager->device_update(device, &dscene, this, progress);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Lookup Tables");
 	lookup_tables->device_update(device, &dscene);
 
-	if(progress.get_cancel()) return;
-
-	/* TODO(sergey): Make sure camera is not needed above. */
-	progress.set_status("Updating Camera");
-	camera->device_update(device, &dscene, this);
-
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Lights");
 	light_manager->device_update(device, &dscene, this, progress);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Particle Systems");
 	particle_system_manager->device_update(device, &dscene, this, progress);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Film");
 	film->device_update(device, &dscene, this);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Integrator");
 	integrator->device_update(device, &dscene, this);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Lookup Tables");
 	lookup_tables->device_update(device, &dscene);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Baking");
 	bake_manager->device_update(device, &dscene, this, progress);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
+
+	if(device->have_error() == false) {
+		progress.set_status("Updating Device", "Writing constant memory");
+		device->const_copy_to("__data", &dscene.data, sizeof(dscene.data));
+	}
 
-	progress.set_status("Updating Device", "Writing constant memory");
-	device->const_copy_to("__data", &dscene.data, sizeof(dscene.data));
+#ifdef WITH_CYCLES_DEBUG
+	VLOG(1) << "System memory statistics after full device sync:\n"
+	        << "  Usage: " << util_guarded_get_mem_used() << "\n"
+	        << "  Peak: " << util_guarded_get_mem_peak();
+#endif
 }
 
 Scene::MotionType Scene::need_motion(bool advanced_shading)
@@ -272,7 +299,8 @@ bool Scene::need_reset()
 		|| shader_manager->need_update
 		|| particle_system_manager->need_update
 		|| curve_system_manager->need_update
-		|| bake_manager->need_update);
+		|| bake_manager->need_update
+		|| film->need_update);
 }
 
 void Scene::reset()
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index 5d205225d97..851e5ac0b72 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __SCENE_H__
@@ -26,6 +26,7 @@
 
 #include "util_param.h"
 #include "util_string.h"
+#include "util_system.h"
 #include "util_thread.h"
 #include "util_types.h"
 #include "util_vector.h"
@@ -61,6 +62,7 @@ class DeviceScene {
 public:
 	/* BVH */
 	device_vector<float4> bvh_nodes;
+	device_vector<float4> bvh_leaf_nodes;
 	device_vector<uint> object_node;
 	device_vector<float4> tri_woop;
 	device_vector<uint> prim_type;
@@ -135,11 +137,7 @@ public:
 		bvh_type = BVH_DYNAMIC;
 		use_bvh_cache = false;
 		use_bvh_spatial_split = false;
-#ifdef __QBVH__
-		use_qbvh = true;
-#else
 		use_qbvh = false;
-#endif
 		persistent_data = false;
 	}
 
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 9fcd9fa85f5..837c2694894 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include <string.h>
@@ -20,13 +20,17 @@
 #include "buffers.h"
 #include "camera.h"
 #include "device.h"
+#include "graph.h"
 #include "integrator.h"
+#include "mesh.h"
+#include "object.h"
 #include "scene.h"
 #include "session.h"
 #include "bake.h"
 
 #include "util_foreach.h"
 #include "util_function.h"
+#include "util_logging.h"
 #include "util_math.h"
 #include "util_opengl.h"
 #include "util_task.h"
@@ -77,6 +81,9 @@ Session::Session(const SessionParams& params_)
 	gpu_need_tonemap = false;
 	pause = false;
 	kernels_loaded = false;
+
+	/* TODO(sergey): Check if it's indeed optimal value for the split kernel. */
+	max_closure_global = 1;
 }
 
 Session::~Session()
@@ -199,8 +206,7 @@ void Session::run_gpu()
 	paused_time = 0.0;
 	last_update_time = time_dt();
 
-	if(!params.background)
-		progress.set_start_time(start_time + paused_time);
+	progress.set_render_start_time(start_time + paused_time);
 
 	while(!progress.get_cancel()) {
 		/* advance to next tile */
@@ -233,6 +239,7 @@ void Session::run_gpu()
 
 					if(!params.background)
 						progress.set_start_time(start_time + paused_time);
+					progress.set_render_start_time(start_time + paused_time);
 
 					update_status_time(pause, no_tiles);
 					progress.set_update();
@@ -251,7 +258,7 @@ void Session::run_gpu()
 			update_scene();
 
 			if(!device->error_message().empty())
-				progress.set_cancel(device->error_message());
+				progress.set_error(device->error_message());
 
 			if(progress.get_cancel())
 				break;
@@ -292,7 +299,7 @@ void Session::run_gpu()
 			}
 
 			if(!device->error_message().empty())
-				progress.set_cancel(device->error_message());
+				progress.set_error(device->error_message());
 
 			tiles_written = update_progressive_refine(progress.get_cancel());
 
@@ -405,6 +412,11 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile)
 		if(tile_buffers.size() == 0)
 			tile_buffers.resize(tile_manager.state.num_tiles, NULL);
 
+		/* In certain circumstances number of tiles in the tile manager could
+		 * be changed. This is not supported by the progressive refine feature.
+		 */
+		assert(tile_buffers.size() == tile_manager.state.num_tiles);
+
 		tilebuffers = tile_buffers[tile.index];
 		if(tilebuffers == NULL) {
 			tilebuffers = new RenderBuffers(tile_device);
@@ -517,6 +529,7 @@ void Session::run_cpu()
 
 					if(!params.background)
 						progress.set_start_time(start_time + paused_time);
+					progress.set_render_start_time(start_time + paused_time);
 
 					update_status_time(pause, no_tiles);
 					progress.set_update();
@@ -540,7 +553,7 @@ void Session::run_cpu()
 			update_scene();
 
 			if(!device->error_message().empty())
-				progress.set_cancel(device->error_message());
+				progress.set_error(device->error_message());
 
 			if(progress.get_cancel())
 				break;
@@ -558,7 +571,7 @@ void Session::run_cpu()
 				need_tonemap = true;
 
 			if(!device->error_message().empty())
-				progress.set_cancel(device->error_message());
+				progress.set_error(device->error_message());
 		}
 
 		device->task_wait();
@@ -580,7 +593,7 @@ void Session::run_cpu()
 			}
 
 			if(!device->error_message().empty())
-				progress.set_cancel(device->error_message());
+				progress.set_error(device->error_message());
 
 			tiles_written = update_progressive_refine(progress.get_cancel());
 		}
@@ -592,6 +605,45 @@ void Session::run_cpu()
 		update_progressive_refine(true);
 }
 
+DeviceRequestedFeatures Session::get_requested_device_features()
+{
+	/* TODO(sergey): Consider moving this to the Scene level. */
+	DeviceRequestedFeatures requested_features;
+	requested_features.experimental = params.experimental;
+	if(!params.background) {
+		requested_features.max_closure = 64;
+		requested_features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
+		requested_features.nodes_features = NODE_FEATURE_ALL;
+	}
+	else {
+		requested_features.max_closure = get_max_closure_count();
+		scene->shader_manager->get_requested_features(
+		        scene,
+		        requested_features.max_nodes_group,
+		        requested_features.nodes_features);
+	}
+
+	/* These features are not tweaked as often as shaders, so they can be
+	 * detected selectively for the viewport as well.
+	 */
+	requested_features.use_hair = false;
+	requested_features.use_object_motion = false;
+	requested_features.use_camera_motion = scene->camera->use_motion;
+	foreach(Object *object, scene->objects) {
+		Mesh *mesh = object->mesh;
+		if(mesh->curves.size() > 0) {
+			requested_features.use_hair = true;
+		}
+		requested_features.use_object_motion |= object->use_motion | mesh->use_motion_blur;
+		requested_features.use_camera_motion |= mesh->use_motion_blur;
+	}
+
+	BakeManager *bake_manager = scene->bake_manager;
+	requested_features.use_baking = bake_manager->get_baking();
+
+	return requested_features;
+}
+
 void Session::load_kernels()
 {
 	thread_scoped_lock scene_lock(scene->mutex);
@@ -599,12 +651,14 @@ void Session::load_kernels()
 	if(!kernels_loaded) {
 		progress.set_status("Loading render kernels (may take a few minutes the first time)");
 
-		if(!device->load_kernels(params.experimental)) {
+		DeviceRequestedFeatures requested_features = get_requested_device_features();
+		VLOG(2) << "Requested features:\n" << requested_features;
+		if(!device->load_kernels(requested_features)) {
 			string message = device->error_message();
 			if(message.empty())
 				message = "Failed loading render kernel, see console for errors";
 
-			progress.set_cancel(message);
+			progress.set_error(message);
 			progress.set_status("Error", message);
 			progress.set_update();
 			return;
@@ -665,7 +719,8 @@ void Session::reset_(BufferParams& buffer_params, int samples)
 	paused_time = 0.0;
 
 	if(!params.background)
-		progress.set_start_time(start_time + paused_time);
+		progress.set_start_time(start_time);
+	progress.set_render_start_time(start_time);
 }
 
 void Session::reset(BufferParams& buffer_params, int samples)
@@ -776,13 +831,21 @@ void Session::update_status_time(bool show_pause, bool show_done)
 	string status, substatus;
 
 	if(!params.progressive) {
-		bool is_gpu = params.device.type == DEVICE_CUDA || params.device.type == DEVICE_OPENCL;
-		bool is_multidevice = params.device.multi_devices.size() > 1;
-		bool is_cpu = params.device.type == DEVICE_CPU;
+		const int progress_sample = progress.get_sample(), num_samples = tile_manager.num_samples;
+		const bool is_gpu = params.device.type == DEVICE_CUDA || params.device.type == DEVICE_OPENCL;
+		const bool is_multidevice = params.device.multi_devices.size() > 1;
+		const bool is_cpu = params.device.type == DEVICE_CPU;
+		const bool is_last_tile = (num_samples * num_tiles - progress_sample) < num_samples;
 
 		substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles);
 
-		if((is_gpu && !is_multidevice) || (is_cpu && num_tiles == 1)) {
+		if((is_gpu && !is_multidevice && !device->info.use_split_kernel) ||
+		   (is_cpu && (num_tiles == 1 || is_last_tile)))
+		{
+			/* When using split-kernel (OpenCL) each thread in a tile will be working on a different
+			 * sample, so the sample number can't be displayed when the device uses split-kernel.
+			 */
+
 			/* when rendering on GPU multithreading happens within single tile, as in
 			 * tiles are handling sequentially and in this case we could display
 			 * currently rendering sample number
@@ -790,17 +853,21 @@ void Session::update_status_time(bool show_pause, bool show_done)
 			 * also display the info on CPU, when using 1 tile only
 			 */
 
-			int sample = progress.get_sample(), num_samples = tile_manager.num_samples;
-
+			int status_sample = progress_sample;
 			if(tile > 1) {
 				/* sample counter is global for all tiles, subtract samples
 				 * from already finished tiles to get sample counter for
 				 * current tile only
 				 */
-				sample -= (tile - 1) * num_samples;
+				if(is_cpu && is_last_tile && num_tiles > 1) {
+					status_sample = num_samples - (num_samples * num_tiles - progress_sample);
+				}
+				else {
+					status_sample -= (tile - 1) * num_samples;
+				}
 			}
 
-			substatus += string_printf(", Sample %d/%d", sample, num_samples);
+			substatus += string_printf(", Sample %d/%d", status_sample, num_samples);
 		}
 	}
 	else if(tile_manager.num_samples == USHRT_MAX)
@@ -850,6 +917,7 @@ void Session::path_trace()
 	task.update_progress_sample = function_bind(&Session::update_progress_sample, this);
 	task.need_finish_queue = params.progressive_refine;
 	task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH;
+	task.requested_tile_size = params.tile_size;
 
 	device->task_add(task);
 }
@@ -887,9 +955,9 @@ bool Session::update_progressive_refine(bool cancel)
 
 	double current_time = time_dt();
 
-	if (current_time - last_update_time < 1.0) {
+	if(current_time - last_update_time < params.progressive_update_timeout) {
 		/* if last sample was processed, we need to write buffers anyway  */
-		if (!write)
+		if(!write && sample != 1)
 			return false;
 	}
 
@@ -899,10 +967,14 @@ bool Session::update_progressive_refine(bool cancel)
 			rtile.buffers = buffers;
 			rtile.sample = sample;
 
-			if(write)
-				write_render_tile_cb(rtile);
-			else
-				update_render_tile_cb(rtile);
+			if(write) {
+				if(write_render_tile_cb)
+					write_render_tile_cb(rtile);
+			}
+			else {
+				if(update_render_tile_cb)
+					update_render_tile_cb(rtile);
+			}
 		}
 	}
 
@@ -925,4 +997,15 @@ void Session::device_free()
 	 */
 }
 
+int Session::get_max_closure_count()
+{
+	int max_closures = 0;  /* Largest closure count among the scene's current shaders. */
+	foreach(Shader *shader, scene->shaders) {
+		const int num_closures = shader->graph->get_num_closures();
+		max_closures = max(max_closures, num_closures);
+	}
+	max_closure_global = max(max_closure_global, max_closures);  /* Only ever grows. */
+	return max_closure_global;
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index 9da7a0aafa3..c669bccd34b 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __SESSION_H__
@@ -32,6 +32,7 @@ CCL_NAMESPACE_BEGIN
 class BufferParams;
 class Device;
 class DeviceScene;
+class DeviceRequestedFeatures;
 class DisplayBuffer;
 class Progress;
 class RenderBuffers;
@@ -59,6 +60,7 @@ public:
 	double cancel_timeout;
 	double reset_timeout;
 	double text_timeout;
+	double progressive_update_timeout;
 
 	ShadingSystem shadingsystem;
 
@@ -80,6 +82,7 @@ public:
 		cancel_timeout = 0.1;
 		reset_timeout = 0.1;
 		text_timeout = 1.0;
+		progressive_update_timeout = 1.0;
 
 		shadingsystem = SHADINGSYSTEM_SVM;
 		tile_order = TILE_CENTER;
@@ -101,6 +104,7 @@ public:
 		&& cancel_timeout == params.cancel_timeout
 		&& reset_timeout == params.reset_timeout
 		&& text_timeout == params.text_timeout
+		&& progressive_update_timeout == params.progressive_update_timeout
 		&& tile_order == params.tile_order
 		&& shadingsystem == params.shadingsystem); }
 
@@ -122,8 +126,8 @@ public:
 	TileManager tile_manager;
 	Stats stats;
 
-	boost::function<void(RenderTile&)> write_render_tile_cb;
-	boost::function<void(RenderTile&)> update_render_tile_cb;
+	function<void(RenderTile&)> write_render_tile_cb;
+	function<void(RenderTile&)> update_render_tile_cb;
 
 	Session(const SessionParams& params);
 	~Session();
@@ -201,6 +205,16 @@ protected:
 	bool update_progressive_refine(bool cancel);
 
 	vector<RenderBuffers *> tile_buffers;
+
+	DeviceRequestedFeatures get_requested_device_features();
+
+	/* ** Split kernel routines ** */
+
+	/* Maximum number of closures seen during the session lifetime. */
+	int max_closure_global;
+
+	/* Get maximum number of closures to be used in kernel. */
+	int get_max_closure_count();
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index d76e511859a..aba3e7237d2 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -11,16 +11,18 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "background.h"
-#include "blackbody.h"
+#include "camera.h"
 #include "device.h"
 #include "graph.h"
+#include "integrator.h"
 #include "light.h"
 #include "mesh.h"
 #include "nodes.h"
+#include "object.h"
 #include "osl.h"
 #include "scene.h"
 #include "shader.h"
@@ -31,6 +33,8 @@
 
 CCL_NAMESPACE_BEGIN
 
+vector<float> ShaderManager::beckmann_table;
+
 /* Beckmann sampling precomputed table, see bsdf_microfacet.h */
 
 /* 2D slope distribution (alpha = 1.0) */
@@ -83,7 +87,7 @@ static void beckmann_table_rows(float *table, int row_from, int row_to)
 			}
 
 			/* CDF of P22_{omega_i}(x_slope, 1, 1), Eq. (10) */
-			CDF_P22_omega_i[index_slope_x] = CDF_P22_omega_i[index_slope_x - 1] + P22_omega_i;
+			CDF_P22_omega_i[index_slope_x] = CDF_P22_omega_i[index_slope_x - 1] + (double)P22_omega_i;
 		}
 
 		/* renormalize CDF_P22_omega_i */
@@ -106,8 +110,8 @@ static void beckmann_table_rows(float *table, int row_from, int row_to)
 
 			/* store value */
 			table[index_U + index_theta*BECKMANN_TABLE_SIZE] = (float)(
-				interp * slope_x[index_slope_x - 1]
-				+ (1.0f-interp) * slope_x[index_slope_x]);
+				interp * slope_x[index_slope_x - 1] +
+				    (1.0 - interp) * slope_x[index_slope_x]);
 		}
 	}
 }
@@ -138,17 +142,18 @@ Shader::Shader()
 	use_mis = true;
 	use_transparent_shadow = true;
 	heterogeneous_volume = true;
-	volume_sampling_method = 0;
+	volume_sampling_method = VOLUME_SAMPLING_DISTANCE;
+	volume_interpolation_method = VOLUME_INTERPOLATION_LINEAR;
 
 	has_surface = false;
 	has_surface_transparent = false;
 	has_surface_emission = false;
 	has_surface_bssrdf = false;
-	has_converter_blackbody = false;
 	has_volume = false;
 	has_displacement = false;
 	has_bssrdf_bump = false;
 	has_heterogeneous_volume = false;
+	has_object_dependency = false;
 
 	used = false;
 
@@ -193,6 +198,7 @@ void Shader::tag_update(Scene *scene)
 	 * e.g. surface attributes when there is only a volume shader. this could
 	 * be more fine grained but it's better than nothing */
 	OutputNode *output = graph->output();
+	bool prev_has_volume = has_volume;
 	has_surface = has_surface || output->input("Surface")->link;
 	has_volume = has_volume || output->input("Volume")->link;
 	has_displacement = has_displacement || output->input("Displacement")->link;
@@ -214,6 +220,11 @@ void Shader::tag_update(Scene *scene)
 		need_update_attributes = true;
 		scene->mesh_manager->need_update = true;
 	}
+
+	if(has_volume != prev_has_volume) {
+		scene->mesh_manager->need_flags_update = true;
+		scene->object_manager->need_flags_update = true;
+	}
 }
 
 void Shader::tag_used(Scene *scene)
@@ -231,7 +242,6 @@ void Shader::tag_used(Scene *scene)
 ShaderManager::ShaderManager()
 {
 	need_update = true;
-	blackbody_table_offset = TABLE_OFFSET_INVALID;
 	beckmann_table_offset = TABLE_OFFSET_INVALID;
 }
 
@@ -243,6 +253,8 @@ ShaderManager *ShaderManager::create(Scene *scene, int shadingsystem)
 {
 	ShaderManager *manager;
 
+	(void)shadingsystem;  /* Ignored when built without OSL. */
+
 #ifdef WITH_OSL
 	if(shadingsystem == SHADINGSYSTEM_OSL)
 		manager = new OSLShaderManager();
@@ -312,7 +324,10 @@ void ShaderManager::device_update_shaders_used(Scene *scene)
 		scene->shaders[light->shader]->used = true;
 }
 
-void ShaderManager::device_update_common(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
+void ShaderManager::device_update_common(Device *device,
+                                         DeviceScene *dscene,
+                                         Scene *scene,
+                                         Progress& /*progress*/)
 {
 	device->tex_free(dscene->shader_flag);
 	dscene->shader_flag.clear();
@@ -323,8 +338,8 @@ void ShaderManager::device_update_common(Device *device, DeviceScene *dscene, Sc
 	uint shader_flag_size = scene->shaders.size()*4;
 	uint *shader_flag = dscene->shader_flag.resize(shader_flag_size);
 	uint i = 0;
-	bool has_converter_blackbody = false;
 	bool has_volumes = false;
+	bool has_transparent_shadow = false;
 
 	foreach(Shader *shader, scene->shaders) {
 		uint flag = 0;
@@ -342,20 +357,22 @@ void ShaderManager::device_update_common(Device *device, DeviceScene *dscene, Sc
 				flag |= SD_HAS_ONLY_VOLUME;
 
 			/* todo: this could check more fine grained, to skip useless volumes
-			 * enclosed inside an opaque bsdf, although we still need to handle
-			 * the case with camera inside volumes too */
+			 * enclosed inside an opaque bsdf.
+			 */
 			flag |= SD_HAS_TRANSPARENT_SHADOW;
 		}
 		if(shader->heterogeneous_volume && shader->has_heterogeneous_volume)
 			flag |= SD_HETEROGENEOUS_VOLUME;
 		if(shader->has_bssrdf_bump)
 			flag |= SD_HAS_BSSRDF_BUMP;
-		if(shader->has_converter_blackbody)
-			has_converter_blackbody = true;
-		if(shader->volume_sampling_method == 1)
+		if(shader->volume_sampling_method == VOLUME_SAMPLING_EQUIANGULAR)
 			flag |= SD_VOLUME_EQUIANGULAR;
-		if(shader->volume_sampling_method == 2)
+		if(shader->volume_sampling_method == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE)
 			flag |= SD_VOLUME_MIS;
+		if(shader->volume_interpolation_method == VOLUME_INTERPOLATION_CUBIC)
+			flag |= SD_VOLUME_CUBIC;
+		if(shader->graph_bump)
+			flag |= SD_HAS_BUMP;
 
 		/* regular shader */
 		shader_flag[i++] = flag;
@@ -367,45 +384,38 @@ void ShaderManager::device_update_common(Device *device, DeviceScene *dscene, Sc
 
 		shader_flag[i++] = flag;
 		shader_flag[i++] = shader->pass_id;
+
+		has_transparent_shadow |= (flag & SD_HAS_TRANSPARENT_SHADOW);
 	}
 
 	device->tex_alloc("__shader_flag", dscene->shader_flag);
 
-	/* blackbody lookup table */
+	/* lookup tables */
 	KernelTables *ktables = &dscene->data.tables;
-	
-	if(has_converter_blackbody && blackbody_table_offset == TABLE_OFFSET_INVALID) {
-		vector<float> table = blackbody_table();
-		blackbody_table_offset = scene->lookup_tables->add_table(dscene, table);
-		
-		ktables->blackbody_offset = (int)blackbody_table_offset;
-	}
-	else if(!has_converter_blackbody && blackbody_table_offset != TABLE_OFFSET_INVALID) {
-		scene->lookup_tables->remove_table(blackbody_table_offset);
-		blackbody_table_offset = TABLE_OFFSET_INVALID;
-	}
 
 	/* beckmann lookup table */
 	if(beckmann_table_offset == TABLE_OFFSET_INVALID) {
-		vector<float> table;
-		beckmann_table_build(table);
-		beckmann_table_offset = scene->lookup_tables->add_table(dscene, table);
-		
+		if(beckmann_table.size() == 0) {
+			thread_scoped_lock lock(lookup_table_mutex);
+			if(beckmann_table.size() == 0) {
+				beckmann_table_build(beckmann_table);
+			}
+		}
+		beckmann_table_offset = scene->lookup_tables->add_table(dscene, beckmann_table);
 		ktables->beckmann_offset = (int)beckmann_table_offset;
 	}
 
 	/* integrator */
 	KernelIntegrator *kintegrator = &dscene->data.integrator;
 	kintegrator->use_volumes = has_volumes;
+	/* TODO(sergey): De-duplicate with flags set in integrator.cpp. */
+	if(scene->integrator->transparent_shadows) {
+		kintegrator->transparent_shadows = has_transparent_shadow;
+	}
 }
 
 void ShaderManager::device_free_common(Device *device, DeviceScene *dscene, Scene *scene)
 {
-	if(blackbody_table_offset != TABLE_OFFSET_INVALID) {
-		scene->lookup_tables->remove_table(blackbody_table_offset);
-		blackbody_table_offset = TABLE_OFFSET_INVALID;
-	}
-
 	if(beckmann_table_offset != TABLE_OFFSET_INVALID) {
 		scene->lookup_tables->remove_table(beckmann_table_offset);
 		beckmann_table_offset = TABLE_OFFSET_INVALID;
@@ -479,5 +489,45 @@ void ShaderManager::add_default(Scene *scene)
 	}
 }
 
+/* NOTE: Expects max_group and features to be initialized by the caller. */
+void ShaderManager::get_requested_graph_features(ShaderGraph *graph,
+                                                 int& max_group,
+                                                 int& features)
+{
+	foreach(ShaderNode *node, graph->nodes) {
+		max_group = max(max_group, node->get_group());
+		features |= node->get_feature();
+		if(node->special_type == SHADER_SPECIAL_TYPE_CLOSURE) {
+			BsdfNode *bsdf_node = static_cast<BsdfNode*>(node);
+			if(CLOSURE_IS_VOLUME(bsdf_node->closure)) {
+				features |= NODE_FEATURE_VOLUME;
+			}
+		}
+	}
+}
+
+void ShaderManager::get_requested_features(Scene *scene,
+                                           int& max_group,
+                                           int& features)
+{
+	max_group = NODE_GROUP_LEVEL_0;
+	features = 0;
+	for(int i = 0; i < scene->shaders.size(); i++) {
+		Shader *shader = scene->shaders[i];
+		/* Gather requested features from all the nodes of the shader graph. */
+		get_requested_graph_features(shader->graph, max_group, features);
+		/* Also gather requested features from the nodes of the bump graph, if any. */
+		if(shader->graph_bump) {
+			get_requested_graph_features(shader->graph_bump,
+			                             max_group,
+			                             features);
+		}
+		ShaderNode *output_node = shader->graph->output();
+		if(output_node->input("Displacement")->link != NULL) {
+			features |= NODE_FEATURE_BUMP;
+		}
+	}
+}
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index 0ed6d2ddf01..64d45635ef1 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -11,24 +11,34 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __SHADER_H__
 #define __SHADER_H__
 
+#ifdef WITH_OSL
+#  if defined(_MSC_VER)
+/* Prevent OSL from polluting the context with weird macros from windows.h.
+ * TODO(sergey): Ideally forward declarations of the needed classes/structs
+ * would be enough here, so this header include could be dropped.
+ */
+#    define NOGDI
+#    define NOMINMAX
+#    define WIN32_LEAN_AND_MEAN
+#  endif
+#  include <OSL/oslexec.h>
+#endif
+
 #include "attribute.h"
 #include "kernel_types.h"
 
 #include "util_map.h"
 #include "util_param.h"
 #include "util_string.h"
+#include "util_thread.h"
 #include "util_types.h"
 
-#ifdef WITH_OSL
-#include <OSL/oslexec.h>
-#endif
-
 CCL_NAMESPACE_BEGIN
 
 class Device;
@@ -44,6 +54,18 @@ enum ShadingSystem {
 	SHADINGSYSTEM_SVM
 };
 
+/* Keep those in sync with the python-defined enum. */
+enum VolumeSampling {
+	VOLUME_SAMPLING_DISTANCE = 0,
+	VOLUME_SAMPLING_EQUIANGULAR = 1,
+	VOLUME_SAMPLING_MULTIPLE_IMPORTANCE = 2,
+};
+
+enum VolumeInterpolation {
+	VOLUME_INTERPOLATION_LINEAR = 0,
+	VOLUME_INTERPOLATION_CUBIC = 1,
+};
+
 /* Shader describing the appearance of a Mesh, Light or Background.
  *
  * While there is only a single shader graph, it has three outputs: surface,
@@ -68,7 +90,8 @@ public:
 	bool use_mis;
 	bool use_transparent_shadow;
 	bool heterogeneous_volume;
-	int volume_sampling_method;
+	VolumeSampling volume_sampling_method;
+	int volume_interpolation_method;
 
 	/* synchronization */
 	bool need_update;
@@ -81,9 +104,9 @@ public:
 	bool has_volume;
 	bool has_displacement;
 	bool has_surface_bssrdf;
-	bool has_converter_blackbody;
 	bool has_bssrdf_bump;
 	bool has_heterogeneous_volume;
+	bool has_object_dependency;
 
 	/* requested mesh attributes */
 	AttributeRequestSet attributes;
@@ -142,14 +165,25 @@ public:
 	 * have any shader assigned explicitly */
 	static void add_default(Scene *scene);
 
+	/* Selective nodes compilation. */
+	void get_requested_features(Scene *scene,
+	                            int& max_group,
+	                            int& features);
+
 protected:
 	ShaderManager();
 
 	typedef unordered_map<ustring, uint, ustringHash> AttributeIDMap;
 	AttributeIDMap unique_attribute_id;
 
-	size_t blackbody_table_offset;
+	thread_mutex lookup_table_mutex;
+	static vector<float> beckmann_table;
+
 	size_t beckmann_table_offset;
+
+	void get_requested_graph_features(ShaderGraph *graph,
+	                                  int& max_group,
+	                                  int& features);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/sky_model.cpp b/intern/cycles/render/sky_model.cpp
index adb07d9e288..c8a5dbe55e0 100644
--- a/intern/cycles/render/sky_model.cpp
+++ b/intern/cycles/render/sky_model.cpp
@@ -4,7 +4,7 @@ This source is published under the following 3-clause BSD license.
 Copyright (c) 2012 - 2013, Lukas Hosek and Alexander Wilkie
 All rights reserved.
 
-Redistribution and use in source and binary forms, with or without 
+Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 
     * Redistributions of source code must retain the above copyright
@@ -12,8 +12,8 @@ modification, are permitted provided that the following conditions are met:
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
-    * None of the names of the contributors may be used to endorse or promote 
-      products derived from this software without specific prior written 
+    * None of the names of the contributors may be used to endorse or promote
+      products derived from this software without specific prior written
       permission.
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
@@ -40,24 +40,24 @@ and the 2013 IEEE CG&A paper
 
        "Adding a Solar Radiance Function to the Hosek Skylight Model"
 
-                                   both by 
+                                   both by
 
                        Lukas Hosek and Alexander Wilkie
                 Charles University in Prague, Czech Republic
 
 
                         Version: 1.4a, February 22nd, 2013
-                        
+
 Version history:
 
 1.4a  February 22nd, 2013
-      Removed unnecessary and counter-intuitive solar radius parameters 
+      Removed unnecessary and counter-intuitive solar radius parameters
       from the interface of the colourspace sky dome initialisation functions.
 
 1.4   February 11th, 2013
       Fixed a bug which caused the relative brightness of the solar disc
-      and the sky dome to be off by a factor of about 6. The sun was too 
-      bright: this affected both normal and alien sun scenarios. The 
+      and the sky dome to be off by a factor of about 6. The sun was too
+      bright: this affected both normal and alien sun scenarios. The
       coefficients of the solar radiance function were changed to fix this.
 
 1.3   January 21st, 2013 (not released to the public)
@@ -81,7 +81,7 @@ Version history:
       the result of a simple conversion from spectral data via the CIE 2 degree
       standard observer matching functions. Therefore, after multiplication
       with 683 lm / W, the Y channel now corresponds to luminance in lm.
-     
+
 1.0   May 11th, 2012
       Initial release.
 
@@ -110,7 +110,7 @@ CCL_NAMESPACE_BEGIN
 //   Some macro definitions that occur elsewhere in ART, and that have to be
 //   replicated to make this a stand-alone module.
 
-#ifndef MATH_PI 
+#ifndef MATH_PI
 #define MATH_PI                     3.141592653589793
 #endif
 
@@ -138,250 +138,231 @@ typedef const double *ArHosekSkyModel_Radiance_Dataset;
 // internal functions
 
 static void ArHosekSkyModel_CookConfiguration(
-        ArHosekSkyModel_Dataset       dataset,
-        ArHosekSkyModelConfiguration  config, 
-        double                        turbidity, 
-        double                        albedo, 
-        double                        solar_elevation
-        )
+        ArHosekSkyModel_Dataset dataset,
+        ArHosekSkyModelConfiguration config,
+        double turbidity,
+        double albedo,
+        double solar_elevation)
 {
-    const double  * elev_matrix;
-
-    int     int_turbidity = (int)turbidity;
-    double  turbidity_rem = turbidity - (double)int_turbidity;
-
-    solar_elevation = pow(solar_elevation / (MATH_PI / 2.0), (1.0 / 3.0));
-
-    // alb 0 low turb
-
-    elev_matrix = dataset + ( 9 * 6 * (int_turbidity-1) );
-    
-    
-    for( unsigned int i = 0; i < 9; ++i )
-    {
-        //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
-        config[i] = 
-        (1.0-albedo) * (1.0 - turbidity_rem) 
-        * ( pow(1.0-solar_elevation, 5.0) * elev_matrix[i]  + 
-           5.0  * pow(1.0-solar_elevation, 4.0) * solar_elevation * elev_matrix[i+9] +
-           10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[i+18] +
-           10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[i+27] +
-           5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[i+36] +
-           pow(solar_elevation, 5.0)  * elev_matrix[i+45]);
-    }
+	const double  * elev_matrix;
 
-    // alb 1 low turb
-    elev_matrix = dataset + (9*6*10 + 9*6*(int_turbidity-1));
-    for(unsigned int i = 0; i < 9; ++i)
-    {
-        //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
-        config[i] += 
-        (albedo) * (1.0 - turbidity_rem)
-        * ( pow(1.0-solar_elevation, 5.0) * elev_matrix[i]  + 
-           5.0  * pow(1.0-solar_elevation, 4.0) * solar_elevation * elev_matrix[i+9] +
-           10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[i+18] +
-           10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[i+27] +
-           5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[i+36] +
-           pow(solar_elevation, 5.0)  * elev_matrix[i+45]);
-    }
+	int int_turbidity = (int)turbidity;
+	double turbidity_rem = turbidity - (double)int_turbidity;
 
-    if(int_turbidity == 10)
-        return;
-
-    // alb 0 high turb
-    elev_matrix = dataset + (9*6*(int_turbidity));
-    for(unsigned int i = 0; i < 9; ++i)
-    {
-        //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
-        config[i] += 
-        (1.0-albedo) * (turbidity_rem)
-        * ( pow(1.0-solar_elevation, 5.0) * elev_matrix[i]  + 
-           5.0  * pow(1.0-solar_elevation, 4.0) * solar_elevation * elev_matrix[i+9] +
-           10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[i+18] +
-           10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[i+27] +
-           5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[i+36] +
-           pow(solar_elevation, 5.0)  * elev_matrix[i+45]);
-    }
+	solar_elevation = pow(solar_elevation / (MATH_PI / 2.0), (1.0 / 3.0));
+
+	// alb 0 low turb
 
-    // alb 1 high turb
-    elev_matrix = dataset + (9*6*10 + 9*6*(int_turbidity));
-    for(unsigned int i = 0; i < 9; ++i)
-    {
-        //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
-        config[i] += 
-        (albedo) * (turbidity_rem)
-        * ( pow(1.0-solar_elevation, 5.0) * elev_matrix[i]  + 
-           5.0  * pow(1.0-solar_elevation, 4.0) * solar_elevation * elev_matrix[i+9] +
-           10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[i+18] +
-           10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[i+27] +
-           5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[i+36] +
-           pow(solar_elevation, 5.0)  * elev_matrix[i+45]);
+	elev_matrix = dataset + ( 9 * 6 * (int_turbidity-1));
+
+	for(unsigned int i = 0; i < 9; ++i) {
+		//(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
+		config[i] =
+			(1.0-albedo) * (1.0 - turbidity_rem)
+			* ( pow(1.0-solar_elevation, 5.0) * elev_matrix[i]  +
+			    5.0  * pow(1.0-solar_elevation, 4.0) * solar_elevation * elev_matrix[i+9] +
+			    10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[i+18] +
+			    10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[i+27] +
+			    5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[i+36] +
+			    pow(solar_elevation, 5.0)  * elev_matrix[i+45]);
     }
+
+	// alb 1 low turb
+	elev_matrix = dataset + (9*6*10 + 9*6*(int_turbidity-1));
+	for(unsigned int i = 0; i < 9; ++i) {
+		//(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
+		config[i] +=
+			(albedo) * (1.0 - turbidity_rem)
+			* ( pow(1.0-solar_elevation, 5.0) * elev_matrix[i]  +
+			    5.0  * pow(1.0-solar_elevation, 4.0) * solar_elevation * elev_matrix[i+9] +
+			    10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[i+18] +
+			    10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[i+27] +
+			    5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[i+36] +
+			    pow(solar_elevation, 5.0)  * elev_matrix[i+45]);
+	}
+
+	if(int_turbidity == 10)
+		return;
+
+	// alb 0 high turb
+	elev_matrix = dataset + (9*6*(int_turbidity));
+	for(unsigned int i = 0; i < 9; ++i) {
+		//(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
+		config[i] +=
+			(1.0-albedo) * (turbidity_rem)
+			* ( pow(1.0-solar_elevation, 5.0) * elev_matrix[i]  +
+			    5.0  * pow(1.0-solar_elevation, 4.0) * solar_elevation * elev_matrix[i+9] +
+			    10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[i+18] +
+			    10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[i+27] +
+			    5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[i+36] +
+			    pow(solar_elevation, 5.0)  * elev_matrix[i+45]);
+	}
+
+	// alb 1 high turb
+	elev_matrix = dataset + (9*6*10 + 9*6*(int_turbidity));
+	for(unsigned int i = 0; i < 9; ++i) {
+		//(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
+		config[i] +=
+			(albedo) * (turbidity_rem)
+			* ( pow(1.0-solar_elevation, 5.0) * elev_matrix[i]  +
+			    5.0  * pow(1.0-solar_elevation, 4.0) * solar_elevation * elev_matrix[i+9] +
+			    10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[i+18] +
+			    10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[i+27] +
+			    5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[i+36] +
+			    pow(solar_elevation, 5.0)  * elev_matrix[i+45]);
+	}
 }
 
 static double ArHosekSkyModel_CookRadianceConfiguration(
-        ArHosekSkyModel_Radiance_Dataset  dataset, 
-        double                            turbidity, 
-        double                            albedo, 
-        double                            solar_elevation
-        )
+        ArHosekSkyModel_Radiance_Dataset dataset,
+        double turbidity,
+        double albedo,
+        double solar_elevation)
 {
-    const double* elev_matrix;
-
-    int int_turbidity = (int)turbidity;
-    double turbidity_rem = turbidity - (double)int_turbidity;
-    double res;
-    solar_elevation = pow(solar_elevation / (MATH_PI / 2.0), (1.0 / 3.0));
-
-    // alb 0 low turb
-    elev_matrix = dataset + (6*(int_turbidity-1));
-    //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
-    res = (1.0-albedo) * (1.0 - turbidity_rem) *
-        ( pow(1.0-solar_elevation, 5.0) * elev_matrix[0] +
-         5.0*pow(1.0-solar_elevation, 4.0)*solar_elevation * elev_matrix[1] +
-         10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[2] +
-         10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[3] +
-         5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[4] +
-         pow(solar_elevation, 5.0) * elev_matrix[5]);
-
-    // alb 1 low turb
-    elev_matrix = dataset + (6*10 + 6*(int_turbidity-1));
-    //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
-    res += (albedo) * (1.0 - turbidity_rem) *
-        ( pow(1.0-solar_elevation, 5.0) * elev_matrix[0] +
-         5.0*pow(1.0-solar_elevation, 4.0)*solar_elevation * elev_matrix[1] +
-         10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[2] +
-         10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[3] +
-         5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[4] +
-         pow(solar_elevation, 5.0) * elev_matrix[5]);
-    if(int_turbidity == 10)
-        return res;
-
-    // alb 0 high turb
-    elev_matrix = dataset + (6*(int_turbidity));
-    //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
-    res += (1.0-albedo) * (turbidity_rem) *
-        ( pow(1.0-solar_elevation, 5.0) * elev_matrix[0] +
-         5.0*pow(1.0-solar_elevation, 4.0)*solar_elevation * elev_matrix[1] +
-         10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[2] +
-         10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[3] +
-         5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[4] +
-         pow(solar_elevation, 5.0) * elev_matrix[5]);
-
-    // alb 1 high turb
-    elev_matrix = dataset + (6*10 + 6*(int_turbidity));
-    //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
-    res += (albedo) * (turbidity_rem) *
-        ( pow(1.0-solar_elevation, 5.0) * elev_matrix[0] +
-         5.0*pow(1.0-solar_elevation, 4.0)*solar_elevation * elev_matrix[1] +
-         10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[2] +
-         10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[3] +
-         5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[4] +
-         pow(solar_elevation, 5.0) * elev_matrix[5]);
-    return res;
+	const double* elev_matrix;
+
+	int int_turbidity = (int)turbidity;
+	double turbidity_rem = turbidity - (double)int_turbidity;
+	double res;
+	solar_elevation = pow(solar_elevation / (MATH_PI / 2.0), (1.0 / 3.0));
+
+	// alb 0 low turb
+	elev_matrix = dataset + (6*(int_turbidity-1));
+	//(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
+	res = (1.0-albedo) * (1.0 - turbidity_rem) *
+		( pow(1.0-solar_elevation, 5.0) * elev_matrix[0] +
+		  5.0*pow(1.0-solar_elevation, 4.0)*solar_elevation * elev_matrix[1] +
+		  10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[2] +
+		  10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[3] +
+		  5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[4] +
+		  pow(solar_elevation, 5.0) * elev_matrix[5]);
+
+	// alb 1 low turb
+	elev_matrix = dataset + (6*10 + 6*(int_turbidity-1));
+	//(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
+	res += (albedo) * (1.0 - turbidity_rem) *
+		( pow(1.0-solar_elevation, 5.0) * elev_matrix[0] +
+		  5.0*pow(1.0-solar_elevation, 4.0)*solar_elevation * elev_matrix[1] +
+		  10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[2] +
+		  10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[3] +
+		  5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[4] +
+		  pow(solar_elevation, 5.0) * elev_matrix[5]);
+	if(int_turbidity == 10)
+		return res;
+
+	// alb 0 high turb
+	elev_matrix = dataset + (6*(int_turbidity));
+	//(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
+	res += (1.0-albedo) * (turbidity_rem) *
+		( pow(1.0-solar_elevation, 5.0) * elev_matrix[0] +
+		  5.0*pow(1.0-solar_elevation, 4.0)*solar_elevation * elev_matrix[1] +
+		  10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[2] +
+		  10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[3] +
+		  5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[4] +
+		  pow(solar_elevation, 5.0) * elev_matrix[5]);
+
+	// alb 1 high turb
+	elev_matrix = dataset + (6*10 + 6*(int_turbidity));
+	//(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
+	res += (albedo) * (turbidity_rem) *
+		( pow(1.0-solar_elevation, 5.0) * elev_matrix[0] +
+		  5.0*pow(1.0-solar_elevation, 4.0)*solar_elevation * elev_matrix[1] +
+		  10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[2] +
+		  10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[3] +
+		  5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[4] +
+		  pow(solar_elevation, 5.0) * elev_matrix[5]);
+	return res;
 }
 
 static double ArHosekSkyModel_GetRadianceInternal(
-        ArHosekSkyModelConfiguration  configuration, 
-        double                        theta, 
-        double                        gamma
-        )
+        ArHosekSkyModelConfiguration configuration,
+        double theta,
+        double gamma)
 {
-    const double expM = exp(configuration[4] * gamma);
-    const double rayM = cos(gamma)*cos(gamma);
-    const double mieM = (1.0 + cos(gamma)*cos(gamma)) / pow((1.0 + configuration[8]*configuration[8] - 2.0*configuration[8]*cos(gamma)), 1.5);
-    const double zenith = sqrt(cos(theta));
+	const double expM = exp(configuration[4] * gamma);
+	const double rayM = cos(gamma)*cos(gamma);
+	const double mieM = (1.0 + cos(gamma)*cos(gamma)) / pow((1.0 + configuration[8]*configuration[8] - 2.0*configuration[8]*cos(gamma)), 1.5);
+	const double zenith = sqrt(cos(theta));
 
-    return (1.0 + configuration[0] * exp(configuration[1] / (cos(theta) + 0.01))) *
+	return (1.0 + configuration[0] * exp(configuration[1] / (cos(theta) + 0.01))) *
             (configuration[2] + configuration[3] * expM + configuration[5] * rayM + configuration[6] * mieM + configuration[7] * zenith);
 }
 
-void arhosekskymodelstate_free(
-        ArHosekSkyModelState  * state
-        )
+void arhosekskymodelstate_free(ArHosekSkyModelState  * state)
 {
-    free(state);
+	free(state);
 }
 
-double arhosekskymodel_radiance(
-        ArHosekSkyModelState  * state,
-        double                  theta, 
-        double                  gamma, 
-        double                  wavelength
-        )
+double arhosekskymodel_radiance(ArHosekSkyModelState  *state,
+                                double theta,
+                                double gamma,
+                                double wavelength)
 {
-    int low_wl = (int)((wavelength - 320.0) / 40.0);
-
-    if ( low_wl < 0 || low_wl >= 11 )
-        return 0.0f;
-
-    double interp = fmod((wavelength - 320.0 ) / 40.0, 1.0);
-
-    double val_low = 
-          ArHosekSkyModel_GetRadianceInternal(
-                state->configs[low_wl],
-                theta,
-                gamma
-              )
-        * state->radiances[low_wl]
-        * state->emission_correction_factor_sky[low_wl];
-
-    if ( interp < 1e-6 )
-        return val_low;
-
-    double result = ( 1.0 - interp ) * val_low;
-
-    if ( low_wl+1 < 11 )
-    {
-        result +=
-              interp
-            * ArHosekSkyModel_GetRadianceInternal(
-                    state->configs[low_wl+1],
-                    theta,
-                    gamma
-                  )
-            * state->radiances[low_wl+1]
-            * state->emission_correction_factor_sky[low_wl+1];
-    }
-
-    return result;
+	int low_wl = (int)((wavelength - 320.0) / 40.0);
+
+	if(low_wl < 0 || low_wl >= 11)
+	    return 0.0f;
+
+	double interp = fmod((wavelength - 320.0 ) / 40.0, 1.0);
+
+	double val_low =
+		ArHosekSkyModel_GetRadianceInternal(
+		        state->configs[low_wl],
+		        theta,
+		        gamma)
+		* state->radiances[low_wl]
+		* state->emission_correction_factor_sky[low_wl];
+
+	if(interp < 1e-6)
+		return val_low;
+
+	double result = ( 1.0 - interp ) * val_low;
+
+    if(low_wl+1 < 11) {
+	    result +=
+		    interp
+		    * ArHosekSkyModel_GetRadianceInternal(
+		            state->configs[low_wl+1],
+		            theta,
+		            gamma)
+		    * state->radiances[low_wl+1]
+		    * state->emission_correction_factor_sky[low_wl+1];
+	}
+
+	return result;
 }
 
 
 // xyz and rgb versions
 
-ArHosekSkyModelState  * arhosek_xyz_skymodelstate_alloc_init(
-        const double  turbidity, 
-        const double  albedo, 
-        const double  elevation
-        )
+ArHosekSkyModelState * arhosek_xyz_skymodelstate_alloc_init(
+        const double turbidity,
+        const double albedo,
+        const double elevation)
 {
-    ArHosekSkyModelState  * state = ALLOC(ArHosekSkyModelState);
-
-    state->solar_radius = TERRESTRIAL_SOLAR_RADIUS;
-    state->turbidity    = turbidity;
-    state->albedo       = albedo;
-    state->elevation    = elevation;
-    
-    for( unsigned int channel = 0; channel < 3; ++channel )
-    {
-        ArHosekSkyModel_CookConfiguration(
-            datasetsXYZ[channel], 
-            state->configs[channel], 
-            turbidity, 
-            albedo, 
-            elevation
-            );
-        
-        state->radiances[channel] = 
-        ArHosekSkyModel_CookRadianceConfiguration(
-            datasetsXYZRad[channel],
-            turbidity, 
-            albedo,
-            elevation
-            );
+	ArHosekSkyModelState  * state = ALLOC(ArHosekSkyModelState);
+
+	state->solar_radius = TERRESTRIAL_SOLAR_RADIUS;
+	state->turbidity = turbidity;
+	state->albedo = albedo;
+	state->elevation = elevation;
+
+    for(unsigned int channel = 0; channel < 3; ++channel) {
+		ArHosekSkyModel_CookConfiguration(
+		    datasetsXYZ[channel],
+		    state->configs[channel],
+		    turbidity,
+		    albedo,
+		    elevation);
+
+		state->radiances[channel] =
+		ArHosekSkyModel_CookRadianceConfiguration(
+		    datasetsXYZRad[channel],
+		    turbidity,
+		    albedo,
+		    elevation);
     }
-    
+
     return state;
 }
 
diff --git a/intern/cycles/render/sky_model.h b/intern/cycles/render/sky_model.h
index 3814543c8b6..237e4e61bf5 100644
--- a/intern/cycles/render/sky_model.h
+++ b/intern/cycles/render/sky_model.h
@@ -4,7 +4,7 @@ This source is published under the following 3-clause BSD license.
 Copyright (c) 2012 - 2013, Lukas Hosek and Alexander Wilkie
 All rights reserved.
 
-Redistribution and use in source and binary forms, with or without 
+Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 
     * Redistributions of source code must retain the above copyright
@@ -12,8 +12,8 @@ modification, are permitted provided that the following conditions are met:
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
-    * None of the names of the contributors may be used to endorse or promote 
-      products derived from this software without specific prior written 
+    * None of the names of the contributors may be used to endorse or promote
+      products derived from this software without specific prior written
       permission.
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
@@ -41,24 +41,24 @@ and the 2013 IEEE CG&A paper
 
        "Adding a Solar Radiance Function to the Hosek Skylight Model"
 
-                                   both by 
+                                   both by
 
                        Lukas Hosek and Alexander Wilkie
                 Charles University in Prague, Czech Republic
 
 
                         Version: 1.4a, February 22nd, 2013
-                        
+
 Version history:
 
 1.4a  February 22nd, 2013
-      Removed unnecessary and counter-intuitive solar radius parameters 
+      Removed unnecessary and counter-intuitive solar radius parameters
       from the interface of the colourspace sky dome initialisation functions.
 
 1.4   February 11th, 2013
       Fixed a bug which caused the relative brightness of the solar disc
-      and the sky dome to be off by a factor of about 6. The sun was too 
-      bright: this affected both normal and alien sun scenarios. The 
+      and the sky dome to be off by a factor of about 6. The sun was too
+      bright: this affected both normal and alien sun scenarios. The
       coefficients of the solar radiance function were changed to fix this.
 
 1.3   January 21st, 2013 (not released to the public)
@@ -82,7 +82,7 @@ Version history:
       the result of a simple conversion from spectral data via the CIE 2 degree
       standard observer matching functions. Therefore, after multiplication
       with 683 lm / W, the Y channel now corresponds to luminance in lm.
-     
+
 1.0   May 11th, 2012
       Initial release.
 
@@ -96,9 +96,9 @@ an updated version of this code has been published!
 /*
 
 This code is taken from ART, a rendering research system written in a
-mix of C99 / Objective C. Since ART is not a small system and is intended to 
-be inter-operable with other libraries, and since C does not have namespaces, 
-the structures and functions in ART all have to have somewhat wordy 
+mix of C99 / Objective C. Since ART is not a small system and is intended to
+be inter-operable with other libraries, and since C does not have namespaces,
+the structures and functions in ART all have to have somewhat wordy
 canonical names that begin with Ar.../ar..., like those seen in this example.
 
 Usage information:
@@ -119,7 +119,7 @@ snippet, we assume that 'albedo' is defined as
 
     double  albedo[num_channels];
 
-with a ground albedo value between [0,1] for each channel. The solar elevation  
+with a ground albedo value between [0,1] for each channel. The solar elevation
 is given in radians.
 
     for ( unsigned int i = 0; i < num_channels; i++ )
@@ -130,11 +130,11 @@ is given in radians.
                   solarElevation
                 );
 
-Note that starting with version 1.3, there is also a second initialisation 
-function which generates skydome states for different solar emission spectra 
+Note that starting with version 1.3, there is also a second initialisation
+function which generates skydome states for different solar emission spectra
 and solar radii: 'arhosekskymodelstate_alienworld_alloc_init()'.
 
-See the notes about the "Alien World" functionality provided further down for a 
+See the notes about the "Alien World" functionality provided further down for a
 discussion of the usefulness and limits of that second initalisation function.
 Sky model states that have been initialized with either function behave in a
 completely identical fashion during use and cleanup.
@@ -155,7 +155,7 @@ on the skydome determined via the angles theta and gamma works as follows:
                 gamma,
                 channel_center[i]
               );
-              
+
 The variable "channel_center" is assumed to hold the channel center wavelengths
 for each of the num_channels samples of the spectrum we are building.
 
@@ -188,114 +188,114 @@ by calling arhosek_rgb_skymodelstate_alloc_init.
 Solar Radiance Function
 -----------------------
 
-For each position on the solar disc, this function returns the entire radiance 
-one sees - direct emission, as well as in-scattered light in the area of the 
-solar disc. The latter is important for low solar elevations - nice images of 
-the setting sun would not be possible without this. This is also the reason why 
-this function, just like the regular sky dome model evaluation function, needs 
-access to the sky dome data structures, as these provide information on 
+For each position on the solar disc, this function returns the entire radiance
+one sees - direct emission, as well as in-scattered light in the area of the
+solar disc. The latter is important for low solar elevations - nice images of
+the setting sun would not be possible without this. This is also the reason why
+this function, just like the regular sky dome model evaluation function, needs
+access to the sky dome data structures, as these provide information on
 in-scattered radiance.
 
 CAVEAT #1: in this release, this function is only provided in spectral form!
            RGB/XYZ versions to follow at a later date.
 
-CAVEAT #2: (fixed from release 1.3 onwards) 
+CAVEAT #2: (fixed from release 1.3 onwards)
 
 CAVEAT #3: limb darkening renders the brightness of the solar disc
            inhomogeneous even for high solar elevations - only taking a single
            sample at the centre of the sun will yield an incorrect power
            estimate for the solar disc! Always take multiple random samples
            across the entire solar disc to estimate its power!
-           
+
 CAVEAT #4: in this version, the limb darkening calculations still use a fairly
-           computationally expensive 5th order polynomial that was directly 
+           computationally expensive 5th order polynomial that was directly
            taken from astronomical literature. For the purposes of Computer
-           Graphics, this is needlessly accurate, though, and will be replaced 
+           Graphics, this is needlessly accurate, though, and will be replaced
            by a cheaper approximation in a future release.
 
 "Alien World" functionality
 ---------------------------
 
-The Hosek sky model can be used to roughly (!) predict the appearance of 
-outdoor scenes on earth-like planets, i.e. planets of a similar size and 
-atmospheric make-up. Since the spectral version of our model predicts sky dome 
-luminance patterns and solar radiance independently for each waveband, and 
-since the intensity of each waveband is solely dependent on the input radiance 
-from the star that the world in question is orbiting, it is trivial to re-scale 
+The Hosek sky model can be used to roughly (!) predict the appearance of
+outdoor scenes on earth-like planets, i.e. planets of a similar size and
+atmospheric make-up. Since the spectral version of our model predicts sky dome
+luminance patterns and solar radiance independently for each waveband, and
+since the intensity of each waveband is solely dependent on the input radiance
+from the star that the world in question is orbiting, it is trivial to re-scale
 the wavebands to match a different star radiance.
 
-At least in theory, the spectral version of the model has always been capable 
-of this sort of thing, and the actual sky dome and solar radiance models were 
+At least in theory, the spectral version of the model has always been capable
+of this sort of thing, and the actual sky dome and solar radiance models were
 actually not altered at all in this release. All we did was to add some support
-functionality for doing this more easily with the existing data and functions, 
+functionality for doing this more easily with the existing data and functions,
 and to add some explanations.
 
 Just use 'arhosekskymodelstate_alienworld_alloc_init()' to initialise the sky
-model states (you will have to provide values for star temperature and solar 
-intensity compared to the terrestrial sun), and do everything else as you 
+model states (you will have to provide values for star temperature and solar
+intensity compared to the terrestrial sun), and do everything else as you
 did before.
 
-CAVEAT #1: we assume the emission of the star that illuminates the alien world 
-           to be a perfect blackbody emission spectrum. This is never entirely 
-           realistic - real star emission spectra are considerably more complex 
-           than this, mainly due to absorption effects in the outer layers of 
-           stars. However, blackbody spectra are a reasonable first assumption 
-           in a usage scenario like this, where 100% accuracy is simply not 
-           necessary: for rendering purposes, there are likely no visible 
-           differences between a highly accurate solution based on a more 
+CAVEAT #1: we assume the emission of the star that illuminates the alien world
+           to be a perfect blackbody emission spectrum. This is never entirely
+           realistic - real star emission spectra are considerably more complex
+           than this, mainly due to absorption effects in the outer layers of
+           stars. However, blackbody spectra are a reasonable first assumption
+           in a usage scenario like this, where 100% accuracy is simply not
+           necessary: for rendering purposes, there are likely no visible
+           differences between a highly accurate solution based on a more
            involved simulation, and this approximation.
 
 CAVEAT #2: we always use limb darkening data from our own sun to provide this
-           "appearance feature", even for suns of strongly different 
-           temperature. Which is presumably not very realistic, but (as with 
-           the unaltered blackbody spectrum from caveat #1) probably not a bad 
+           "appearance feature", even for suns of strongly different
+           temperature. Which is presumably not very realistic, but (as with
+           the unaltered blackbody spectrum from caveat #1) probably not a bad
            first guess, either. If you need more accuracy than we provide here,
            please make inquiries with a friendly astro-physicst of your choice.
 
-CAVEAT #3: you have to provide a value for the solar intensity of the star 
-           which illuminates the alien world. For this, please bear in mind  
-           that there is very likely a comparatively tight range of absolute  
-           solar irradiance values for which an earth-like planet with an  
-           atmosphere like the one we assume in our model can exist in the  
+CAVEAT #3: you have to provide a value for the solar intensity of the star
+           which illuminates the alien world. For this, please bear in mind
+           that there is very likely a comparatively tight range of absolute
+           solar irradiance values for which an earth-like planet with an
+           atmosphere like the one we assume in our model can exist in the
            first place!
-            
-           Too much irradiance, and the atmosphere probably boils off into 
-           space, too little, it freezes. Which means that stars of 
-           considerably different emission colour than our sun will have to be 
-           fairly different in size from it, to still provide a reasonable and 
-           inhabitable amount of irradiance. Red stars will need to be much 
-           larger than our sun, while white or blue stars will have to be 
-           comparatively tiny. The initialisation function handles this and 
+
+           Too much irradiance, and the atmosphere probably boils off into
+           space, too little, it freezes. Which means that stars of
+           considerably different emission colour than our sun will have to be
+           fairly different in size from it, to still provide a reasonable and
+           inhabitable amount of irradiance. Red stars will need to be much
+           larger than our sun, while white or blue stars will have to be
+           comparatively tiny. The initialisation function handles this and
            computes a plausible solar radius for a given emission spectrum. In
            terms of absolute radiometric values, you should probably not stray
            all too far from a solar intensity value of 1.0.
 
-CAVEAT #4: although we now support different solar radii for the actual solar 
-           disc, the sky dome luminance patterns are *not* parameterised by 
-           this value - i.e. the patterns stay exactly the same for different 
-           solar radii! Which is of course not correct. But in our experience, 
-           solar discs up to several degrees in diameter (! - our own sun is 
-           half a degree across) do not cause the luminance patterns on the sky 
-           to change perceptibly. The reason we know this is that we initially 
-           used unrealistically large suns in our brute force path tracer, in 
-           order to improve convergence speeds (which in the beginning were 
-           abysmal). Later, we managed to do the reference renderings much 
-           faster even with realistically small suns, and found that there was 
-           no real difference in skydome appearance anyway. 
-           Conclusion: changing the solar radius should not be over-done, so  
-           close orbits around red supergiants are a no-no. But for the  
-           purposes of getting a fairly credible first impression of what an 
-           alien world with a reasonably sized sun would look like, what we are  
+CAVEAT #4: although we now support different solar radii for the actual solar
+           disc, the sky dome luminance patterns are *not* parameterised by
+           this value - i.e. the patterns stay exactly the same for different
+           solar radii! Which is of course not correct. But in our experience,
+           solar discs up to several degrees in diameter (! - our own sun is
+           half a degree across) do not cause the luminance patterns on the sky
+           to change perceptibly. The reason we know this is that we initially
+           used unrealistically large suns in our brute force path tracer, in
+           order to improve convergence speeds (which in the beginning were
+           abysmal). Later, we managed to do the reference renderings much
+           faster even with realistically small suns, and found that there was
+           no real difference in skydome appearance anyway.
+           Conclusion: changing the solar radius should not be over-done, so
+           close orbits around red supergiants are a no-no. But for the
+           purposes of getting a fairly credible first impression of what an
+           alien world with a reasonably sized sun would look like, what we are
            doing here is probably still o.k.
 
-HINT #1:   if you want to model the sky of an earth-like planet that orbits 
-           a binary star, just super-impose two of these models with solar 
+HINT #1:   if you want to model the sky of an earth-like planet that orbits
+           a binary star, just super-impose two of these models with solar
            intensity of ~0.5 each, and closely spaced solar positions. Light is
            additive, after all. Tattooine, here we come... :-)
 
            P.S. according to Star Wars canon, Tattooine orbits a binary
-           that is made up of a G and K class star, respectively. 
-           So ~5500K and ~4200K should be good first guesses for their 
+           that is made up of a G and K class star, respectively.
+           So ~5500K and ~4200K should be good first guesses for their
            temperature. Just in case you were wondering, after reading the
            previous paragraph.
 */
@@ -316,37 +316,37 @@ typedef double ArHosekSkyModelConfiguration[9];
     ---------------------------
 
     This struct holds the pre-computation data for one particular albedo value.
-    Most fields are self-explanatory, but users should never directly 
-    manipulate any of them anyway. The only consistent way to manipulate such 
-    structs is via the functions 'arhosekskymodelstate_alloc_init' and 
+    Most fields are self-explanatory, but users should never directly
+    manipulate any of them anyway. The only consistent way to manipulate such
+    structs is via the functions 'arhosekskymodelstate_alloc_init' and
     'arhosekskymodelstate_free'.
-    
+
     'emission_correction_factor_sky'
     'emission_correction_factor_sun'
 
-        The original model coefficients were fitted against the emission of 
+        The original model coefficients were fitted against the emission of
         our local sun. If a different solar emission is desired (i.e. if the
-        model is being used to predict skydome appearance for an earth-like 
-        planet that orbits a different star), these correction factors, which 
-        are determined during the alloc_init step, are applied to each waveband 
-        separately (they default to 1.0 in normal usage). This is the simplest 
-        way to retrofit this sort of capability to the existing model. The 
-        different factors for sky and sun are needed since the solar disc may 
+        model is being used to predict skydome appearance for an earth-like
+        planet that orbits a different star), these correction factors, which
+        are determined during the alloc_init step, are applied to each waveband
+        separately (they default to 1.0 in normal usage). This is the simplest
+        way to retrofit this sort of capability to the existing model. The
+        different factors for sky and sun are needed since the solar disc may
         be of a different size compared to the terrestrial sun.
 
 ---------------------------------------------------------------------------- */
 
 typedef struct ArHosekSkyModelState
 {
-    ArHosekSkyModelConfiguration  configs[11];
-    double                        radiances[11];
-    double                        turbidity;
-    double                        solar_radius;
-    double                        emission_correction_factor_sky[11];
-    double                        emission_correction_factor_sun[11];
-    double                        albedo;
-    double                        elevation;
-} 
+	ArHosekSkyModelConfiguration  configs[11];
+	double                        radiances[11];
+	double                        turbidity;
+	double                        solar_radius;
+	double                        emission_correction_factor_sky[11];
+	double                        emission_correction_factor_sun[11];
+	double                        albedo;
+	double                        elevation;
+}
 ArHosekSkyModelState;
 
 /* ----------------------------------------------------------------------------
@@ -358,11 +358,10 @@ ArHosekSkyModelState;
 
 ---------------------------------------------------------------------------- */
 
-ArHosekSkyModelState  * arhosekskymodelstate_alloc_init(
-        const double  solar_elevation,
-        const double  atmospheric_turbidity,
-        const double  ground_albedo
-        );
+ArHosekSkyModelState *arhosekskymodelstate_alloc_init(
+        const double solar_elevation,
+        const double atmospheric_turbidity,
+        const double ground_albedo);
 
 
 /* ----------------------------------------------------------------------------
@@ -375,78 +374,67 @@ ArHosekSkyModelState  * arhosekskymodelstate_alloc_init(
     'solar_intensity' controls the overall brightness of the sky, relative
     to the solar irradiance on Earth. A value of 1.0 yields a sky dome that
     is, on average over the wavelenghts covered in the model (!), as bright
-    as the terrestrial sky in radiometric terms. 
-    
-    Which means that the solar radius has to be adjusted, since the 
-    emissivity of a solar surface with a given temperature is more or less 
-    fixed. So hotter suns have to be smaller to be equally bright as the 
+    as the terrestrial sky in radiometric terms.
+
+    Which means that the solar radius has to be adjusted, since the
+    emissivity of a solar surface with a given temperature is more or less
+    fixed. So hotter suns have to be smaller to be equally bright as the
     terrestrial sun, while cooler suns have to be larger. Note that there are
     limits to the validity of the luminance patterns of the underlying model:
     see the discussion above for more on this. In particular, an alien sun with
     a surface temperature of only 2000 Kelvin has to be very large if it is
-    to be as bright as the terrestrial sun - so large that the luminance 
+    to be as bright as the terrestrial sun - so large that the luminance
     patterns are no longer a really good fit in that case.
-    
+
     If you need information about the solar radius that the model computes
-    for a given temperature (say, for light source sampling purposes), you 
-    have to query the 'solar_radius' variable of the sky model state returned 
+    for a given temperature (say, for light source sampling purposes), you
+    have to query the 'solar_radius' variable of the sky model state returned
     *after* running this function.
 
 ---------------------------------------------------------------------------- */
 
-ArHosekSkyModelState  * arhosekskymodelstate_alienworld_alloc_init(
-        const double  solar_elevation,
-        const double  solar_intensity,
-        const double  solar_surface_temperature_kelvin,
-        const double  atmospheric_turbidity,
-        const double  ground_albedo
-        );
-
-void arhosekskymodelstate_free(
-        ArHosekSkyModelState  * state
-        );
-
-double arhosekskymodel_radiance(
-        ArHosekSkyModelState  * state,
-        double                  theta, 
-        double                  gamma, 
-        double                  wavelength
-        );
+ArHosekSkyModelState* arhosekskymodelstate_alienworld_alloc_init(
+        const double solar_elevation,
+        const double solar_intensity,
+        const double solar_surface_temperature_kelvin,
+        const double atmospheric_turbidity,
+        const double ground_albedo);
+
+void arhosekskymodelstate_free(ArHosekSkyModelState  *state);
+
+double arhosekskymodel_radiance(ArHosekSkyModelState *state,
+                                double theta,
+                                double gamma,
+                                double wavelength);
 
 // CIE XYZ and RGB versions
 
 
 ArHosekSkyModelState  * arhosek_xyz_skymodelstate_alloc_init(
-        const double  turbidity, 
-        const double  albedo, 
-        const double  elevation
-        );
+        const double turbidity,
+        const double albedo,
+        const double elevation);
 
 
 ArHosekSkyModelState  * arhosek_rgb_skymodelstate_alloc_init(
-        const double  turbidity, 
-        const double  albedo, 
-        const double  elevation
-        );
+        const double turbidity,
+        const double albedo,
+        const double elevation);
 
 
-double arhosek_tristim_skymodel_radiance(
-        ArHosekSkyModelState  * state,
-        double                  theta,
-        double                  gamma, 
-        int                     channel
-        );
+double arhosek_tristim_skymodel_radiance(ArHosekSkyModelState* state,
+                                         double theta,
+                                         double gamma,
+                                         int channel);
 
 //   Delivers the complete function: sky + sun, including limb darkening.
 //   Please read the above description before using this - there are several
 //   caveats!
 
-double arhosekskymodel_solar_radiance(
-        ArHosekSkyModelState      * state,
-        double                      theta,
-        double                      gamma,
-        double                      wavelength
-        );
+double arhosekskymodel_solar_radiance(ArHosekSkyModelState* state,
+                                      double theta,
+                                      double gamma,
+                                      double wavelength);
 
 
 #endif // _SKY_MODEL_H_
diff --git a/intern/cycles/render/sky_model_data.h b/intern/cycles/render/sky_model_data.h
index 4171bd12756..e6f3f761532 100644
--- a/intern/cycles/render/sky_model_data.h
+++ b/intern/cycles/render/sky_model_data.h
@@ -4,7 +4,7 @@ This source is published under the following 3-clause BSD license.
 Copyright (c) 2012 - 2013, Lukas Hosek and Alexander Wilkie
 All rights reserved.
 
-Redistribution and use in source and binary forms, with or without 
+Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 
     * Redistributions of source code must retain the above copyright
@@ -12,8 +12,8 @@ modification, are permitted provided that the following conditions are met:
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
-    * None of the names of the contributors may be used to endorse or promote 
-      products derived from this software without specific prior written 
+    * None of the names of the contributors may be used to endorse or promote
+      products derived from this software without specific prior written
       permission.
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
@@ -41,24 +41,24 @@ and the 2013 IEEE CG&A paper
 
        "Adding a Solar Radiance Function to the Hosek Skylight Model"
 
-                                   both by 
+                                   both by
 
                        Lukas Hosek and Alexander Wilkie
                 Charles University in Prague, Czech Republic
 
 
                         Version: 1.4a, February 22nd, 2013
-                        
+
 Version history:
 
 1.4a  February 22nd, 2013
-      Removed unnecessary and counter-intuitive solar radius parameters 
+      Removed unnecessary and counter-intuitive solar radius parameters
       from the interface of the colourspace sky dome initialisation functions.
 
 1.4   February 11th, 2013
       Fixed a bug which caused the relative brightness of the solar disc
-      and the sky dome to be off by a factor of about 6. The sun was too 
-      bright: this affected both normal and alien sun scenarios. The 
+      and the sky dome to be off by a factor of about 6. The sun was too
+      bright: this affected both normal and alien sun scenarios. The
       coefficients of the solar radiance function were changed to fix this.
 
 1.3   January 21st, 2013 (not released to the public)
@@ -82,7 +82,7 @@ Version history:
       the result of a simple conversion from spectral data via the CIE 2 degree
       standard observer matching functions. Therefore, after multiplication
       with 683 lm / W, the Y channel now corresponds to luminance in lm.
-     
+
 1.0   May 11th, 2012
       Initial release.
 
@@ -96,15 +96,14 @@ CCL_NAMESPACE_BEGIN
 
 /*
 
-This file contains the coefficient data for the XYZ colour space version of 
+This file contains the coefficient data for the XYZ colour space version of
 the model.
 
 */
 
 // Uses Sep 9 pattern / Aug 23 mean dataset
 
-static const double datasetXYZ1[] =
-{
+static const double datasetXYZ1[] = {
 	// albedo 0, turbidity 1
 	-1.117001e+000,
 	-1.867262e-001,
@@ -3849,15 +3848,13 @@ static const double datasetXYZRad3[] =
 
 
 
-static const double* datasetsXYZ[] =
-{
+static const double* datasetsXYZ[] = {
 	datasetXYZ1,
 	datasetXYZ2,
 	datasetXYZ3
 };
 
-static const double* datasetsXYZRad[] =
-{
+static const double* datasetsXYZRad[] = {
 	datasetXYZRad1,
 	datasetXYZRad2,
 	datasetXYZRad3
diff --git a/intern/cycles/render/sobol.h b/intern/cycles/render/sobol.h
index b5eaa67db3e..574f148b9a2 100644
--- a/intern/cycles/render/sobol.h
+++ b/intern/cycles/render/sobol.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __SOBOL_H__
diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp
index 204c8fa2ce7..2dfa9fc98cc 100644
--- a/intern/cycles/render/svm.cpp
+++ b/intern/cycles/render/svm.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "camera.h"
@@ -26,6 +26,7 @@
 #include "svm.h"
 
 #include "util_debug.h"
+#include "util_logging.h"
 #include "util_foreach.h"
 #include "util_progress.h"
 
@@ -41,12 +42,14 @@ SVMShaderManager::~SVMShaderManager()
 {
 }
 
-void SVMShaderManager::reset(Scene *scene)
+void SVMShaderManager::reset(Scene * /*scene*/)
 {
 }
 
 void SVMShaderManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
+	VLOG(1) << "Total " << scene->shaders.size() << " shaders.";
+
 	if(!need_update)
 		return;
 
@@ -367,7 +370,7 @@ uint SVMCompiler::attribute(AttributeStandard std)
 	return shader_manager->get_attribute_id(std);
 }
 
-bool SVMCompiler::node_skip_input(ShaderNode *node, ShaderInput *input)
+bool SVMCompiler::node_skip_input(ShaderNode * /*node*/, ShaderInput *input)
 {
 	/* nasty exception .. */
 	if(current_type == SHADER_TYPE_DISPLACEMENT && input->link && input->link->parent->name == ustring("bump"))
@@ -403,9 +406,9 @@ void SVMCompiler::generate_node(ShaderNode *node, set<ShaderNode*>& done)
 			current_shader->has_heterogeneous_volume = true;
 	}
 
-	/* detect if we have a blackbody converter, to prepare lookup table */
-	if(node->has_converter_blackbody())
-		current_shader->has_converter_blackbody = true;
+	if(node->has_object_dependency()) {
+		current_shader->has_object_dependency = true;
+	}
 }
 
 void SVMCompiler::generate_svm_nodes(const set<ShaderNode*>& nodes, set<ShaderNode*>& done)
@@ -752,10 +755,10 @@ void SVMCompiler::compile(Shader *shader, vector<int4>& global_svm_nodes, int in
 	shader->has_surface_transparent = false;
 	shader->has_surface_bssrdf = false;
 	shader->has_bssrdf_bump = false;
-	shader->has_converter_blackbody = false;
 	shader->has_volume = false;
 	shader->has_displacement = false;
 	shader->has_heterogeneous_volume = false;
+	shader->has_object_dependency = false;
 
 	/* generate surface shader */
 	compile_type(shader, shader->graph, SHADER_TYPE_SURFACE);
diff --git a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h
index 4a666bade55..239e9781cef 100644
--- a/intern/cycles/render/svm.h
+++ b/intern/cycles/render/svm.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __SVM_H__
diff --git a/intern/cycles/render/tables.cpp b/intern/cycles/render/tables.cpp
index a8d502c432d..ad3f4866072 100644
--- a/intern/cycles/render/tables.cpp
+++ b/intern/cycles/render/tables.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "device.h"
@@ -19,6 +19,7 @@
 #include "tables.h"
 
 #include "util_debug.h"
+#include "util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -36,6 +37,8 @@ LookupTables::~LookupTables()
 
 void LookupTables::device_update(Device *device, DeviceScene *dscene)
 {
+	VLOG(1) << "Total " << lookup_tables.size() << " lookup tables.";
+
 	if(!need_update)
 		return;
 
diff --git a/intern/cycles/render/tables.h b/intern/cycles/render/tables.h
index 059940cbeb6..4efa09fa3c1 100644
--- a/intern/cycles/render/tables.h
+++ b/intern/cycles/render/tables.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __TABLES_H__
diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp
index d6094a4fa0a..7e68ce84d94 100644
--- a/intern/cycles/render/tile.cpp
+++ b/intern/cycles/render/tile.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "tile.h"
@@ -28,6 +28,7 @@ TileManager::TileManager(bool progressive_, int num_samples_, int2 tile_size_, i
 	tile_size = tile_size_;
 	tile_order = tile_order_;
 	start_resolution = start_resolution_;
+	num_samples = num_samples_;
 	num_devices = num_devices_;
 	preserve_tile_device = preserve_tile_device_;
 	background = background_;
@@ -200,8 +201,8 @@ list<Tile>::iterator TileManager::next_background_tile(int device, TileOrder til
 			
 			switch (tile_order) {
 				case TILE_CENTER:
-					distx = centx - (cur_tile.x + cur_tile.w);
-					disty = centy - (cur_tile.y + cur_tile.h);
+					distx = centx - (cur_tile.x + (cur_tile.w / 2));
+					disty = centy - (cur_tile.y + (cur_tile.h / 2));
 					distx = (int64_t)sqrt((double)(distx * distx + disty * disty));
 					break;
 				case TILE_RIGHT_TO_LEFT:
@@ -234,7 +235,7 @@ bool TileManager::next_tile(Tile& tile, int device)
 {
 	list<Tile>::iterator tile_it;
 	
-	if (background)
+	if(background)
 		tile_it = next_background_tile(device, tile_order);
 	else
 		tile_it = next_viewport_tile(device);
diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h
index 7796518b6bc..c9bdc86f029 100644
--- a/intern/cycles/render/tile.h
+++ b/intern/cycles/render/tile.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __TILE_H__
diff --git a/intern/cycles/subd/subd_dice.cpp b/intern/cycles/subd/subd_dice.cpp
index 05ff5ca4b65..44bab066dde 100644
--- a/intern/cycles/subd/subd_dice.cpp
+++ b/intern/cycles/subd/subd_dice.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "camera.h"
@@ -117,8 +117,8 @@ void EdgeDice::stitch_triangles(Patch *patch, vector<int>& outer, vector<int>& i
 		}
 		else {
 			/* length of diagonals */
-			float len1 = len(mesh_P[inner[i]] - mesh_P[outer[j+1]]);
-			float len2 = len(mesh_P[outer[j]] - mesh_P[inner[i+1]]);
+			float len1 = len_squared(mesh_P[inner[i]] - mesh_P[outer[j+1]]);
+			float len2 = len_squared(mesh_P[outer[j]] - mesh_P[inner[i+1]]);
 
 			/* use smallest diagonal */
 			if(len1 < len2)
diff --git a/intern/cycles/subd/subd_dice.h b/intern/cycles/subd/subd_dice.h
index 9cf5b0d50b8..b7e61748779 100644
--- a/intern/cycles/subd/subd_dice.h
+++ b/intern/cycles/subd/subd_dice.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __SUBD_DICE_H__
diff --git a/intern/cycles/subd/subd_mesh.cpp b/intern/cycles/subd/subd_mesh.cpp
index 0db20656f39..17a730e5efe 100644
--- a/intern/cycles/subd/subd_mesh.cpp
+++ b/intern/cycles/subd/subd_mesh.cpp
@@ -109,8 +109,8 @@ public:
 		evalctrl.EvalLimitSample<OsdCpuVertexBuffer,OsdCpuVertexBuffer>(coords, evalctx, 0);
 
 		*P_ = make_float3(P[0], P[1], P[2]);
-		if (dPdu_) *dPdu_ = make_float3(dPdv[0], dPdv[1], dPdv[2]);
-		if (dPdv_) *dPdv_ = make_float3(dPdu[0], dPdu[1], dPdu[2]);
+		if(dPdu_) *dPdu_ = make_float3(dPdv[0], dPdv[1], dPdv[2]);
+		if(dPdv_) *dPdv_ = make_float3(dPdu[0], dPdu[1], dPdu[2]);
 
 		/* optimize: skip evaluating derivatives when not needed */
 		/* todo: swapped derivatives, different winding convention? */
@@ -234,7 +234,7 @@ bool OpenSubdMesh::finish()
 
 void OpenSubdMesh::tessellate(DiagSplit *split)
 {
-	if (num_ptex_faces == 0)
+	if(num_ptex_faces == 0)
 		return;
 
 	const int level = 3;
diff --git a/intern/cycles/subd/subd_patch.cpp b/intern/cycles/subd/subd_patch.cpp
index fe9fa791813..0db46ec492d 100644
--- a/intern/cycles/subd/subd_patch.cpp
+++ b/intern/cycles/subd/subd_patch.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* Parts adapted from code in the public domain in NVidia Mesh Tools. */
diff --git a/intern/cycles/subd/subd_patch.h b/intern/cycles/subd/subd_patch.h
index 48f35d78711..9be4606c248 100644
--- a/intern/cycles/subd/subd_patch.h
+++ b/intern/cycles/subd/subd_patch.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __SUBD_PATCH_H__
diff --git a/intern/cycles/subd/subd_split.cpp b/intern/cycles/subd/subd_split.cpp
index 6bbf4af3f85..df4d451e8eb 100644
--- a/intern/cycles/subd/subd_split.cpp
+++ b/intern/cycles/subd/subd_split.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "camera.h"
diff --git a/intern/cycles/subd/subd_split.h b/intern/cycles/subd/subd_split.h
index 3f9a2721977..df4935ee624 100644
--- a/intern/cycles/subd/subd_split.h
+++ b/intern/cycles/subd/subd_split.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __SUBD_SPLIT_H__
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 6120e7e8456..0acb9e9304c 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -1,16 +1,16 @@
 
 set(INC
 	.
+	../../glew-mx
 )
 
 set(INC_SYS
-	${GLEW_INCLUDE_PATH}
-	${OPENGL_INCLUDE_DIR}
+	${GLEW_INCLUDE_DIR}
 )
 
 set(SRC
+	util_aligned_malloc.cpp
 	util_cache.cpp
-	util_dynlib.cpp
 	util_logging.cpp
 	util_md5.cpp
 	util_path.cpp
@@ -22,6 +22,10 @@ set(SRC
 	util_transform.cpp
 )
 
+if(NOT CYCLES_STANDALONE_REPOSITORY)
+	add_definitions(-DWITH_GLEW_MX)
+endif()
+
 if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI)
 	list(APPEND SRC
 		util_view.cpp
@@ -30,11 +34,12 @@ endif()
 
 set(SRC_HEADERS
 	util_algorithm.h
+	util_aligned_malloc.h
 	util_args.h
+	util_atomic.h
 	util_boundbox.h
 	util_cache.h
 	util_debug.h
-	util_dynlib.h
 	util_foreach.h
 	util_function.h
 	util_half.h
@@ -44,6 +49,7 @@ set(SRC_HEADERS
 	util_logging.h
 	util_map.h
 	util_math.h
+	util_math_fast.h
 	util_md5.h
 	util_opengl.h
 	util_optimization.h
@@ -68,7 +74,18 @@ set(SRC_HEADERS
 	util_xml.h
 )
 
+if(WITH_CYCLES_DEBUG)
+	list(APPEND SRC
+		util_guarded_allocator.cpp
+	)
+	list(APPEND SRC_HEADERS
+		util_guarded_allocator.h
+	)
+endif()
+
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
+add_definitions(${GL_DEFINITIONS})
+
 add_library(cycles_util ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/util/util_algorithm.h b/intern/cycles/util/util_algorithm.h
index 5865f3f04bb..5c79c00cc98 100644
--- a/intern/cycles/util/util_algorithm.h
+++ b/intern/cycles/util/util_algorithm.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_ALGORITHM_H__
diff --git a/intern/cycles/util/util_aligned_malloc.cpp b/intern/cycles/util/util_aligned_malloc.cpp
new file mode 100644
index 00000000000..b161a55c15e
--- /dev/null
+++ b/intern/cycles/util/util_aligned_malloc.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util_aligned_malloc.h"
+#include "util_guarded_allocator.h"
+
+#include <cassert>
+
+/* Adopted from Libmv. */
+
+#if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__NetBSD__)
+/* Needed for memalign on Linux and _aligned_malloc on Windows. */
+#  ifdef FREE_WINDOWS
+/* Make sure _aligned_malloc is included. */
+#    ifdef __MSVCRT_VERSION__
+#      undef __MSVCRT_VERSION__
+#    endif
+#    define __MSVCRT_VERSION__ 0x0700
+#  endif  /* FREE_WINDOWS */
+#  include <malloc.h>
+#else
+/* Apple's malloc is 16-byte aligned, and does not have malloc.h, so include
+ * stdlib instead.
+ */
+#  include <cstdlib>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+void *util_aligned_malloc(size_t size, int alignment)
+{
+	/* One #if/#elif chain so only a single allocator branch is ever compiled. */
+#if defined(WITH_BLENDER_GUARDEDALLOC)
+	return MEM_mallocN_aligned(size, alignment, "Cycles Aligned Alloc");
+#elif defined(_WIN32)
+	return _aligned_malloc(size, alignment);
+#elif defined(__APPLE__)
+	/* On Mac OS X, both the heap and the stack are guaranteed 16-byte aligned so
+	 * they work natively with SSE types with no further work.
+	 */
+	assert(alignment == 16);
+	return malloc(size);
+#elif defined(__FreeBSD__) || defined(__NetBSD__)
+	void *result;
+	if(posix_memalign(&result, alignment, size)) {
+		/* Non-zero means allocation error
+		 * either no allocation or bad alignment value.
+		 */
+		return NULL;
+	}
+	return result;
+#else  /* This is for Linux. */
+	return memalign(alignment, size);
+#endif
+}
+
+void util_aligned_free(void *ptr)
+{
+	/* Release with the deallocator paired to util_aligned_malloc()'s backend. */
+#if defined(WITH_BLENDER_GUARDEDALLOC)
+	if(ptr)
+		MEM_freeN(ptr);
+#elif defined(_WIN32)
+	_aligned_free(ptr);
+#else
+	free(ptr);
+#endif
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_dynlib.h b/intern/cycles/util/util_aligned_malloc.h
index b30cf98c1b9..ecc0f28c376 100644
--- a/intern/cycles/util/util_dynlib.h
+++ b/intern/cycles/util/util_aligned_malloc.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011-2013 Blender Foundation
+ * Copyright 2011-2015 Blender Foundation
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -11,21 +11,22 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
-#ifndef __UTIL_DYNLIB_H__
-#define __UTIL_DYNLIB_H__
+#ifndef __UTIL_ALIGNED_MALLOC_H__
+#define __UTIL_ALIGNED_MALLOC_H__
+
+#include "util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
-struct DynamicLibrary;
+/* Allocate block of size bytes at least aligned to a given value. */
+void *util_aligned_malloc(size_t size, int alignment);
 
-DynamicLibrary *dynamic_library_open(const char *name);
-void *dynamic_library_find(DynamicLibrary *lib, const char *name);
-void dynamic_library_close(DynamicLibrary *lib);
+/* Free memory allocated by util_aligned_malloc. */
+void util_aligned_free(void *ptr);
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_DYNLIB_H__ */
-
+#endif  /* __UTIL_ALIGNED_MALLOC_H__ */
diff --git a/intern/cycles/util/util_args.h b/intern/cycles/util/util_args.h
index 55933c895bf..a53fc061758 100644
--- a/intern/cycles/util/util_args.h
+++ b/intern/cycles/util/util_args.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_ARGS_H__
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
new file mode 100644
index 00000000000..1d1e2963348
--- /dev/null
+++ b/intern/cycles/util/util_atomic.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_ATOMIC_H__
+#define __UTIL_ATOMIC_H__
+
+#ifndef __KERNEL_GPU__
+
+/* Using atomic ops header from Blender. */
+#include "atomic_ops.h"
+
+ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value)
+{
+	/* Lock-free max: re-read on every pass so a lost CAS race is retried. */
+	size_t prev_value;
+	while((prev_value = *maximum_value) < value) {
+		if(atomic_cas_z(maximum_value, prev_value, value) == prev_value)
+			break;
+	}
+}
+
+#else  /* __KERNEL_GPU__ */
+
+#ifdef __KERNEL_OPENCL__
+
+/* Float atomics implementation credits:
+ *   http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html
+ */
+ccl_device_inline void atomic_add_float(volatile ccl_global float *source,
+                                        const float operand)
+{
+	/* Emulate a float atomic add with a compare-exchange on the raw 32-bit
+	 * pattern: read the current value, compute the sum, and publish it only
+	 * if *source still holds the bits that were read; otherwise try again.
+	 */
+	union {
+		unsigned int int_value;
+		float float_value;
+	} prev_value, new_value;
+	do {
+		prev_value.float_value = *source;
+		new_value.float_value = prev_value.float_value + operand;
+	} while(atomic_cmpxchg((volatile ccl_global unsigned int *)source,
+	                       prev_value.int_value,
+	                       new_value.int_value) != prev_value.int_value);
+}
+
+#endif  /* __KERNEL_OPENCL__ */
+
+#endif  /* __KERNEL_GPU__ */
+
+#endif /* __UTIL_ATOMIC_H__ */
diff --git a/intern/cycles/util/util_boundbox.h b/intern/cycles/util/util_boundbox.h
index a71e0399619..cef5adc0a61 100644
--- a/intern/cycles/util/util_boundbox.h
+++ b/intern/cycles/util/util_boundbox.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_BOUNDBOX_H__
diff --git a/intern/cycles/util/util_cache.cpp b/intern/cycles/util/util_cache.cpp
index e20c3a67b75..5eebfb18155 100644
--- a/intern/cycles/util/util_cache.cpp
+++ b/intern/cycles/util/util_cache.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include <stdio.h>
@@ -24,12 +24,6 @@
 #include "util_path.h"
 #include "util_types.h"
 
-#include <boost/version.hpp>
-
-#if (BOOST_VERSION < 104400)
-#  define BOOST_FILESYSTEM_VERSION 2
-#endif
-
 #include <boost/filesystem.hpp> 
 #include <boost/algorithm/string.hpp>
 
diff --git a/intern/cycles/util/util_cache.h b/intern/cycles/util/util_cache.h
index bfb2877a22b..343fa36817d 100644
--- a/intern/cycles/util/util_cache.h
+++ b/intern/cycles/util/util_cache.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_CACHE_H__
@@ -105,7 +105,7 @@ public:
 			return false;
 		}
 
-		if(!size)
+		if((size == 0) || (size % sizeof(T)) != 0)
 			return false;
 
 		data.resize(size/sizeof(T));
diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h
index 53b3d72de67..d3598f84b94 100644
--- a/intern/cycles/util/util_color.h
+++ b/intern/cycles/util/util_color.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_COLOR_H__
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index 79fac506b98..6b61a49fcc3 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_DEBUG_H__
diff --git a/intern/cycles/util/util_dynlib.cpp b/intern/cycles/util/util_dynlib.cpp
deleted file mode 100644
index 587cad607c8..00000000000
--- a/intern/cycles/util/util_dynlib.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License
- */
-
-#include <stdlib.h>
-
-#include "util_dynlib.h"
-
-#ifdef _WIN32
-
-#include <windows.h>
-
-CCL_NAMESPACE_BEGIN
-
-struct DynamicLibrary {
-	HMODULE module;
-};
-
-DynamicLibrary *dynamic_library_open(const char *name)
-{
-	HMODULE module = LoadLibrary(name);
-
-	if(!module)
-		return NULL;
-
-	DynamicLibrary *lib = new DynamicLibrary();
-	lib->module = module;
-
-	return lib;
-}
-
-void *dynamic_library_find(DynamicLibrary *lib, const char *name)
-{
-	return (void*)GetProcAddress(lib->module, name);
-}
-
-void dynamic_library_close(DynamicLibrary *lib)
-{
-	FreeLibrary(lib->module);
-	delete lib;
-}
-
-CCL_NAMESPACE_END
-
-#else
-
-#include <dlfcn.h>
-
-CCL_NAMESPACE_BEGIN
-
-struct DynamicLibrary {
-	void *module;
-};
-
-DynamicLibrary *dynamic_library_open(const char *name)
-{
-	void *module = dlopen(name, RTLD_NOW);
-
-	if(!module)
-		return NULL;
-
-	DynamicLibrary *lib = new DynamicLibrary();
-	lib->module = module;
-
-	return lib;
-}
-
-void *dynamic_library_find(DynamicLibrary *lib, const char *name)
-{
-	return dlsym(lib->module, name);
-}
-
-void dynamic_library_close(DynamicLibrary *lib)
-{
-	dlclose(lib->module);
-	delete lib;
-}
-
-CCL_NAMESPACE_END
-
-#endif
-
diff --git a/intern/cycles/util/util_foreach.h b/intern/cycles/util/util_foreach.h
index df3277fbf37..4f7337107b3 100644
--- a/intern/cycles/util/util_foreach.h
+++ b/intern/cycles/util/util_foreach.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_FOREACH_H__
@@ -19,8 +19,12 @@
 
 /* Use Boost to get nice foreach() loops for STL data structures. */
 
-#include <boost/foreach.hpp>
-#define foreach BOOST_FOREACH
+#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+#  define foreach(x, y) for(x : y)
+#else
+#  include <boost/foreach.hpp>
+#  define foreach BOOST_FOREACH
+#endif
 
 #endif /* __UTIL_FOREACH_H__ */
 
diff --git a/intern/cycles/util/util_function.h b/intern/cycles/util/util_function.h
index 6aa014a08a6..6d0f0b444a9 100644
--- a/intern/cycles/util/util_function.h
+++ b/intern/cycles/util/util_function.h
@@ -11,20 +11,39 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_FUNCTION_H__
 #define __UTIL_FUNCTION_H__
 
-#include <boost/bind.hpp>
-#include <boost/function.hpp>
+#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+#  include <functional>
+#else
+#  include <boost/bind.hpp>
+#  include <boost/function.hpp>
+#endif
 
 CCL_NAMESPACE_BEGIN
 
+#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+#  define function_bind std::bind
+#  define function_null nullptr
+using std::function;
+using std::placeholders::_1;
+using std::placeholders::_2;
+using std::placeholders::_3;
+using std::placeholders::_4;
+using std::placeholders::_5;
+using std::placeholders::_6;
+using std::placeholders::_7;
+using std::placeholders::_8;
+using std::placeholders::_9;
+#else
 using boost::function;
-#define function_bind boost::bind
-
+#  define function_bind boost::bind
+#  define function_null NULL
+#endif
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_FUNCTION_H__ */
diff --git a/intern/cycles/util/util_guarded_allocator.cpp b/intern/cycles/util/util_guarded_allocator.cpp
new file mode 100644
index 00000000000..8de6e254cbf
--- /dev/null
+++ b/intern/cycles/util/util_guarded_allocator.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util_guarded_allocator.h"
+#include "util_stats.h"
+
+CCL_NAMESPACE_BEGIN
+
+static Stats global_stats;  /* File-local counters fed by every GuardedAllocator. */
+
+/* Internal API: forward allocator traffic (sizes in bytes) to the counters. */
+
+void util_guarded_mem_alloc(size_t n)
+{
+	global_stats.mem_alloc(n);
+}
+
+void util_guarded_mem_free(size_t n)
+{
+	global_stats.mem_free(n);
+}
+
+/* Public API: read back the current and peak values of those counters. */
+
+size_t util_guarded_get_mem_used(void)
+{
+	return global_stats.mem_used;
+}
+
+size_t util_guarded_get_mem_peak(void)
+{
+	return global_stats.mem_peak;
+}
+
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_guarded_allocator.h b/intern/cycles/util/util_guarded_allocator.h
new file mode 100644
index 00000000000..2df717253e3
--- /dev/null
+++ b/intern/cycles/util/util_guarded_allocator.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_GUARDED_ALLOCATOR_H__
+#define __UTIL_GUARDED_ALLOCATOR_H__
+
+/* Define this in order to use Blender's guarded allocator to keep
+ * track of allocated buffers, their sizes and peak memory usage.
+ *
+ * This is usually a bad level call, but it's really handy to keep
+ * track of overall peak memory consumption during the scene
+ * synchronization step.
+ */
+#undef WITH_BLENDER_GUARDEDALLOC
+
+#include <memory>
+
+#include "util_types.h"
+
+#ifdef WITH_BLENDER_GUARDEDALLOC
+#  include "../../guardedalloc/MEM_guardedalloc.h"
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Internal use only. */
+void util_guarded_mem_alloc(size_t n);
+void util_guarded_mem_free(size_t n);
+
+/* STL-compatible allocator that reports every (de)allocation to the guarded stats. */
+template <typename T>
+class GuardedAllocator : public std::allocator<T> {
+public:
+	template<typename _Tp1>
+	struct rebind {  /* Maps this allocator onto another element type. */
+		typedef GuardedAllocator<_Tp1> other;
+	};
+
+	T *allocate(size_t n, const void *hint = 0)
+	{
+		util_guarded_mem_alloc(n * sizeof(T));  /* Counted in bytes, before allocating. */
+#ifdef WITH_BLENDER_GUARDEDALLOC
+		(void)hint;
+		return (T*)MEM_mallocN_aligned(n * sizeof(T), 16, "Cycles Alloc");  /* NOTE(review): result is not checked for NULL. */
+#else
+		return std::allocator<T>::allocate(n, hint);
+#endif
+	}
+
+	void deallocate(T *p, size_t n)
+	{
+		util_guarded_mem_free(n * sizeof(T));  /* Same byte count that allocate() reported. */
+#ifdef WITH_BLENDER_GUARDEDALLOC
+		MEM_freeN((void*)p);
+#else
+		std::allocator<T>::deallocate(p, n);
+#endif
+	}
+
+	GuardedAllocator() : std::allocator<T>() {  }
+	GuardedAllocator(const GuardedAllocator &a) : std::allocator<T>(a) { }
+	template <class U>  /* Converting copy from an allocator of another element type. */
+	GuardedAllocator(const GuardedAllocator<U> &a) : std::allocator<T>(a) { }
+	~GuardedAllocator() { }
+};
+
+/* Get memory usage and peak from the guarded STL allocator. */
+size_t util_guarded_get_mem_used(void);
+size_t util_guarded_get_mem_peak(void);
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_GUARDED_ALLOCATOR_H__ */
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index 397133618be..f4bac9888a5 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_HALF_H__
@@ -56,7 +56,7 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
 		 * assumes no negative, no nan, no inf, and sets denormal to 0 */
 		union { uint i; float f; } in;
 		float fscale = f[i] * scale;
-		in.f = (fscale > 0.0f)? ((fscale < 65500.0f)? fscale: 65500.0f): 0.0f;
+		in.f = (fscale > 0.0f)? ((fscale < 65504.0f)? fscale: 65504.0f): 0.0f;
 		int x = in.i;
 
 		int absolute = x & 0x7FFFFFFF;
@@ -68,20 +68,20 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
 	}
 #else
 	/* same as above with SSE */
-	const ssef mm_scale = ssef(scale);
-	const ssei mm_38800000 = ssei(0x38800000);
-	const ssei mm_7FFF = ssei(0x7FFF);
-	const ssei mm_7FFFFFFF = ssei(0x7FFFFFFF);
-	const ssei mm_C8000000 = ssei(0xC8000000);
-
-	ssef mm_fscale = load4f(f) * mm_scale;
-	ssei x = cast(min(max(mm_fscale, ssef(0.0f)), ssef(65500.0f)));
-	ssei absolute = x & mm_7FFFFFFF;
-	ssei Z = absolute + mm_C8000000;
-	ssei result = andnot(absolute < mm_38800000, Z); 
-	ssei rh = (result >> 13) & mm_7FFF;
-
-	_mm_storel_pi((__m64*)h, _mm_castsi128_ps(_mm_packs_epi32(rh, rh)));
+	ssef fscale = load4f(f) * scale;
+	ssef x = min(max(fscale, 0.0f), 65504.0f);
+
+#ifdef __KERNEL_AVX2__
+	ssei rpack = _mm_cvtps_ph(x, 0);
+#else
+	ssei absolute = cast(x) & 0x7FFFFFFF;
+	ssei Z = absolute + 0xC8000000;
+	ssei result = andnot(absolute < 0x38800000, Z);
+	ssei rshift = (result >> 13) & 0x7FFF;
+	ssei rpack = _mm_packs_epi32(rshift, rshift);
+#endif
+
+	_mm_storel_pi((__m64*)h, _mm_castsi128_ps(rpack));
 #endif
 }
 
diff --git a/intern/cycles/util/util_hash.h b/intern/cycles/util/util_hash.h
index edd2448efa4..3ff2802b46d 100644
--- a/intern/cycles/util/util_hash.h
+++ b/intern/cycles/util/util_hash.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_HASH_H__
@@ -53,7 +53,7 @@ static inline uint hash_string(const char *str)
 {
 	uint i = 0, c;
 
-	while ((c = *str++))
+	while((c = *str++))
 		i = i * 37 + c;
 
 	return i;
diff --git a/intern/cycles/util/util_image.h b/intern/cycles/util/util_image.h
index 4b8140e7a7a..bb8a31c6fec 100644
--- a/intern/cycles/util/util_image.h
+++ b/intern/cycles/util/util_image.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_IMAGE_H__
diff --git a/intern/cycles/util/util_list.h b/intern/cycles/util/util_list.h
index 2aa0b7381e3..6cb27e6defe 100644
--- a/intern/cycles/util/util_list.h
+++ b/intern/cycles/util/util_list.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_LIST_H__
diff --git a/intern/cycles/util/util_logging.cpp b/intern/cycles/util/util_logging.cpp
index 0722f16cf45..03041723e15 100644
--- a/intern/cycles/util/util_logging.cpp
+++ b/intern/cycles/util/util_logging.cpp
@@ -11,15 +11,63 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include <util_logging.h>
 
 #include "util_math.h"
 
+#include <stdio.h>
+#ifdef _MSC_VER
+#  define snprintf _snprintf
+#endif
+
 CCL_NAMESPACE_BEGIN
 
+void util_logging_init(const char *argv0)
+{
+#ifdef WITH_CYCLES_LOGGING
+	using CYCLES_GFLAGS_NAMESPACE::SetCommandLineOption;
+
+	/* Make it so FATAL messages are always printed to the console. */
+	char severity_fatal[32];
+	snprintf(severity_fatal, sizeof(severity_fatal), "%d",
+	         google::GLOG_FATAL);
+
+	google::InitGoogleLogging(argv0);
+	SetCommandLineOption("logtostderr", "1");
+	SetCommandLineOption("v", "0");
+	SetCommandLineOption("stderrthreshold", severity_fatal);
+	SetCommandLineOption("minloglevel", severity_fatal);
+#else
+	(void) argv0;
+#endif
+}
+
+void util_logging_start(void)
+{
+#ifdef WITH_CYCLES_LOGGING
+	using CYCLES_GFLAGS_NAMESPACE::SetCommandLineOption;
+	SetCommandLineOption("logtostderr", "1");
+	SetCommandLineOption("v", "2");
+	SetCommandLineOption("stderrthreshold", "1");
+	SetCommandLineOption("minloglevel", "0");
+#endif
+}
+
+void util_logging_verbosity_set(int verbosity)
+{
+#ifdef WITH_CYCLES_LOGGING
+	using CYCLES_GFLAGS_NAMESPACE::SetCommandLineOption;
+	char val[10];
+	snprintf(val, sizeof(val), "%d", verbosity);
+	SetCommandLineOption("v", val);
+#else
+	(void) verbosity;
+#endif
+}
+
 std::ostream& operator <<(std::ostream &os,
                           const float3 &value)
 {
diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h
index 991789e7460..7fc42ac355a 100644
--- a/intern/cycles/util/util_logging.h
+++ b/intern/cycles/util/util_logging.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_LOGGING_H__
@@ -43,7 +43,11 @@ public:
 
 #endif
 
-class float3;
+struct float3;
+
+void util_logging_init(const char *argv0);
+void util_logging_start(void);
+void util_logging_verbosity_set(int verbosity);
 
 std::ostream& operator <<(std::ostream &os,
                           const float3 &value);
diff --git a/intern/cycles/util/util_map.h b/intern/cycles/util/util_map.h
index 77500e4712d..46c2885f8b0 100644
--- a/intern/cycles/util/util_map.h
+++ b/intern/cycles/util/util_map.h
@@ -11,20 +11,45 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_MAP_H__
 #define __UTIL_MAP_H__
 
 #include <map>
-#include <boost/tr1/unordered_map.hpp>
+
+#if defined(CYCLES_TR1_UNORDERED_MAP)
+#  include <tr1/unordered_map>
+#endif
+
+#if defined(CYCLES_STD_UNORDERED_MAP) || defined(CYCLES_STD_UNORDERED_MAP_IN_TR1_NAMESPACE)
+#  include <unordered_map>
+#endif
+
+#if !defined(CYCLES_NO_UNORDERED_MAP) && !defined(CYCLES_TR1_UNORDERED_MAP) && \
+	!defined(CYCLES_STD_UNORDERED_MAP) && !defined(CYCLES_STD_UNORDERED_MAP_IN_TR1_NAMESPACE)  // NOLINT
+#  error One of: CYCLES_NO_UNORDERED_MAP, CYCLES_TR1_UNORDERED_MAP,\
+ CYCLES_STD_UNORDERED_MAP, CYCLES_STD_UNORDERED_MAP_IN_TR1_NAMESPACE must be defined!  // NOLINT
+#endif
+
 
 CCL_NAMESPACE_BEGIN
 
 using std::map;
 using std::pair;
+
+#if defined(CYCLES_NO_UNORDERED_MAP)
+typedef std::map unordered_map;
+#endif
+
+#if defined(CYCLES_TR1_UNORDERED_MAP) || defined(CYCLES_STD_UNORDERED_MAP_IN_TR1_NAMESPACE)
 using std::tr1::unordered_map;
+#endif
+
+#if defined(CYCLES_STD_UNORDERED_MAP)
+using std::unordered_map;
+#endif
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index c332e1709db..7d6dfd34e0e 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_MATH_H__
@@ -71,6 +71,13 @@ CCL_NAMESPACE_BEGIN
 #define M_SQRT2_F	((float)1.41421356237309504880) 					/* sqrt(2) */
 #endif
 
+#ifndef M_LN2_F
+#define M_LN2_F      ((float)0.6931471805599453)        /* ln(2) */
+#endif
+
+#ifndef M_LN10_F
+#define M_LN10_F     ((float)2.3025850929940457)        /* ln(10) */
+#endif
 
 /* Scalar */
 
@@ -124,6 +131,24 @@ ccl_device_inline double min(double a, double b)
 	return (a < b)? a: b;
 }
 
+/* These two functions are templated for use with register data.
+ *
+ * NOTE: Since these are CPU-only functions it is OK to use references here.
+ * But for other devices we'll need to be careful about this.
+ */
+
+template<typename T>
+ccl_device_inline T min4(const T& a, const T& b, const T& c, const T& d)
+{
+	return min(min(a,b),min(c,d));
+}
+
+template<typename T>
+ccl_device_inline T max4(const T& a, const T& b, const T& c, const T& d)
+{
+	return max(max(a,b),max(c,d));
+}
+
 #endif
 
 ccl_device_inline float min4(float a, float b, float c, float d)
@@ -150,6 +175,15 @@ ccl_device_inline float clamp(float a, float mn, float mx)
 
 #endif
 
+#ifndef __KERNEL_CUDA__
+
+ccl_device_inline float saturate(float a)
+{
+	return clamp(a, 0.0f, 1.0f);
+}
+
+#endif
+
 ccl_device_inline int float_to_int(float f)
 {
 	return (int)f;
@@ -314,6 +348,12 @@ ccl_device_inline float2 normalize_len(const float2 a, float *t)
 	return a/(*t);
 }
 
+ccl_device_inline float2 safe_normalize(const float2 a)
+{
+	float t = len(a);
+	return (t)? a/t: a;
+}
+
 ccl_device_inline bool operator==(const float2 a, const float2 b)
 {
 	return (a.x == b.x && a.y == b.y);
@@ -510,6 +550,12 @@ ccl_device_inline float3 normalize_len(const float3 a, float *t)
 	return a/(*t);
 }
 
+ccl_device_inline float3 safe_normalize(const float3 a)
+{
+	float t = len(a);
+	return (t)? a/t: a;
+}
+
 #ifndef __KERNEL_OPENCL__
 
 ccl_device_inline bool operator==(const float3 a, const float3 b)
@@ -817,6 +863,12 @@ ccl_device_inline float4 normalize(const float4 a)
 	return a/len(a);
 }
 
+ccl_device_inline float4 safe_normalize(const float4 a)
+{
+	float t = len(a);
+	return (t)? a/t: a;
+}
+
 ccl_device_inline float4 min(float4 a, float4 b)
 {
 #ifdef __KERNEL_SSE__
@@ -1395,10 +1447,9 @@ ccl_device bool ray_triangle_intersect_uv(
 	return true;
 }
 
-ccl_device bool ray_quad_intersect(
-	float3 ray_P, float3 ray_D, float ray_t,
-	float3 quad_P, float3 quad_u, float3 quad_v,
-	float3 *isect_P, float *isect_t)
+ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D, float ray_t,
+                                   float3 quad_P, float3 quad_u, float3 quad_v,
+                                   float3 *isect_P, float *isect_t)
 {
 	float3 v0 = quad_P - quad_u*0.5f - quad_v*0.5f;
 	float3 v1 = quad_P + quad_u*0.5f - quad_v*0.5f;
@@ -1414,23 +1465,52 @@ ccl_device bool ray_quad_intersect(
 }
 
 /* projections */
-ccl_device bool map_to_sphere(float *r_u, float *r_v,
-                              const float x, const float y, const float z)
+ccl_device_inline float2 map_to_tube(const float3 co)
 {
-	float len = sqrtf(x * x + y * y + z * z);
+	float len, u, v;
+	len = sqrtf(co.x * co.x + co.y * co.y);
 	if(len > 0.0f) {
-		if(UNLIKELY(x == 0.0f && y == 0.0f)) {
-			*r_u = 0.0f;  /* othwise domain error */
+		u = (1.0f - (atan2f(co.x / len, co.y / len) / M_PI_F)) * 0.5f;
+		v = (co.z + 1.0f) * 0.5f;
+	}
+	else {
+		u = v = 0.0f;
+	}
+	return make_float2(u, v);
+}
+
+ccl_device_inline float2 map_to_sphere(const float3 co)
+{
+	float l = len(co);
+	float u, v;
+	if(l > 0.0f) {
+		if(UNLIKELY(co.x == 0.0f && co.y == 0.0f)) {
+			u = 0.0f;  /* otherwise domain error */
 		}
 		else {
-			*r_u = (1.0f - atan2f(x, y) / M_PI_F) / 2.0f;
+			u = (1.0f - atan2f(co.x, co.y) / M_PI_F) / 2.0f;
 		}
-		*r_v = 1.0f - safe_acosf(z / len) / M_PI_F;
-		return true;
+		v = 1.0f - safe_acosf(co.z / l) / M_PI_F;
 	}
 	else {
-		*r_v = *r_u = 0.0f; /* to avoid un-initialized variables */
-		return false;
+		u = v = 0.0f;
+	}
+	return make_float2(u, v);
+}
+
+ccl_device_inline int util_max_axis(float3 vec)
+{
+	if(vec.x > vec.y) {
+		if(vec.x > vec.z)
+			return 0;
+		else
+			return 2;
+	}
+	else {
+		if(vec.y > vec.z)
+			return 1;
+		else
+			return 2;
 	}
 }
 
diff --git a/intern/cycles/util/util_math_fast.h b/intern/cycles/util/util_math_fast.h
new file mode 100644
index 00000000000..c1a1be603f4
--- /dev/null
+++ b/intern/cycles/util/util_math_fast.h
@@ -0,0 +1,611 @@
+/*
+ * Adapted from OpenImageIO library with this license:
+ *
+ * Copyright 2008-2014 Larry Gritz and the other authors and contributors.
+ * All Rights Reserved.
+
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * * Neither the name of the software's owners nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * (This is the Modified BSD License)
+ *
+ * A few bits here are based upon code from NVIDIA that was also released
+ * under the same modified BSD license, and marked as:
+ *    Copyright 2004 NVIDIA Corporation. All Rights Reserved.
+ *
+ * Some parts of this file were first open-sourced in Open Shading Language,
+ * then later moved here. The original copyright notice was:
+ *    Copyright (c) 2009-2014 Sony Pictures Imageworks Inc., et al.
+ *
+ * Many of the math functions were copied from or inspired by other
+ * public domain sources or open source packages with compatible licenses.
+ * The individual functions give references where applicable.
+ */
+
+#ifndef __UTIL_FAST_MATH__
+#define __UTIL_FAST_MATH__
+
+CCL_NAMESPACE_BEGIN
+
+/* TODO(sergey): Make sure it does not conflict with SSE intrinsics. */
+ccl_device_inline float madd(const float a, const float b, const float c)
+{
+	/* NOTE: In the future we may want to explicitly ask for a fused
+	 * multiply-add in a specialized version for float.
+	 *
+	 * NOTE: GCC/ICC will turn this (for float) into a FMA unless
+	 * explicitly asked not to, clang seems to leave the code alone.
+	 */
+	return a * b + c;
+}
+
+/*
+ * FAST & APPROXIMATE MATH
+ *
+ * The functions named "fast_*" provide a set of replacements to libm that
+ * are much faster at the expense of some accuracy and robust handling of
+ * extreme values. One design goal for these approximations was to avoid
+ * branches as much as possible and operate on single precision values only
+ * so that SIMD versions should be straightforward ports. We also try to
+ * implement "safe" semantics (ie: clamp to valid range where possible)
+ * natively since wrapping these inline calls in another layer would be
+ * wasteful.
+ *
+ * Some functions are named fast_safe_*: they are both a faster approximation
+ * and clamp the input domain to ensure no NaN, Inf, or divide by zero.
+ */
+
+/* Round to nearest integer, returning as an int. */
+ccl_device_inline int fast_rint(float x)
+{
+	/* used by sin/cos/tan range reduction. */
+#ifdef __KERNEL_SSE4__
+	/* Single roundps instruction on SSE4.1+ (for gcc/clang at least). */
+	return float_to_int(rintf(x));
+#else
+	/* Emulate rounding by adding/subtracting 0.5. */
+	return float_to_int(x + copysignf(0.5f, x));
+#endif
+}
+
+ccl_device float fast_sinf(float x)
+{
+	/* Very accurate argument reduction from SLEEF,
+	 * starts failing around x=262000
+	 *
+	 * Results on: [-2pi,2pi].
+	 *
+	 * Examined 2173837240 values of sin: 0.00662760244 avg ulp diff, 2 max ulp,
+	 * 1.19209e-07 max error
+	 */
+	int q = fast_rint(x * M_1_PI_F);
+	float qf = q;
+	x = madd(qf, -0.78515625f*4, x);
+	x = madd(qf, -0.00024187564849853515625f*4, x);
+	x = madd(qf, -3.7747668102383613586e-08f*4, x);
+	x = madd(qf, -1.2816720341285448015e-12f*4, x);
+	x = M_PI_2_F - (M_PI_2_F - x);  /* Crush denormals */
+	float s = x * x;
+	if((q & 1) != 0) x = -x;
+	/* This polynomial approximation has very low error on [-pi/2,+pi/2]
+	 * 1.19209e-07 max error in total over [-2pi,+2pi]. */
+	float u = 2.6083159809786593541503e-06f;
+	u = madd(u, s, -0.0001981069071916863322258f);
+	u = madd(u, s, +0.00833307858556509017944336f);
+	u = madd(u, s, -0.166666597127914428710938f);
+	u = madd(s, u * x, x);
+	/* For large x, the argument reduction can fail and the polynomial can be
+	 * evaluated with arguments outside the valid interval. Just clamp the bad
+	 * values away (setting to 0.0f means no branches need to be generated). */
+	if(fabsf(u) > 1.0f) {
+		u = 0.0f;
+	}
+	return u;
+}
+
+ccl_device float fast_cosf(float x)
+{
+	/* Same argument reduction as fast_sinf(). */
+	int q = fast_rint(x * M_1_PI_F);
+	float qf = q;
+	x = madd(qf, -0.78515625f*4, x);
+	x = madd(qf, -0.00024187564849853515625f*4, x);
+	x = madd(qf, -3.7747668102383613586e-08f*4, x);
+	x = madd(qf, -1.2816720341285448015e-12f*4, x);
+	x = M_PI_2_F - (M_PI_2_F - x);  /* Crush denormals. */
+	float s = x * x;
+	/* Polynomial from SLEEF's sincosf, max error is
+	 * 4.33127e-07 over [-2pi,2pi] (98% of values are "exact"). */
+	float u = -2.71811842367242206819355e-07f;
+	u = madd(u, s, +2.47990446951007470488548e-05f);
+	u = madd(u, s, -0.00138888787478208541870117f);
+	u = madd(u, s, +0.0416666641831398010253906f);
+	u = madd(u, s, -0.5f);
+	u = madd(u, s, +1.0f);
+	if((q & 1) != 0) {
+		u = -u;
+	}
+	if(fabsf(u) > 1.0f) {
+		u = 0.0f;
+	}
+	return u;
+}
+
+ccl_device void fast_sincosf(float x, float* sine, float* cosine)
+{
+	/* Same argument reduction as fast_sin. */
+	int q = fast_rint(x * M_1_PI_F);
+	float qf = q;
+	x = madd(qf, -0.78515625f*4, x);
+	x = madd(qf, -0.00024187564849853515625f*4, x);
+	x = madd(qf, -3.7747668102383613586e-08f*4, x);
+	x = madd(qf, -1.2816720341285448015e-12f*4, x);
+	x = M_PI_2_F - (M_PI_2_F - x); // crush denormals
+	float s = x * x;
+	/* NOTE: same exact polynomials as fast_sinf() and fast_cosf() above. */
+	if((q & 1) != 0) {
+		x = -x;
+	}
+	float su = 2.6083159809786593541503e-06f;
+	su = madd(su, s, -0.0001981069071916863322258f);
+	su = madd(su, s, +0.00833307858556509017944336f);
+	su = madd(su, s, -0.166666597127914428710938f);
+	su = madd(s, su * x, x);
+	float cu = -2.71811842367242206819355e-07f;
+	cu = madd(cu, s, +2.47990446951007470488548e-05f);
+	cu = madd(cu, s, -0.00138888787478208541870117f);
+	cu = madd(cu, s, +0.0416666641831398010253906f);
+	cu = madd(cu, s, -0.5f);
+	cu = madd(cu, s, +1.0f);
+	if((q & 1) != 0) {
+		cu = -cu;
+	}
+	if(fabsf(su) > 1.0f) {
+		su = 0.0f;
+	}
+	if(fabsf(cu) > 1.0f) {
+		cu = 0.0f;
+	}
+	*sine   = su;
+	*cosine = cu;
+}
+
+/* NOTE: this approximation is only valid on [-8192.0,+8192.0], it starts
+ * becoming really poor outside of this range because the reciprocal amplifies
+ * errors.
+ */
+ccl_device float fast_tanf(float x)
+{
+	/* Derived from SLEEF implementation.
+	 *
+	 * Note that we cannot apply the "denormal crush" trick everywhere because
+	 * we sometimes need to take the reciprocal of the polynomial
+	 */
+	int q = fast_rint(x * 2.0f * M_1_PI_F);
+	float qf = q;
+	x = madd(qf, -0.78515625f*2, x);
+	x = madd(qf, -0.00024187564849853515625f*2, x);
+	x = madd(qf, -3.7747668102383613586e-08f*2, x);
+	x = madd(qf, -1.2816720341285448015e-12f*2, x);
+	if((q & 1) == 0) {
+		/* Crush denormals (only if we aren't inverting the result later). */
+		x = M_PI_4_F - (M_PI_4_F - x);
+	}
+	float s = x * x;
+	float u = 0.00927245803177356719970703f;
+	u = madd(u, s, 0.00331984995864331722259521f);
+	u = madd(u, s, 0.0242998078465461730957031f);
+	u = madd(u, s, 0.0534495301544666290283203f);
+	u = madd(u, s, 0.133383005857467651367188f);
+	u = madd(u, s, 0.333331853151321411132812f);
+	u = madd(s, u * x, x);
+	if((q & 1) != 0) {
+		u = -1.0f / u;
+	}
+	return u;
+}
+
+/* Fast, approximate sin(x*M_PI) with maximum absolute error of 0.000918954611.
+ *
+ * Adapted from http://devmaster.net/posts/9648/fast-and-accurate-sine-cosine#comment-76773
+ */
+ccl_device float fast_sinpif(float x)
+{
+	/* Fast trick to strip the integral part off, so our domain is [-1, 1]. */
+	const float z = x - ((x + 25165824.0f) - 25165824.0f);
+	const float y = z - z * fabsf(z);
+	const float Q = 3.10396624f;
+	const float P = 3.584135056f;  /* P = 16-4*Q */
+	return y * (Q + P * fabsf(y));
+
+	/* The original article used inferior constants for Q and P and
+	 * so had max error 1.091e-3.
+	 *
+	 * The optimal value for Q was determined by exhaustive search, minimizing
+	 * the absolute numerical error relative to float(std::sin(double(phi*M_PI)))
+	 * over the interval [0,2] (which is where most of the invocations happen).
+	 *
+	 * The basic idea of this approximation starts with the coarse approximation:
+	 *      sin(pi*x) ~= f(x) =  4 * (x - x * abs(x))
+	 *
+	 * This approximation always _over_ estimates the target. On the other hand,
+	 * the curve:
+	 *      sin(pi*x) ~= f(x) * abs(f(x)) / 4
+	 *
+	 * always lies _under_ the target. Thus we can simply numerically search for
+	 * the optimal constant to LERP these curves into a more precise
+	 * approximation.
+	 *
+	 * After folding the constants together and simplifying the resulting math,
+	 * we end up with the compact implementation above.
+	 *
+	 * NOTE: this function actually computes sin(x * pi) which avoids one or two
+	 * mults in many cases and guarantees exact values at integer periods.
+	 */
+}
+
+/* Fast approximate cos(x*M_PI) with ~0.1% absolute error. */
+ccl_device_inline float fast_cospif(float x)
+{
+	return fast_sinpif(x+0.5f);
+}
+
+ccl_device float fast_acosf(float x)
+{
+	const float f = fabsf(x);
+	/* clamp and crush denormals. */
+	const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
+	/* Based on http://www.pouet.net/topic.php?which=9132&page=2
+	 * 85% accurate (ulp 0)
+	 * Examined 2130706434 values of acos: 15.2000597 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // without "denormal crush"
+	 * Examined 2130706434 values of acos: 15.2007108 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // with "denormal crush"
+	 */
+	const float a = sqrtf(1.0f - m) *
+		(1.5707963267f + m * (-0.213300989f + m *
+		                      (0.077980478f + m * -0.02164095f)));
+	return x < 0 ? M_PI_F - a : a;
+}
+
+ccl_device float fast_asinf(float x)
+{
+	/* Based on acosf approximation above.
+	 * Max error is 4.51133e-05 (ulps are higher because we are consistently off
+	 * by a little amount).
+	 */
+	const float f = fabsf(x);
+	/* Clamp and crush denormals. */
+	const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
+	const float a = M_PI_2_F - sqrtf(1.0f - m) *
+		(1.5707963267f + m * (-0.213300989f + m *
+		                      (0.077980478f + m * -0.02164095f)));
+	return copysignf(a, x);
+}
+
+ccl_device float fast_atanf(float x)
+{
+	const float a = fabsf(x);
+	const float k = a > 1.0f ? 1 / a : a;
+	const float s = 1.0f - (1.0f - k);  /* Crush denormals. */
+	const float t = s * s;
+	/* http://mathforum.org/library/drmath/view/62672.html
+	 * Examined 4278190080 values of atan: 2.36864877 avg ulp diff, 302 max ulp, 6.55651e-06 max error      // (with  denormals)
+	 * Examined 4278190080 values of atan: 171160502 avg ulp diff, 855638016 max ulp, 6.55651e-06 max error // (crush denormals)
+	 */
+	float r = s * madd(0.43157974f, t, 1.0f) /
+	              madd(madd(0.05831938f, t, 0.76443945f), t, 1.0f);
+	if(a > 1.0f) {
+		/* TODO(sergey): Is it M_PI_2_F? */
+		r = 1.570796326794896557998982f - r;
+	}
+	return copysignf(r, x);
+}
+
+ccl_device float fast_atan2f(float y, float x)
+{
+	/* Based on atan approximation above.
+	 *
+	 * The special cases around 0 and infinity were tested explicitly.
+	 *
+	 * The only case not handled correctly is x=NaN,y=0 which returns 0 instead
+	 * of nan.
+	 */
+	const float a = fabsf(x);
+	const float b = fabsf(y);
+
+	const float k = (b == 0) ? 0.0f : ((a == b) ? 1.0f : (b > a ? a / b : b / a));
+	const float s = 1.0f - (1.0f - k);  /* Crush denormals */
+	const float t = s * s;
+
+	float r = s * madd(0.43157974f, t, 1.0f) /
+	              madd(madd(0.05831938f, t, 0.76443945f), t, 1.0f);
+
+	if(b > a) {
+		/* Account for arg reduction. */
+		/* TODO(sergey): Is it M_PI_2_F? */
+		r = 1.570796326794896557998982f - r;
+	}
+	/* Test sign bit of x. */
+	if(__float_as_uint(x) & 0x80000000u) {
+		r = M_PI_F - r;
+	}
+	return copysignf(r, y);
+}
+
+/* Based on:
+ *
+ *   https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
+ *
+ */
+ccl_device float fast_log2f(float x)
+{
+	/* NOTE: clamp to avoid special cases and make result "safe" from large
+	 * negative values/nans. */
+	x = clamp(x, FLT_MIN, FLT_MAX);
+	unsigned bits = __float_as_uint(x);
+	int exponent = (int)(bits >> 23) - 127;
+	float f = __uint_as_float((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
+	/* Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]:
+	 * 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error.
+	 * ulp histogram:
+	 *  0  = 97.46%
+	 *  1  =  2.29%
+	 *  2  =  0.11%
+	 */
+	float f2 = f * f;
+	float f4 = f2 * f2;
+	float hi = madd(f, -0.00931049621349f,  0.05206469089414f);
+	float lo = madd(f,  0.47868480909345f, -0.72116591947498f);
+	hi = madd(f, hi, -0.13753123777116f);
+	hi = madd(f, hi,  0.24187369696082f);
+	hi = madd(f, hi, -0.34730547155299f);
+	lo = madd(f, lo,  1.442689881667200f);
+	return ((f4 * hi) + (f * lo)) + exponent;
+}
+
+ccl_device_inline float fast_logf(float x)
+{
+	/* Examined 2130706432 values of logf on [1.17549435e-38,3.40282347e+38]:
+	 * 0.313865375 avg ulp diff, 5148137 max ulp, 7.62939e-06 max error.
+	 */
+	return fast_log2f(x) * M_LN2_F;
+}
+
+ccl_device_inline float fast_log10(float x)
+{
+	/* Examined 2130706432 values of log10f on [1.17549435e-38,3.40282347e+38]:
+	 * 0.631237033 avg ulp diff, 4471615 max ulp, 3.8147e-06 max error.
+	 */
+	return fast_log2f(x) * M_LN2_F / M_LN10_F;
+}
+
+ccl_device float fast_logb(float x)
+{
+	/* Don't bother with denormals. */
+	x = fabsf(x);
+	x = clamp(x, FLT_MIN, FLT_MAX);
+	unsigned bits = __float_as_uint(x);
+	return (int)(bits >> 23) - 127;
+}
+
+ccl_device float fast_exp2f(float x)
+{
+	/* Clamp to safe range for final addition. */
+	x = clamp(x, -126.0f, 126.0f);
+	/* Range reduction. */
+	int m = (int)x; x -= m;
+	x = 1.0f - (1.0f - x); /* Crush denormals (does not affect max ulps!). */
+	/* 5th degree polynomial generated with sollya
+	 * Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff,
+	 * 232 max ulp.
+	 *
+	 * ulp histogram:
+	 *  0  = 87.81%
+	 *  1  =  4.18%
+	 */
+	float r = 1.33336498402e-3f;
+	r = madd(x, r, 9.810352697968e-3f);
+	r = madd(x, r, 5.551834031939e-2f);
+	r = madd(x, r, 0.2401793301105f);
+	r = madd(x, r, 0.693144857883f);
+	r = madd(x, r, 1.0f);
+	/* Multiply by 2 ^ m by adding in the exponent. */
+	/* NOTE: left-shift of negative number is undefined behavior. */
+	return __uint_as_float(__float_as_uint(r) + ((unsigned)m << 23));
+}
+
+ccl_device_inline float fast_expf(float x)
+{
+	/* Examined 2237485550 values of exp on [-87.3300018,87.3300018]:
+	 * 2.6666452 avg ulp diff, 230 max ulp.
+	 */
+	return fast_exp2f(x / M_LN2_F);
+}
+
+ccl_device_inline float fast_exp10(float x)
+{
+	/* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]:
+	 * 2.71732409 avg ulp diff, 232 max ulp.
+	 */
+	return fast_exp2f(x * M_LN10_F / M_LN2_F);
+}
+
+ccl_device_inline float fast_expm1f(float x)
+{
+	if(fabsf(x) < 1e-5f) {
+		x = 1.0f - (1.0f - x);  /* Crush denormals. */
+		return madd(0.5f, x * x, x);
+	}
+	else {
+		return fast_expf(x) - 1.0f;
+	}
+}
+
+ccl_device float fast_sinhf(float x)
+{
+	float a = fabsf(x);
+	if(a > 1.0f) {
+		/* Examined 53389559 values of sinh on [1,87.3300018]:
+		 * 33.6886442 avg ulp diff, 178 max ulp. */
+		float e = fast_expf(a);
+		return copysignf(0.5f * e - 0.5f / e, x);
+	}
+	else {
+		a = 1.0f - (1.0f - a);  /* Crush denorms. */
+		float a2 = a * a;
+		/* Degree 7 polynomial generated with sollya. */
+		/* Examined 2130706434 values of sinh on [-1,1]: 1.19209e-07 max error. */
+		float r = 2.03945513931e-4f;
+		r = madd(r, a2, 8.32990277558e-3f);
+		r = madd(r, a2, 0.1666673421859f);
+		r = madd(r * a, a2, a);
+		return copysignf(r, x);
+	}
+}
+
+ccl_device_inline float fast_coshf(float x)
+{
+	/* Examined 2237485550 values of cosh on [-87.3300018,87.3300018]:
+	 * 1.78256726 avg ulp diff, 178 max ulp.
+	 */
+	float e = fast_expf(fabsf(x));
+	return 0.5f * e + 0.5f / e;
+}
+
+ccl_device_inline float fast_tanhf(float x)
+{
+	/* Examined 4278190080 values of tanh on [-3.40282347e+38,3.40282347e+38]:
+	 * 3.12924e-06 max error.
+	 */
+	/* NOTE: ulp error is high because of sub-optimal handling around the origin. */
+	float e = fast_expf(2.0f * fabsf(x));
+	return copysignf(1.0f - 2.0f / (1.0f + e), x);
+}
+
+ccl_device float fast_safe_powf(float x, float y)
+{
+	if(y == 0) return 1.0f;  /* x^0=1 */
+	if(x == 0) return 0.0f;  /* 0^y=0 */
+	float sign = 1.0f;
+	if(x < 0.0f) {
+		/* if x is negative, only deal with integer powers
+		 * powf returns NaN for non-integers, we will return 0 instead.
+		 */
+		int ybits = __float_as_int(y) & 0x7fffffff;
+		if(ybits >= 0x4b800000) {
+			// always even int, keep positive
+		}
+		else if(ybits >= 0x3f800000) {
+			/* Bigger than 1, check. */
+			int k = (ybits >> 23) - 127;  /* Get exponent. */
+			int j =  ybits >> (23 - k);   /* Shift out possible fractional bits. */
+			if((j << (23 - k)) == ybits) {  /* rebuild number and check for a match. */
+				/* +1 for even, -1 for odd. */
+				sign = __int_as_float(0x3f800000 | (j << 31));
+			}
+			else {
+				/* Not an integer. */
+				return 0.0f;
+			}
+		}
+		else {
+			/* Not an integer. */
+			return 0.0f;
+		}
+	}
+	return sign * fast_exp2f(y * fast_log2f(fabsf(x)));
+}
+
+/* TODO(sergey): Check speed with our erf functions implementation from
+ * bsdf_microfacet.h.
+ */
+
+ccl_device_inline float fast_erff(float x)
+{
+	/* Examined 1082130433 values of erff on [0,4]: 1.93715e-06 max error. */
+	/* Abramowitz and Stegun, 7.1.28. */
+	const float a1 = 0.0705230784f;
+	const float a2 = 0.0422820123f;
+	const float a3 = 0.0092705272f;
+	const float a4 = 0.0001520143f;
+	const float a5 = 0.0002765672f;
+	const float a6 = 0.0000430638f;
+	const float a = fabsf(x);
+	const float b = 1.0f - (1.0f - a);  /* Crush denormals. */
+	const float r = madd(madd(madd(madd(madd(madd(a6, b, a5), b, a4), b, a3), b, a2), b, a1), b, 1.0f);
+	const float s = r * r;  /* ^2 */
+	const float t = s * s;  /* ^4 */
+	const float u = t * t;  /* ^8 */
+	const float v = u * u;  /* ^16 */
+	return copysignf(1.0f - 1.0f / v, x);
+}
+
+ccl_device_inline float fast_erfcf(float x)
+{
+	/* Examined 2164260866 values of erfcf on [-4,4]: 1.90735e-06 max error.
+	 *
+	 * ulp histogram:
+	 *
+	 *  0  = 80.30%
+	 */
+	return 1.0f - fast_erff(x);
+}
+
+ccl_device_inline float fast_ierff(float x)
+{
+	/* From: Approximating the erfinv function by Mike Giles. */
+	/* To avoid trouble at the limit, clamp input to 1-eps. */
+	float a = fabsf(x);
+	if(a > 0.99999994f) {
+		a = 0.99999994f;
+	}
+	float w = -fast_logf((1.0f - a) * (1.0f + a)), p;
+	if(w < 5.0f) {
+		w = w - 2.5f;
+		p =  2.81022636e-08f;
+		p = madd(p, w,  3.43273939e-07f);
+		p = madd(p, w, -3.5233877e-06f);
+		p = madd(p, w, -4.39150654e-06f);
+		p = madd(p, w,  0.00021858087f);
+		p = madd(p, w, -0.00125372503f);
+		p = madd(p, w, -0.00417768164f);
+		p = madd(p, w,  0.246640727f);
+		p = madd(p, w,  1.50140941f);
+	}
+	else {
+		w = sqrtf(w) - 3.0f;
+		p = -0.000200214257f;
+		p = madd(p, w,  0.000100950558f);
+		p = madd(p, w,  0.00134934322f);
+		p = madd(p, w, -0.00367342844f);
+		p = madd(p, w,  0.00573950773f);
+		p = madd(p, w, -0.0076224613f);
+		p = madd(p, w,  0.00943887047f);
+		p = madd(p, w,  1.00167406f);
+		p = madd(p, w,  2.83297682f);
+	}
+	return p * x;
+}
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_FAST_MATH__ */
diff --git a/intern/cycles/util/util_md5.cpp b/intern/cycles/util/util_md5.cpp
index add0d18c742..b2a32c45287 100644
--- a/intern/cycles/util/util_md5.cpp
+++ b/intern/cycles/util/util_md5.cpp
@@ -152,8 +152,8 @@ void MD5Hash::process(const uint8_t *data /*[64]*/)
 	 * a = b + ((a + F(b,c,d) + X[k] + T[i]) <<< s). */
 #define F(x, y, z) (((x) & (y)) | (~(x) & (z)))
 #define SET(a, b, c, d, k, s, Ti)\
-  t = a + F(b,c,d) + X[k] + Ti;\
-  a = ROTATE_LEFT(t, s) + b
+	t = a + F(b,c,d) + X[k] + Ti;\
+	a = ROTATE_LEFT(t, s) + b
 	/* Do the following 16 operations. */
 	SET(a, b, c, d,  0,  7,  T1);
 	SET(d, a, b, c,  1, 12,  T2);
@@ -178,8 +178,8 @@ void MD5Hash::process(const uint8_t *data /*[64]*/)
 	 * a = b + ((a + G(b,c,d) + X[k] + T[i]) <<< s). */
 #define G(x, y, z) (((x) & (z)) | ((y) & ~(z)))
 #define SET(a, b, c, d, k, s, Ti)\
-  t = a + G(b,c,d) + X[k] + Ti;\
-  a = ROTATE_LEFT(t, s) + b
+	t = a + G(b,c,d) + X[k] + Ti;\
+	a = ROTATE_LEFT(t, s) + b
 	 /* Do the following 16 operations. */
 	SET(a, b, c, d,  1,  5, T17);
 	SET(d, a, b, c,  6,  9, T18);
@@ -230,8 +230,8 @@ void MD5Hash::process(const uint8_t *data /*[64]*/)
 	 * a = b + ((a + I(b,c,d) + X[k] + T[i]) <<< s). */
 #define I(x, y, z) ((y) ^ ((x) | ~(z)))
 #define SET(a, b, c, d, k, s, Ti)\
-  t = a + I(b,c,d) + X[k] + Ti;\
-  a = ROTATE_LEFT(t, s) + b
+	t = a + I(b,c,d) + X[k] + Ti;\
+	a = ROTATE_LEFT(t, s) + b
 	 /* Do the following 16 operations. */
 	SET(a, b, c, d,  0,  6, T49);
 	SET(d, a, b, c,  7, 10, T50);
diff --git a/intern/cycles/util/util_opengl.h b/intern/cycles/util/util_opengl.h
index 04a3e039c9d..0b5462e0a09 100644
--- a/intern/cycles/util/util_opengl.h
+++ b/intern/cycles/util/util_opengl.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_OPENGL_H__
@@ -20,7 +20,12 @@
 /* OpenGL header includes, used everywhere we use OpenGL, to deal with
  * platform differences in one central place. */
 
-#include <GL/glew.h>
+#ifdef WITH_GLEW_MX
+#  include "glew-mx.h"
+#else
+#  include <GL/glew.h>
+#  define mxCreateContext() glewInit()
+#  define mxMakeCurrentContext(x) (x)
+#endif
 
 #endif /* __UTIL_OPENGL_H__ */
-
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index 2feb3d6ab7e..c951c35fc76 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_OPTIMIZATION_H__
@@ -102,34 +102,16 @@
 #ifdef _MSC_VER
 #include <intrin.h>
 #else
-
-#ifdef __KERNEL_SSE2__
-#include <xmmintrin.h> /* SSE 1 */
-#include <emmintrin.h> /* SSE 2 */
-#endif
-
-#ifdef __KERNEL_SSE3__
-#include <pmmintrin.h> /* SSE 3 */
-#endif
-
-#ifdef __KERNEL_SSSE3__
-#include <tmmintrin.h> /* SSSE 3 */
-#endif
-
-#ifdef __KERNEL_SSE41__
-#include <smmintrin.h> /* SSE 4.1 */
-#endif
-
-#ifdef __KERNEL_AVX__
-#include <immintrin.h> /* AVX */
-#endif
-
+#include <x86intrin.h>
 #endif
 
 #else
 
 /* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
  * Since we can't avoid including <windows.h>, better only include that */
+#define NOGDI
+#define NOMINMAX
+#define WIN32_LEAN_AND_MEAN
 #include <windows.h>
 
 #endif
diff --git a/intern/cycles/util/util_param.h b/intern/cycles/util/util_param.h
index ef20ff0fcd2..69bcbf80a78 100644
--- a/intern/cycles/util/util_param.h
+++ b/intern/cycles/util/util_param.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_PARAM_H__
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index 85d19b6a325..e8f1ec81763 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "util_debug.h"
@@ -25,12 +25,6 @@ OIIO_NAMESPACE_USING
 
 #include <stdio.h>
 
-#include <boost/version.hpp>
-
-#if (BOOST_VERSION < 104400)
-#  define BOOST_FILESYSTEM_VERSION 2
-#endif
-
 #include <boost/filesystem.hpp> 
 #include <boost/algorithm/string.hpp>
 
@@ -41,21 +35,31 @@ static string cached_user_path = "";
 
 static boost::filesystem::path to_boost(const string& path)
 {
-#ifdef _MSC_VER
-	std::wstring path_utf16 = Strutil::utf8_to_utf16(path.c_str());
-	return boost::filesystem::path(path_utf16.c_str());
-#else
 	return boost::filesystem::path(path.c_str());
-#endif
 }
 
 static string from_boost(const boost::filesystem::path& path)
 {
-#ifdef _MSC_VER
-	return Strutil::utf16_to_utf8(path.wstring().c_str());
-#else
 	return path.string().c_str();
-#endif
+}
+
+static char *path_specials(const string& sub)  /* environment override for "shader"/"kernel", else NULL */
+{
+	static bool env_init = false;  /* getenv() is only consulted on the first call */
+	static char *env_shader_path;
+	static char *env_kernel_path;
+	if(!env_init) {
+		env_shader_path = getenv("CYCLES_SHADER_PATH");
+		env_kernel_path = getenv("CYCLES_KERNEL_PATH");
+		env_init = true;
+	}
+	if(env_shader_path != NULL && sub == "shader") {
+		return env_shader_path;
+	}
+	else if(env_kernel_path != NULL && sub == "kernel") {  /* test the kernel variable, not the shader one */
+		return env_kernel_path;
+	}
+	return NULL;
 }
 
 void path_init(const string& path, const string& user_path)
@@ -71,6 +75,10 @@ void path_init(const string& path, const string& user_path)
 
 string path_get(const string& sub)
 {
+	char *special = path_specials(sub);
+	if(special != NULL)
+		return special;
+
 	if(cached_path == "")
 		cached_path = path_dirname(Sysutil::this_program_path());
 
@@ -87,11 +95,7 @@ string path_user_get(const string& sub)
 
 string path_filename(const string& path)
 {
-#if (BOOST_FILESYSTEM_VERSION == 2)
-	return to_boost(path).filename();
-#else
 	return from_boost(to_boost(path).filename());
-#endif
 }
 
 string path_dirname(const string& path)
@@ -259,14 +263,7 @@ string path_source_replace_includes(const string& source_, const string& path)
 
 FILE *path_fopen(const string& path, const string& mode)
 {
-#ifdef _WIN32
-	std::wstring path_utf16 = Strutil::utf8_to_utf16(path);
-	std::wstring mode_utf16 = Strutil::utf8_to_utf16(mode);
-
-	return _wfopen(path_utf16.c_str(), mode_utf16.c_str());
-#else
 	return fopen(path.c_str(), mode.c_str());
-#endif
 }
 
 void path_cache_clear_except(const string& name, const set<string>& except)
@@ -277,11 +274,7 @@ void path_cache_clear_except(const string& name, const set<string>& except)
 		boost::filesystem::directory_iterator it(dir), it_end;
 
 		for(; it != it_end; it++) {
-#if (BOOST_FILESYSTEM_VERSION == 2)
-			string filename = from_boost(it->path().filename());
-#else
 			string filename = from_boost(it->path().filename().string());
-#endif
 
 			if(boost::starts_with(filename, name))
 				if(except.find(filename) == except.end())
diff --git a/intern/cycles/util/util_path.h b/intern/cycles/util/util_path.h
index fd9ea11740d..b81d71d1c0f 100644
--- a/intern/cycles/util/util_path.h
+++ b/intern/cycles/util/util_path.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_PATH_H__
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index e721a3f5047..0b35142ddb3 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_PROGRESS_H__
@@ -38,15 +38,18 @@ public:
 		sample = 0;
 		start_time = time_dt();
 		total_time = 0.0f;
+		render_time = 0.0f;
 		tile_time = 0.0f;
 		status = "Initializing";
 		substatus = "";
 		sync_status = "";
 		sync_substatus = "";
-		update_cb = NULL;
+		update_cb = function_null;
 		cancel = false;
 		cancel_message = "";
-		cancel_cb = NULL;
+		error = false;
+		error_message = "";
+		cancel_cb = function_null;
 	}
 
 	Progress(Progress& progress)
@@ -59,7 +62,7 @@ public:
 		thread_scoped_lock lock(progress.progress_mutex);
 
 		progress.get_status(status, substatus);
-		progress.get_tile(tile, total_time, tile_time);
+		progress.get_tile(tile, total_time, render_time, tile_time);
 
 		sample = progress.get_sample();
 
@@ -71,7 +74,9 @@ public:
 		tile = 0;
 		sample = 0;
 		start_time = time_dt();
+		render_start_time = time_dt();
 		total_time = 0.0f;
+		render_time = 0.0f;
 		tile_time = 0.0f;
 		status = "Initializing";
 		substatus = "";
@@ -79,6 +84,8 @@ public:
 		sync_substatus = "";
 		cancel = false;
 		cancel_message = "";
+		error = false;
+		error_message = "";
 	}
 
 	/* cancel */
@@ -103,11 +110,33 @@ public:
 		return cancel_message;
 	}
 
-	void set_cancel_callback(boost::function<void(void)> function)
+	void set_cancel_callback(function<void(void)> function)
 	{
 		cancel_cb = function;
 	}
 
+	/* error */
+	void set_error(const string& error_message_)
+	{
+		thread_scoped_lock lock(progress_mutex);
+		error_message = error_message_;
+		error = true;
+		/* If error happens we also stop rendering. */
+		cancel_message = error_message_;
+		cancel = true;
+	}
+
+	bool get_error()
+	{
+		return error;
+	}
+
+	string get_error_message()
+	{
+		thread_scoped_lock lock(progress_mutex);
+		return error_message;
+	}
+
 	/* tile and timing information */
 
 	void set_start_time(double start_time_)
@@ -117,24 +146,39 @@ public:
 		start_time = start_time_;
 	}
 
+	void set_render_start_time(double render_start_time_)
+	{
+		thread_scoped_lock lock(progress_mutex);
+
+		render_start_time = render_start_time_;
+	}
+
 	void set_tile(int tile_, double tile_time_)
 	{
 		thread_scoped_lock lock(progress_mutex);
 
 		tile = tile_;
 		total_time = time_dt() - start_time;
+		render_time = time_dt() - render_start_time;
 		tile_time = tile_time_;
 	}
 
-	void get_tile(int& tile_, double& total_time_, double& tile_time_)
+	void get_tile(int& tile_, double& total_time_, double& render_time_, double& tile_time_)
 	{
 		thread_scoped_lock lock(progress_mutex);
 
 		tile_ = tile;
 		total_time_ = (total_time > 0.0)? total_time: 0.0;
+		render_time_ = (render_time > 0.0)? render_time: 0.0;
 		tile_time_ = tile_time;
 	}
 
+	void get_time(double& total_time_, double& render_time_)
+	{	/* NOTE(review): unlike get_tile(), no progress_mutex lock is taken here -- confirm this is intended. */
+		total_time_ = (total_time > 0.0)? total_time: 0.0;  /* values below zero are reported as 0.0 */
+		render_time_ = (render_time > 0.0)? render_time: 0.0;
+	}
+
 	void reset_sample()
 	{
 		thread_scoped_lock lock(progress_mutex);
@@ -169,6 +213,7 @@ public:
 			status = status_;
 			substatus = substatus_;
 			total_time = time_dt() - start_time;
+			render_time = time_dt() - render_start_time;
 		}
 
 		set_update();
@@ -180,6 +225,7 @@ public:
 			thread_scoped_lock lock(progress_mutex);
 			substatus = substatus_;
 			total_time = time_dt() - start_time;
+			render_time = time_dt() - render_start_time;
 		}
 
 		set_update();
@@ -192,6 +238,7 @@ public:
 			sync_status = status_;
 			sync_substatus = substatus_;
 			total_time = time_dt() - start_time;
+			render_time = time_dt() - render_start_time;
 		}
 
 		set_update();
@@ -204,6 +251,7 @@ public:
 			thread_scoped_lock lock(progress_mutex);
 			sync_substatus = substatus_;
 			total_time = time_dt() - start_time;
+			render_time = time_dt() - render_start_time;
 		}
 
 		set_update();
@@ -233,7 +281,7 @@ public:
 		}
 	}
 
-	void set_update_callback(boost::function<void(void)> function)
+	void set_update_callback(function<void(void)> function)
 	{
 		update_cb = function;
 	}
@@ -241,14 +289,14 @@ public:
 protected:
 	thread_mutex progress_mutex;
 	thread_mutex update_mutex;
-	boost::function<void(void)> update_cb;
-	boost::function<void(void)> cancel_cb;
+	function<void(void)> update_cb;
+	function<void(void)> cancel_cb;
 
 	int tile;    /* counter for rendered tiles */
 	int sample;  /* counter of rendered samples, global for all tiles */
 
-	double start_time;
-	double total_time;
+	double start_time, render_start_time;
+	double total_time, render_time;
 	double tile_time;
 
 	string status;
@@ -259,6 +307,9 @@ protected:
 
 	volatile bool cancel;
 	string cancel_message;
+
+	volatile bool error;
+	string error_message;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_set.h b/intern/cycles/util/util_set.h
index 6078114e714..b3cb8dd8af5 100644
--- a/intern/cycles/util/util_set.h
+++ b/intern/cycles/util/util_set.h
@@ -11,20 +11,26 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_SET_H__
 #define __UTIL_SET_H__
 
 #include <set>
-#include <boost/tr1/unordered_set.hpp>
-
+#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+#  include <unordered_set>
+#else
+#  include <boost/tr1/unordered_set.hpp>
+#endif
 CCL_NAMESPACE_BEGIN
 
 using std::set;
+#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+using std::unordered_set;
+#else
 using std::tr1::unordered_set;
-
+#endif
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_SET_H__ */
diff --git a/intern/cycles/util/util_simd.cpp b/intern/cycles/util/util_simd.cpp
index 0436823e62a..eb9e32800e1 100644
--- a/intern/cycles/util/util_simd.cpp
+++ b/intern/cycles/util/util_simd.cpp
@@ -12,7 +12,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifdef WITH_KERNEL_SSE2
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 39506a6359b..a1c35b7174d 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -12,7 +12,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_SIMD_TYPES_H__
@@ -58,8 +58,12 @@ __forceinline operator          int      ( ) const { return std::numeric_limits<
 /* Intrinsics Functions */
 
 #if defined(__BMI__) && defined(__GNUC__)
-#define _tzcnt_u32 __tzcnt_u32
-#define _tzcnt_u64 __tzcnt_u64
+#  ifndef _tzcnt_u32
+#    define _tzcnt_u32 __tzcnt_u32
+#  endif
+#  ifndef _tzcnt_u64
+#    define _tzcnt_u64 __tzcnt_u64
+#  endif
 #endif
 
 #if defined(__LZCNT__)
@@ -133,7 +137,7 @@ __forceinline int clz(const int x)
 #if defined(__KERNEL_AVX2__)
   return _lzcnt_u32(x);
 #else
-  if (UNLIKELY(x == 0)) return 32;
+  if(UNLIKELY(x == 0)) return 32;
   return 31 - __bsr(x);    
 #endif
 }
@@ -286,7 +290,7 @@ __forceinline int clz(const int x)
 #if defined(__KERNEL_AVX2__)
   return _lzcnt_u32(x);
 #else
-  if (UNLIKELY(x == 0)) return 32;
+  if(UNLIKELY(x == 0)) return 32;
   return 31 - __bsr(x);    
 #endif
 }
@@ -358,7 +362,7 @@ __forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) {
   char* _r = (char*)(&rvalue + 1);
   char* _v = (char*)(& value + 1);
   char* _i = (char*)(& input + 1);
-  for ( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32*)(_r + i)) = *((int32*)(_v + i))*  *((int32*)(_i + i));
+  for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32*)(_r + i)) = *((int32*)(_v + i))*  *((int32*)(_i + i));
   return rvalue;
 }
 
diff --git a/intern/cycles/util/util_sseb.h b/intern/cycles/util/util_sseb.h
index be510256dd3..6e669701f3b 100644
--- a/intern/cycles/util/util_sseb.h
+++ b/intern/cycles/util/util_sseb.h
@@ -12,7 +12,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_SSEB_H__
@@ -119,14 +119,29 @@ template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const sseb sh
 	return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0));
 }
 
+template<> __forceinline const sseb shuffle<0, 1, 0, 1>( const sseb& a ) {
+	return _mm_movelh_ps(a, a);
+}
+
+template<> __forceinline const sseb shuffle<2, 3, 2, 3>( const sseb& a ) {
+	return _mm_movehl_ps(a, a);
+}
+
 template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const sseb shuffle( const sseb& a, const sseb& b ) {
 	return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
 }
 
+template<> __forceinline const sseb shuffle<0, 1, 0, 1>( const sseb& a, const sseb& b ) {
+	return _mm_movelh_ps(a, b);
+}
+
+template<> __forceinline const sseb shuffle<2, 3, 2, 3>( const sseb& a, const sseb& b ) {
+	return _mm_movehl_ps(b, a);
+}
+
 #if defined(__KERNEL_SSE3__)
 template<> __forceinline const sseb shuffle<0, 0, 2, 2>( const sseb& a ) { return _mm_moveldup_ps(a); }
 template<> __forceinline const sseb shuffle<1, 1, 3, 3>( const sseb& a ) { return _mm_movehdup_ps(a); }
-template<> __forceinline const sseb shuffle<0, 1, 0, 1>( const sseb& a ) { return _mm_castpd_ps(_mm_movedup_pd (a)); }
 #endif
 
 #if defined(__KERNEL_SSE41__)
@@ -153,6 +168,16 @@ __forceinline bool none      ( const sseb& b ) { return _mm_movemask_ps(b) == 0x
 
 __forceinline size_t movemask( const sseb& a ) { return _mm_movemask_ps(a); }
 
+////////////////////////////////////////////////////////////////////////////////
+/// Debug Functions
+////////////////////////////////////////////////////////////////////////////////
+
+ccl_device_inline void print_sseb(const char *label, const sseb &a)
+{
+	printf("%s: %d %d %d %d\n",  /* label, then a[0]..a[3] printed as integers */
+	       label, a[0], a[1], a[2], a[3]);
+}
+
 #endif
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h
index f4236cc616e..e625fa63568 100644
--- a/intern/cycles/util/util_ssef.h
+++ b/intern/cycles/util/util_ssef.h
@@ -12,7 +12,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_SSEF_H__
@@ -119,6 +119,9 @@ __forceinline const ssef operator^(const ssef& a, const ssei& b) { return _mm_xo
 __forceinline const ssef operator&(const ssef& a, const ssef& b) { return _mm_and_ps(a.m128,b.m128); }
 __forceinline const ssef operator&(const ssef& a, const ssei& b) { return _mm_and_ps(a.m128,_mm_castsi128_ps(b.m128)); }
 
+__forceinline const ssef operator|(const ssef& a, const ssef& b) { return _mm_or_ps(a.m128,b.m128); }
+__forceinline const ssef operator|(const ssef& a, const ssei& b) { return _mm_or_ps(a.m128,_mm_castsi128_ps(b.m128)); }
+
 __forceinline const ssef andnot(const ssef& a, const ssef& b) { return _mm_andnot_ps(a.m128,b.m128); }
 
 __forceinline const ssef min(const ssef& a, const ssef& b) { return _mm_min_ps(a.m128,b.m128); }
@@ -159,8 +162,8 @@ __forceinline const ssef nmsub(const ssef& a, const ssef& b, const ssef& c) { re
 #else
 __forceinline const ssef madd (const ssef& a, const ssef& b, const ssef& c) { return a*b+c; }
 __forceinline const ssef msub (const ssef& a, const ssef& b, const ssef& c) { return a*b-c; }
-__forceinline const ssef nmadd(const ssef& a, const ssef& b, const ssef& c) { return -a*b-c;}
-__forceinline const ssef nmsub(const ssef& a, const ssef& b, const ssef& c) { return c-a*b; }
+__forceinline const ssef nmadd(const ssef& a, const ssef& b, const ssef& c) { return c-a*b;}
+__forceinline const ssef nmsub(const ssef& a, const ssef& b, const ssef& c) { return -a*b-c; }
 #endif
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -267,10 +270,30 @@ template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const ssef sh
 	return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0)));
 }
 
+template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef& a) {
+	return _mm_movelh_ps(a, a);
+}
+
+template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef& a) {
+	return _mm_movehl_ps(a, a);
+}
+
 template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const ssef shuffle(const ssef& a, const ssef& b) {
 	return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
 }
 
+template<size_t i0> __forceinline const ssef shuffle(const ssef& a, const ssef& b) {
+	return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0));
+}
+
+template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef& a, const ssef& b) {
+	return _mm_movelh_ps(a, b);
+}
+
+template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef& a, const ssef& b) {
+	return _mm_movehl_ps(b, a);
+}
+
 #if defined(__KERNEL_SSSE3__)
 __forceinline const ssef shuffle8(const ssef& a, const ssei& shuf) { 
 	return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf)); 
@@ -280,18 +303,19 @@ __forceinline const ssef shuffle8(const ssef& a, const ssei& shuf) {
 #if defined(__KERNEL_SSE3__)
 template<> __forceinline const ssef shuffle<0, 0, 2, 2>(const ssef& b) { return _mm_moveldup_ps(b); }
 template<> __forceinline const ssef shuffle<1, 1, 3, 3>(const ssef& b) { return _mm_movehdup_ps(b); }
-template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef& b) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))); }
 #endif
 
 template<size_t i0> __forceinline const ssef shuffle(const ssef& b) {
 	return shuffle<i0,i0,i0,i0>(b);
 }
 
-#if defined(__KERNEL_SSE41__) && !defined(__GNUC__)
-template<size_t i> __forceinline float extract  (const ssef& a) { return _mm_cvtss_f32(_mm_extract_ps(a,i)); }
-#else
-template<size_t i> __forceinline float extract  (const ssef& a) { return _mm_cvtss_f32(shuffle<i,i,i,i>(a)); }
+#if defined(__KERNEL_AVX__)
+__forceinline const ssef shuffle(const ssef& a, const ssei& shuf) {
+	return _mm_permutevar_ps(a, shuf);
+}
 #endif
+
+template<size_t i> __forceinline float extract   (const ssef& a) { return _mm_cvtss_f32(shuffle<i,i,i,i>(a)); }
 template<>         __forceinline float extract<0>(const ssef& a) { return _mm_cvtss_f32(a); }
 
 #if defined(__KERNEL_SSE41__)
@@ -347,6 +371,8 @@ __forceinline size_t select_max(const ssef& v) { return __bsf(movemask(v == vred
 __forceinline size_t select_min(const sseb& valid, const ssef& v) { const ssef a = select(valid,v,ssef(pos_inf)); return __bsf(movemask(valid &(a == vreduce_min(a)))); }
 __forceinline size_t select_max(const sseb& valid, const ssef& v) { const ssef a = select(valid,v,ssef(neg_inf)); return __bsf(movemask(valid &(a == vreduce_max(a)))); }
 
+__forceinline size_t movemask( const ssef& a ) { return _mm_movemask_ps(a); }
+
 ////////////////////////////////////////////////////////////////////////////////
 /// Memory load and store operations
 ////////////////////////////////////////////////////////////////////////////////
@@ -580,6 +606,20 @@ ccl_device_inline const ssef set_sign_bit(const ssef &a)
 	return a ^ cast(ssei(S1 << 31, S2 << 31, S3 << 31, S4 << 31));
 }
 
+////////////////////////////////////////////////////////////////////////////////
+/// Debug Functions
+////////////////////////////////////////////////////////////////////////////////
+
+ccl_device_inline void print_ssef(const char *label, const ssef &a)
+{
+	printf("%s: %.8f %.8f %.8f %.8f\n",
+	       label,
+	       (double)a[0],
+	       (double)a[1],
+	       (double)a[2],
+	       (double)a[3]);
+}
+
 #endif
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_ssei.h b/intern/cycles/util/util_ssei.h
index 5f5a8686e35..5f62569268c 100644
--- a/intern/cycles/util/util_ssei.h
+++ b/intern/cycles/util/util_ssei.h
@@ -12,7 +12,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_SSEI_H__
@@ -190,8 +190,8 @@ __forceinline const ssei select( const int mask, const ssei& t, const ssei& f )
 // Movement/Shifting/Shuffling Functions
 ////////////////////////////////////////////////////////////////////////////////
 
-__forceinline ssei unpacklo( const ssei& a, const ssei& b ) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a.m128), _mm_castsi128_ps(b.m128))); }
-__forceinline ssei unpackhi( const ssei& a, const ssei& b ) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a.m128), _mm_castsi128_ps(b.m128))); }
+__forceinline ssei unpacklo( const ssei& a, const ssei& b ) { return _mm_unpacklo_epi32(a, b); }
+__forceinline ssei unpackhi( const ssei& a, const ssei& b ) { return _mm_unpackhi_epi32(a, b); }
 
 template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const ssei shuffle( const ssei& a ) {
 	return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0));
@@ -201,12 +201,6 @@ template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const ssei sh
 	return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
 }
 
-#if defined(__KERNEL_SSE3__)
-template<> __forceinline const ssei shuffle<0, 0, 2, 2>( const ssei& a ) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(a))); }
-template<> __forceinline const ssei shuffle<1, 1, 3, 3>( const ssei& a ) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(a))); }
-template<> __forceinline const ssei shuffle<0, 1, 0, 1>( const ssei& a ) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(a))); }
-#endif
-
 template<size_t i0> __forceinline const ssei shuffle( const ssei& b ) {
 	return shuffle<i0,i0,i0,i0>(b);
 }
@@ -286,6 +280,16 @@ __forceinline void store4i_nt(void* ptr, const ssei& v) {
 #endif
 }
 
+////////////////////////////////////////////////////////////////////////////////
+/// Debug Functions
+////////////////////////////////////////////////////////////////////////////////
+
+ccl_device_inline void print_ssei(const char *label, const ssei &a)
+{
+	printf("%s: %d %d %d %d\n",  /* label, then a[0]..a[3] printed as integers */
+	       label, a[0], a[1], a[2], a[3]);
+}
+
 #endif
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h
index 8758b823084..ce27067dc5e 100644
--- a/intern/cycles/util/util_stats.h
+++ b/intern/cycles/util/util_stats.h
@@ -11,12 +11,14 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_STATS_H__
 #define __UTIL_STATS_H__
 
+#include "util_atomic.h"
+
 CCL_NAMESPACE_BEGIN
 
 class Stats {
@@ -24,14 +26,13 @@ public:
 	Stats() : mem_used(0), mem_peak(0) {}
 
 	void mem_alloc(size_t size) {
-		mem_used += size;
-		if(mem_used > mem_peak)
-			mem_peak = mem_used;
+		atomic_add_z(&mem_used, size);
+		atomic_update_max_z(&mem_peak, mem_used);
 	}
 
 	void mem_free(size_t size) {
 		assert(mem_used >= size);
-		mem_used -= size;
+		atomic_sub_z(&mem_used, size);
 	}
 
 	size_t mem_used;
diff --git a/intern/cycles/util/util_string.cpp b/intern/cycles/util/util_string.cpp
index f38d8d3282f..66856dd8331 100644
--- a/intern/cycles/util/util_string.cpp
+++ b/intern/cycles/util/util_string.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include <stdarg.h>
@@ -105,5 +105,22 @@ string string_strip(const string& s)
 
 }
 
+void string_replace(string& haystack, const string& needle, const string& other)
+{
+	/* Replace every needle in place; resume after the inserted text so it is never rescanned. Empty needle: no-op. */
+	size_t i = needle.empty()? string::npos: haystack.find(needle);
+	for(; i != string::npos; i = haystack.find(needle, i + other.length()))
+		haystack.replace(i, needle.length(), other);
+}
+
+string string_remove_trademark(const string &s)
+{
+	string result = s;
+	string_replace(result, "(TM)", "");
+	string_replace(result, "(R)", "");
+
+	return string_strip(result);
+}
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/util/util_string.h b/intern/cycles/util/util_string.h
index 6808f085834..6cb8d8df1e1 100644
--- a/intern/cycles/util/util_string.h
+++ b/intern/cycles/util/util_string.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_STRING_H__
@@ -40,8 +40,10 @@ string string_printf(const char *format, ...) PRINTF_ATTRIBUTE;
 
 bool string_iequals(const string& a, const string& b);
 void string_split(vector<string>& tokens, const string& str, const string& separators = "\t ");
+void string_replace(string& haystack, const string& needle, const string& other);
 bool string_endswith(const string& s, const char *end);
 string string_strip(const string& s);
+string string_remove_trademark(const string& s);
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index 7c0445577e2..cc88320b68e 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -11,11 +11,12 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "util_system.h"
 #include "util_types.h"
+#include "util_string.h"
 
 #ifdef _WIN32
 #if(!defined(FREE_WINDOWS))
@@ -75,14 +76,6 @@ static void __cpuid(int data[4], int selector)
 }
 #endif
 
-static void replace_string(string& haystack, const string& needle, const string& other)
-{
-	size_t i;
-
-	while((i = haystack.find(needle)) != string::npos)
-		haystack.replace(i, needle.length(), other);
-}
-
 string system_cpu_brand_string()
 {
 	char buf[48];
@@ -98,10 +91,7 @@ string system_cpu_brand_string()
 		string brand = buf;
 
 		/* make it a bit more presentable */
-		replace_string(brand, "(TM)", "");
-		replace_string(brand, "(R)", "");
-
-		brand = string_strip(brand);
+		brand = string_remove_trademark(brand);
 
 		return brand;
 	}
@@ -127,6 +117,7 @@ struct CPUCapabilities {
 	bool sse42;
 	bool sse4a;
 	bool avx;
+	bool f16c;
 	bool avx2;
 	bool xop;
 	bool fma3;
@@ -135,24 +126,42 @@ struct CPUCapabilities {
 	bool bmi2;
 };
 
+static void system_cpu_capabilities_override(CPUCapabilities *caps)
+{
+	/* Clear each capability whose CYCLES_CPU_NO_* variable is set; only capabilities which affect the Cycles kernel are listed. */
+	if(getenv("CYCLES_CPU_NO_AVX2")) {
+		caps->avx2 = false;
+	}
+	if(getenv("CYCLES_CPU_NO_AVX")) {
+		caps->avx = false;
+	}
+	if(getenv("CYCLES_CPU_NO_SSE41")) {
+		caps->sse41 = false;
+	}
+	if(getenv("CYCLES_CPU_NO_SSE3")) {
+		caps->sse3 = false;
+	}
+	if(getenv("CYCLES_CPU_NO_SSE2")) {
+		caps->sse2 = false;
+	}
+	if(getenv("CYCLES_CPU_NO_SSE")) {
+		caps->sse = false;
+	}
+}
+
 static CPUCapabilities& system_cpu_capabilities()
 {
 	static CPUCapabilities caps;
 	static bool caps_init = false;
 
 	if(!caps_init) {
-		int result[4], num; //, num_ex;
+		int result[4], num;
 
 		memset(&caps, 0, sizeof(caps));
 
 		__cpuid(result, 0);
 		num = result[0];
 
-#if 0
-		__cpuid(result, 0x80000000);
-		num_ex = result[0];
-#endif
-
 		if(num >= 1) {
 			__cpuid(result, 0x00000001);
 			caps.mmx = (result[3] & ((int)1 << 23)) != 0;
@@ -184,21 +193,15 @@ static CPUCapabilities& system_cpu_capabilities()
 				caps.avx = (xcr_feature_mask & 0x6) == 0x6;
 			}
 
+			caps.f16c = (result[2] & ((int)1 << 29)) != 0;
+
 			__cpuid(result, 0x00000007);
 			caps.bmi1 = (result[1] & ((int)1 << 3)) != 0;
 			caps.bmi2 = (result[1] & ((int)1 << 8)) != 0;
 			caps.avx2 = (result[1] & ((int)1 << 5)) != 0;
 		}
 
-#if 0
-		if(num_ex >= 0x80000001) {
-			__cpuid(result, 0x80000001);
-			caps.x64 = (result[3] & ((int)1 << 29)) != 0;
-			caps.sse4a = (result[2] & ((int)1 <<  6)) != 0;
-			caps.fma4 = (result[2] & ((int)1 << 16)) != 0;
-			caps.xop = (result[2] & ((int)1 << 11)) != 0;
-		}
-#endif
+		system_cpu_capabilities_override(&caps);
 
 		caps_init = true;
 	}
@@ -232,7 +235,7 @@ bool system_cpu_support_avx()
 bool system_cpu_support_avx2()
 {
 	CPUCapabilities& caps = system_cpu_capabilities();
-	return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx && caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2;
+	return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx && caps.f16c && caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2;
 }
 #else
 
diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h
index 0e8868c7dfc..4e7e00f85fd 100644
--- a/intern/cycles/util/util_system.h
+++ b/intern/cycles/util/util_system.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_SYSTEM_H__
diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
index 14a81ecbb05..d56553d1d4a 100644
--- a/intern/cycles/util/util_task.cpp
+++ b/intern/cycles/util/util_task.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "util_debug.h"
@@ -237,7 +237,7 @@ bool TaskScheduler::thread_wait_pop(Entry& entry)
 	return true;
 }
 
-void TaskScheduler::thread_run(int thread_id)
+void TaskScheduler::thread_run(int /*thread_id*/)
 {
 	Entry entry;
 
diff --git a/intern/cycles/util/util_task.h b/intern/cycles/util/util_task.h
index 42a1e2f5a58..debcff3b776 100644
--- a/intern/cycles/util/util_task.h
+++ b/intern/cycles/util/util_task.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_TASK_H__
@@ -27,7 +27,7 @@ class Task;
 class TaskPool;
 class TaskScheduler;
 
-typedef boost::function<void(void)> TaskRunFunction;
+typedef function<void(void)> TaskRunFunction;
 
 /* Task
  *
diff --git a/intern/cycles/util/util_thread.h b/intern/cycles/util/util_thread.h
index f2698d043fb..9c19235d41d 100644
--- a/intern/cycles/util/util_thread.h
+++ b/intern/cycles/util/util_thread.h
@@ -11,13 +11,20 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_THREAD_H__
 #define __UTIL_THREAD_H__
 
-#include <boost/thread.hpp>
+#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+#  include <thread>
+#  include <mutex>
+#  include <condition_variable>
+#  include <functional>
+#else
+#  include <boost/thread.hpp>
+#endif
 #include <pthread.h>
 #include <queue>
 
@@ -25,18 +32,24 @@
 
 CCL_NAMESPACE_BEGIN
 
+#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+typedef std::mutex thread_mutex;
+typedef std::unique_lock<std::mutex> thread_scoped_lock;
+typedef std::condition_variable thread_condition_variable;
+#else
 /* use boost for mutexes */
-
 typedef boost::mutex thread_mutex;
 typedef boost::mutex::scoped_lock thread_scoped_lock;
 typedef boost::condition_variable thread_condition_variable;
+#endif
 
 /* own pthread based implementation, to avoid boost version conflicts with
  * dynamically loaded blender plugins */
 
 class thread {
 public:
-	thread(boost::function<void(void)> run_cb_)
+	thread(function<void(void)> run_cb_)
+
 	{
 		joined = false;
 		run_cb = run_cb_;
@@ -63,7 +76,7 @@ public:
 	}
 
 protected:
-	boost::function<void(void)> run_cb;
+	function<void(void)> run_cb;
 	pthread_t pthread_id;
 	bool joined;
 };
diff --git a/intern/cycles/util/util_time.cpp b/intern/cycles/util/util_time.cpp
index dd91b024940..964f9f1a7af 100644
--- a/intern/cycles/util/util_time.cpp
+++ b/intern/cycles/util/util_time.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include <stdlib.h>
@@ -71,7 +71,7 @@ void time_sleep(double t)
 
 	/* get microseconds */
 	int us = (int)(t * 1e6);
-	if (us > 0)
+	if(us > 0)
 		usleep(us);
 }
 
diff --git a/intern/cycles/util/util_time.h b/intern/cycles/util/util_time.h
index 3df17272e2f..14ffea7f3da 100644
--- a/intern/cycles/util/util_time.h
+++ b/intern/cycles/util/util_time.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_TIME_H__
diff --git a/intern/cycles/util/util_transform.cpp b/intern/cycles/util/util_transform.cpp
index 14613558501..acaca69464c 100644
--- a/intern/cycles/util/util_transform.cpp
+++ b/intern/cycles/util/util_transform.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /*
@@ -46,9 +46,11 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "util_math.h"
 #include "util_transform.h"
 
+#include "util_boundbox.h"
+#include "util_math.h"
+
 CCL_NAMESPACE_BEGIN
 
 /* Transform Inverse */
@@ -271,5 +273,15 @@ void transform_motion_decompose(DecompMotionTransform *decomp, const MotionTrans
 	decomp->post_y = post.y;
 }
 
-CCL_NAMESPACE_END
+/* Viewplane transform: move its left/bottom corner to the origin, then scale by the inverse extents. */
+Transform transform_from_viewplane(BoundBox2D& viewplane)
+{
+	const float inv_width = 1.0f / (viewplane.right - viewplane.left);
+	const float inv_height = 1.0f / (viewplane.top - viewplane.bottom);
+	const Transform to_origin =
+		transform_translate(-viewplane.left, -viewplane.bottom, 0.0f);
+	const Transform to_unit = transform_scale(inv_width, inv_height, 1.0f);
+	return to_unit * to_origin;
+}
 
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h
index 5b3dbe42f69..ba8d04b5c16 100644
--- a/intern/cycles/util/util_transform.h
+++ b/intern/cycles/util/util_transform.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_TRANSFORM_H__
@@ -55,6 +55,11 @@ typedef struct DecompMotionTransform {
 	float4 post_x, post_y;
 } DecompMotionTransform;
 
+typedef struct PerspectiveMotionTransform {
+	Transform pre;
+	Transform post;
+} PerspectiveMotionTransform;
+
 /* Functions */
 
 ccl_device_inline float3 transform_perspective(const Transform *t, const float3 a)
@@ -216,12 +221,13 @@ ccl_device_inline Transform transform_rotate(float angle, float3 axis)
 		0.0f, 0.0f, 0.0f, 1.0f);
 }
 
+/* Euler is assumed to be in XYZ order. */
 ccl_device_inline Transform transform_euler(float3 euler)
 {
 	return
-		transform_rotate(euler.x, make_float3(1.0f, 0.0f, 0.0f)) *
+		transform_rotate(euler.z, make_float3(0.0f, 0.0f, 1.0f)) *
 		transform_rotate(euler.y, make_float3(0.0f, 1.0f, 0.0f)) *
-		transform_rotate(euler.z, make_float3(0.0f, 0.0f, 1.0f));
+		transform_rotate(euler.x, make_float3(1.0f, 0.0f, 0.0f));
 }
 
 ccl_device_inline Transform transform_orthographic(float znear, float zfar)
@@ -448,6 +454,8 @@ ccl_device void transform_motion_interpolate(Transform *tfm, const DecompMotionT
 
 #ifndef __KERNEL_GPU__
 
+class BoundBox2D;
+
 ccl_device_inline bool operator==(const MotionTransform& A, const MotionTransform& B)
 {
 	return (A.pre == B.pre && A.post == B.post);
@@ -455,9 +463,41 @@ ccl_device_inline bool operator==(const MotionTransform& A, const MotionTransfor
 
 float4 transform_to_quat(const Transform& tfm);
 void transform_motion_decompose(DecompMotionTransform *decomp, const MotionTransform *motion, const Transform *mid);
+Transform transform_from_viewplane(BoundBox2D& viewplane);
 
 #endif
 
+/* TODO(sergey): This is only needed until we've got OpenCL 2.0
+ * on all devices we consider supported. It'll be replaced with
+ * generic address space.
+ */
+
+#ifdef __KERNEL_OPENCL__
+
+#define OPENCL_TRANSFORM_ADDRSPACE_GLUE(a, b) a ## b
+#define OPENCL_TRANSFORM_ADDRSPACE_DECLARE(function) \
+ccl_device_inline float3 OPENCL_TRANSFORM_ADDRSPACE_GLUE(function, _addrspace)( \
+    ccl_addr_space const Transform *t, const float3 a) \
+{ \
+  Transform private_tfm = *t; \
+  return function(&private_tfm, a); \
+}
+
+OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_point)
+OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_direction)
+OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_direction_transposed)
+
+#  undef OPENCL_TRANSFORM_ADDRSPACE_DECLARE
+#  undef OPENCL_TRANSFORM_ADDRSPACE_GLUE
+#  define transform_point_auto transform_point_addrspace
+#  define transform_direction_auto transform_direction_addrspace
+#  define transform_direction_transposed_auto transform_direction_transposed_addrspace
+#else
+#  define transform_point_auto transform_point
+#  define transform_direction_auto transform_direction
+#  define transform_direction_transposed_auto transform_direction_transposed
+#endif
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_TRANSFORM_H__ */
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 2a199e591bf..6f474f873a6 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_TYPES_H__
@@ -33,11 +33,7 @@
 
 #ifndef __KERNEL_GPU__
 
-#  ifdef NDEBUG
-#    define ccl_device static inline
-#  else
-#    define ccl_device static
-#  endif
+#define ccl_device static inline
 #define ccl_device_noinline static
 #define ccl_global
 #define ccl_constant
@@ -53,11 +49,7 @@
 #define ccl_try_align(...) /* not support for function arguments (error C2719) */
 #endif
 #define ccl_may_alias
-#  ifdef NDEBUG
-#    define ccl_always_inline __forceinline
-#  else
-#    define ccl_always_inline
-#  endif
+#define ccl_always_inline __forceinline
 #define ccl_maybe_unused
 
 #else
@@ -272,6 +264,19 @@ struct ccl_try_align(16) float4 {
 	__forceinline float& operator[](int i) { return *(&x + i); }
 };
 
+template<typename T>
+class vector3
+{
+public:
+	T x, y, z;
+
+	ccl_always_inline vector3() {}
+	ccl_always_inline vector3(const T& a)
+	  : x(a), y(a), z(a) {}
+	ccl_always_inline vector3(const T& x, const T& y, const T& z)
+	  : x(x), y(y), z(z) {}
+};
+
 #endif
 
 #ifndef __KERNEL_GPU__
@@ -465,6 +470,19 @@ enum InterpolationType {
 	INTERPOLATION_SMART = 3,
 };
 
+/* Extension types for textures.
+ *
+ * Defines how the image is extrapolated past its original bounds.
+ */
+enum ExtensionType {
+	/* Cause the image to repeat horizontally and vertically. */
+	EXTENSION_REPEAT = 0,
+	/* Extend by repeating edge pixels of the image. */
+	EXTENSION_EXTEND = 1,
+	/* Clip to image size and set exterior pixels as transparent. */
+	EXTENSION_CLIP = 2,
+};
+
 /* macros */
 
 /* hints for branch prediction, only use in code that runs a _lot_ */
@@ -476,18 +494,32 @@ enum InterpolationType {
 #  define UNLIKELY(x)     (x)
 #endif
 
+#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800))
+#  define HAS_CPP11_FEATURES
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#  if defined(HAS_CPP11_FEATURES)
+/* Some magic to be sure we don't have reference in the type. */
+template<typename T> static inline T decltype_helper(T x) { return x; }
+#    define TYPEOF(x) decltype(decltype_helper(x))
+#  else
+#    define TYPEOF(x) typeof(x)
+#  endif
+#endif
+
 /* Causes warning:
  * incompatible types when assigning to type 'Foo' from type 'Bar'
  * ... the compiler optimizes away the temp var */
 #ifdef __GNUC__
 #define CHECK_TYPE(var, type)  {  \
-	typeof(var) *__tmp;         \
+	TYPEOF(var) *__tmp;         \
 	__tmp = (type *)NULL;         \
 	(void)__tmp;                  \
 } (void)0
 
 #define CHECK_TYPE_PAIR(var_a, var_b)  {  \
-	typeof(var_a) *__tmp;                 \
+	TYPEOF(var_a) *__tmp;                 \
 	__tmp = (typeof(var_b) *)NULL;        \
 	(void)__tmp;                          \
 } (void)0
diff --git a/intern/cycles/util/util_vector.h b/intern/cycles/util/util_vector.h
index cc6e8a371ed..ee1f997721d 100644
--- a/intern/cycles/util/util_vector.h
+++ b/intern/cycles/util/util_vector.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_VECTOR_H__
@@ -19,34 +19,74 @@
 
 /* Vector */
 
-#include <string.h>
+#include <cassert>
+#include <cstring>
 #include <vector>
 
+#include "util_aligned_malloc.h"
 #include "util_types.h"
 
-CCL_NAMESPACE_BEGIN
+#ifdef WITH_CYCLES_DEBUG
+#  include "util_guarded_allocator.h"
+#endif
 
-using std::vector;
+CCL_NAMESPACE_BEGIN
 
-static inline void *malloc_aligned(size_t size, size_t alignment)
+/* Vector
+ *
+ * Own subclassed version of std::vector. Subclass is needed because:
+ *
+ * - When building with WITH_CYCLES_DEBUG we need to use own allocator which
+ *   keeps track of used/peak memory.
+ *
+ * - Have method to ensure capacity is re-set to 0.
+ */
+template<typename value_type,
+#ifdef WITH_CYCLES_DEBUG
+         typename allocator_type = GuardedAllocator<value_type>
+#else
+         typename allocator_type = std::allocator<value_type>
+#endif
+        >
+class vector : public std::vector<value_type, allocator_type>
 {
-	void *data = (void*)malloc(size + sizeof(void*) + alignment - 1);
+public:
+	/* Default constructor. */
+	explicit vector() : std::vector<value_type, allocator_type>() {  }
 
-	union { void *ptr; size_t offset; } u;
-	u.ptr = (char*)data + sizeof(void*);
-	u.offset = (u.offset + alignment - 1) & ~(alignment - 1);
-	*(((void**)u.ptr) - 1) = data;
+	/* Fill constructor. */
+	explicit vector(size_t n, const value_type& val = value_type())
+		: std::vector<value_type, allocator_type>(n, val) {  }
 
-	return u.ptr;
-}
+	/* Range constructor. */
+	template <class InputIterator>
+	vector(InputIterator first, InputIterator last)
+		: std::vector<value_type, allocator_type>(first, last) {  }
 
-static inline void free_aligned(void *ptr)
-{
-	if(ptr) {
-		void *data = *(((void**)ptr) - 1);
-		free(data);
+	/* Copy constructor. */
+	vector(const vector &x) : std::vector<value_type, allocator_type>(x) {  }
+
+	/* Drop unused capacity but keep the elements; pre-C++11 uses copy-and-swap with the same allocator. */
+	void shrink_to_fit(void) {
+#if __cplusplus < 201103L
+		vector<value_type, allocator_type>(*this).swap(*this);
+#else
+		std::vector<value_type, allocator_type>::shrink_to_fit();
+#endif
+	}
-}
+
+	void free_memory(void)
+	{
+		std::vector<value_type, allocator_type>::resize(0);
+		shrink_to_fit();
+	}
+
+	/* Some external API might demand a plain std::vector; copy by range so any allocator_type works. */
+	operator std::vector<value_type>()
+	{
+		return std::vector<value_type>(this->begin(), this->end());
+	}
+};
 
 /* Array
  *
@@ -65,6 +105,7 @@ public:
 	{
 		data = NULL;
 		datasize = 0;
+		capacity = 0;
 	}
 
 	array(size_t newsize)
@@ -72,10 +113,12 @@ public:
 		if(newsize == 0) {
 			data = NULL;
 			datasize = 0;
+			capacity = 0;
 		}
 		else {
-			data = (T*)malloc_aligned(sizeof(T)*newsize, alignment);
+			data = (T*)util_aligned_malloc(sizeof(T)*newsize, alignment);
 			datasize = newsize;
+			capacity = datasize;
 		}
 	}
 
@@ -89,11 +132,13 @@ public:
 		if(from.datasize == 0) {
 			data = NULL;
 			datasize = 0;
+			capacity = 0;
 		}
 		else {
-			data = (T*)malloc_aligned(sizeof(T)*from.datasize, alignment);
+			data = (T*)util_aligned_malloc(sizeof(T)*from.datasize, alignment);
 			memcpy(data, from.data, from.datasize*sizeof(T));
 			datasize = from.datasize;
+			capacity = datasize;
 		}
 
 		return *this;
@@ -102,13 +147,11 @@ public:
 	array& operator=(const vector<T>& from)
 	{
 		datasize = from.size();
+		capacity = datasize;
 		data = NULL;
 
 		if(datasize > 0) {
-			data = (T*)malloc_aligned(sizeof(T)*datasize, alignment);
-			memcpy(data, &from[0], datasize*sizeof(T));
-			free_aligned(data);
-			data = (T*)malloc_aligned(sizeof(T)*datasize, alignment);
+			data = (T*)util_aligned_malloc(sizeof(T)*datasize, alignment);
 			memcpy(data, &from[0], datasize*sizeof(T));
 		}
 
@@ -117,7 +160,7 @@ public:
 
 	~array()
 	{
-		free_aligned(data);
+		util_aligned_free(data);
 	}
 
 	void resize(size_t newsize)
@@ -126,22 +169,25 @@ public:
 			clear();
 		}
 		else if(newsize != datasize) {
-			T *newdata = (T*)malloc_aligned(sizeof(T)*newsize, alignment);
-			if(data) {
-				memcpy(newdata, data, ((datasize < newsize)? datasize: newsize)*sizeof(T));
-				free_aligned(data);
+			if(newsize > capacity) {
+				T *newdata = (T*)util_aligned_malloc(sizeof(T)*newsize, alignment);
+				if(data) {
+					memcpy(newdata, data, ((datasize < newsize)? datasize: newsize)*sizeof(T));
+					util_aligned_free(data);
+				}
+				data = newdata;
+				capacity = newsize;
 			}
-
-			data = newdata;
 			datasize = newsize;
 		}
 	}
 
 	void clear()
 	{
-		free_aligned(data);
+		util_aligned_free(data);
 		data = NULL;
 		datasize = 0;
+		capacity = 0;
 	}
 
 	size_t size() const
@@ -151,12 +197,26 @@ public:
 
 	T& operator[](size_t i) const
 	{
+		assert(i < datasize);
 		return data[i];
 	}
 
+	/* Grow storage to hold at least newcapacity elements; never shrinks, stored data is copied over. */
+	void reserve(size_t newcapacity) {
+		if(newcapacity <= capacity) return;
+		T *grown = (T*)util_aligned_malloc(sizeof(T)*newcapacity, alignment);
+		if(data != NULL) {
+			memcpy(grown, data, ((newcapacity > datasize)? datasize: newcapacity)*sizeof(T));
+			util_aligned_free(data);
+		}
+		data = grown;
+		capacity = newcapacity;
+	}
+
 protected:
 	T *data;
 	size_t datasize;
+	size_t capacity;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_view.cpp b/intern/cycles/util/util_view.cpp
index 6bf9c9ed8c0..9b5cd22fb4a 100644
--- a/intern/cycles/util/util_view.cpp
+++ b/intern/cycles/util/util_view.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include <stdio.h>
@@ -98,7 +98,7 @@ void view_display_help()
 	glColor3f(0.8f, 0.8f, 0.8f);
 
 	view_display_text(x1+20, y2-20, "Cycles Renderer");
-	view_display_text(x1+20, y2-40, "(C) 2011-2014 Blender Foundation");
+	view_display_text(x1+20, y2-40, "(C) 2011-2015 Blender Foundation");
 	view_display_text(x1+20, y2-80, "Controls:");
 	view_display_text(x1+20, y2-100, "h:  Info/Help");
 	view_display_text(x1+20, y2-120, "r:  Reset");
@@ -110,6 +110,7 @@ void view_display_help()
 	view_display_text(x1+20, y2-230, "Left mouse:  Move camera");
 	view_display_text(x1+20, y2-250, "Right mouse:  Rotate camera");
 	view_display_text(x1+20, y2-270, "W/A/S/D:  Move camera");
+	view_display_text(x1+20, y2-290, "0/1/2/3:  Set max bounces");
 
 	glColor3f(1.0f, 1.0f, 1.0f);
 }
@@ -248,7 +249,7 @@ void view_main_loop(const char *title, int width, int height,
 	glutInitDisplayMode(GLUT_RGB|GLUT_DOUBLE|GLUT_DEPTH);
 	glutCreateWindow(title);
 
-	glewInit();
+	mxMakeCurrentContext(mxCreateContext());
 
 	view_reshape(width, height);
 
diff --git a/intern/cycles/util/util_view.h b/intern/cycles/util/util_view.h
index 65d890eb6af..5def0564175 100644
--- a/intern/cycles/util/util_view.h
+++ b/intern/cycles/util/util_view.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_VIEW_H__
diff --git a/intern/cycles/util/util_xml.h b/intern/cycles/util/util_xml.h
index 9d1ebc2114a..cfd0afc95f7 100644
--- a/intern/cycles/util/util_xml.h
+++ b/intern/cycles/util/util_xml.h
@@ -11,15 +11,15 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __UTIL_XML_H__
 #define __UTIL_XML_H__
 
-/* PugiXML from OpenImageIO is used for XML parsing. */
+/* PugiXML is used for XML parsing. */
 
-#include <OpenImageIO/pugixml.hpp>
+#include <pugixml.hpp>
 
 CCL_NAMESPACE_BEGIN