242 files changed, 12759 insertions, 4403 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 2ba6af48d0d..ed6961f49e0 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -64,7 +64,7 @@ elseif(CMAKE_COMPILER_IS_GNUCC)
 		set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse")
 	endif()
 	if(CXX_HAS_AVX2)
-		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mfpmath=sse")
+		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c -mfpmath=sse")
 	endif()
 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
 elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
@@ -80,7 +80,7 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 		set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx")
 	endif()
 	if(CXX_HAS_AVX2)
-		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2")
+		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
 	endif()
 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
 endif()
@@ -152,6 +152,27 @@ add_definitions(
 	-DWITH_MULTI
 )
 
+TEST_UNORDERED_MAP_SUPPORT()
+if(HAVE_STD_UNORDERED_MAP_HEADER)
+    if(HAVE_UNORDERED_MAP_IN_STD_NAMESPACE)
+        add_definitions(-DCYCLES_STD_UNORDERED_MAP)
+    else()
+        if(HAVE_UNORDERED_MAP_IN_TR1_NAMESPACE)
+            add_definitions(-DCYCLES_STD_UNORDERED_MAP_IN_TR1_NAMESPACE)
+        else()
+            add_definitions(-DCYCLES_NO_UNORDERED_MAP)
+            message(STATUS "Replacing unordered_map/set with map/set (warning: slower!)")
+        endif()
+    endif()
+else()
+    if(HAVE_UNORDERED_MAP_IN_TR1_NAMESPACE)
+        add_definitions(-DCYCLES_TR1_UNORDERED_MAP)
+    else()
+        add_definitions(-DCYCLES_NO_UNORDERED_MAP)
+        message(STATUS "Replacing unordered_map/set with map/set (warning: slower!)")
+    endif()
+endif()
+
 # Logging capabilities using GLog library.
 if(WITH_CYCLES_LOGGING)
 	add_definitions(-DWITH_CYCLES_LOGGING)
diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript
index 9cbdb93ce85..99df8c299fc 100644
--- a/intern/cycles/SConscript
+++ b/intern/cycles/SConscript
@@ -34,12 +34,8 @@ cycles.Depends('../../source/blender/makesrna/intern/RNA_blender_cpp.h', 'makesr
 
 sources = cycles.Glob('bvh/*.cpp') + cycles.Glob('device/*.cpp') + cycles.Glob('kernel/*.cpp') + cycles.Glob('render/*.cpp') + cycles.Glob('subd/*.cpp') + cycles.Glob('util/*.cpp') + cycles.Glob('blender/*.cpp')
 
+sources.append(path.join('kernel', 'kernels', 'cpu', 'kernel.cpp'))
 sources.remove(path.join('util', 'util_view.cpp'))
-sources.remove(path.join('kernel', 'kernel_sse2.cpp'))
-sources.remove(path.join('kernel', 'kernel_sse3.cpp'))
-sources.remove(path.join('kernel', 'kernel_sse41.cpp'))
-sources.remove(path.join('kernel', 'kernel_avx.cpp'))
-sources.remove(path.join('kernel', 'kernel_avx2.cpp'))
 
 incs = [] 
 defs = []
@@ -47,6 +43,18 @@ cxxflags = Split(env['CXXFLAGS'])
 
 defs += env['BF_GL_DEFINITIONS']
 
+if env['WITH_UNORDERED_MAP_SUPPORT']:
+    if env['UNORDERED_MAP_HEADER'] == 'unordered_map':
+        if env['UNORDERED_MAP_NAMESPACE'] == 'std':
+            defs.append('CYCLES_STD_UNORDERED_MAP')
+        elif env['UNORDERED_MAP_NAMESPACE'] == 'std::tr1':
+            defs.append('CYCLES_STD_UNORDERED_MAP_IN_TR1_NAMESPACE')
+    elif env['UNORDERED_MAP_NAMESPACE'] == 'std::tr1':
+        defs.append('CYCLES_TR1_UNORDERED_MAP')
+else:
+    print("-- Replacing unordered_map/set with map/set (warning: slower!)")
+    defs.append('CYCLES_NO_UNORDERED_MAP')
+
 defs.append('CCL_NAMESPACE_BEGIN=namespace ccl {')
 defs.append('CCL_NAMESPACE_END=}')
 
@@ -128,13 +136,13 @@ else:
 
     if (env['C_COMPILER_ID'] == 'gcc' and env['CCVERSION'] >= '4.6') or (env['C_COMPILER_ID'] == 'clang' and env['CCVERSION'] >= '3.1'):
         kernel_flags['avx'] = kernel_flags['sse41'] + ' -mavx'
-        kernel_flags['avx2'] = kernel_flags['avx'] + ' -mavx2 -mfma -mlzcnt -mbmi -mbmi2'
+        kernel_flags['avx2'] = kernel_flags['avx'] + ' -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c'
 
 for kernel_type in kernel_flags.keys():
     defs.append('WITH_KERNEL_' + kernel_type.upper())
 
 for kernel_type in kernel_flags.keys():
-    kernel_source = path.join('kernel', 'kernel_' + kernel_type + '.cpp')
+    kernel_source = path.join('kernel', 'kernels', 'cpu', 'kernel_' + kernel_type + '.cpp')
     kernel_cxxflags = Split(env['CXXFLAGS'])
     kernel_cxxflags.append(kernel_flags[kernel_type].split())
     kernel_defs = defs[:]
diff --git a/intern/cycles/app/cycles_server.cpp b/intern/cycles/app/cycles_server.cpp
index 3d5b237eec1..4ef9cd070bb 100644
--- a/intern/cycles/app/cycles_server.cpp
+++ b/intern/cycles/app/cycles_server.cpp
@@ -24,6 +24,7 @@
 #include "util_stats.h"
 #include "util_string.h"
 #include "util_task.h"
+#include "util_logging.h"
 
 using namespace ccl;
 
@@ -66,7 +67,7 @@ int main(int argc, const char **argv)
 		exit(EXIT_FAILURE);
 	}
 
-	if (debug) {
+	if(debug) {
 		util_logging_start();
 		util_logging_verbosity_set(verbosity);
 	}
diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp
index 42c339f56e5..b0d49d6ee72 100644
--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@ -21,6 +21,7 @@
 #include "device.h"
 #include "scene.h"
 #include "session.h"
+#include "integrator.h"
 
 #include "util_args.h"
 #include "util_foreach.h"
@@ -124,7 +125,7 @@ static void scene_init()
 	xml_read_file(options.scene, options.filepath.c_str());
 
 	/* Camera width/height override? */
-	if (!(options.width == 0 || options.height == 0)) {
+	if(!(options.width == 0 || options.height == 0)) {
 		options.scene->camera->width = options.width;
 		options.scene->camera->height = options.height;
 	}
@@ -272,6 +273,7 @@ static void keyboard(unsigned char key)
 	else if(key == 'i')
 		options.interactive = !(options.interactive);
 
+	/* Navigation */
 	else if(options.interactive && (key == 'w' || key == 'a' || key == 's' || key == 'd')) {
 		Transform matrix = options.session->scene->camera->matrix;
 		float3 translate;
@@ -294,6 +296,25 @@ static void keyboard(unsigned char key)
 
 		options.session->reset(session_buffer_params(), options.session_params.samples);
 	}
+
+	/* Set Max Bounces */
+	else if(options.interactive && (key == '0' || key == '1' || key == '2' || key == '3')) {
+		int bounce;
+		switch(key) {
+			case '0': bounce = 0; break;
+			case '1': bounce = 1; break;
+			case '2': bounce = 2; break;
+			case '3': bounce = 3; break;
+			default: bounce = 0; break;
+		}
+
+		options.session->scene->integrator->max_bounce = bounce;
+
+		/* Update and Reset */
+		options.session->scene->integrator->need_update = true;
+
+		options.session->reset(session_buffer_params(), options.session_params.samples);
+	}
 }
 #endif
 
@@ -367,7 +388,7 @@ static void options_parse(int argc, const char **argv)
 		exit(EXIT_FAILURE);
 	}
 
-	if (debug) {
+	if(debug) {
 		util_logging_start();
 		util_logging_verbosity_set(verbosity);
 	}
@@ -377,7 +398,8 @@ static void options_parse(int argc, const char **argv)
 		printf("Devices:\n");
 
 		foreach(DeviceInfo& info, devices) {
-			printf("    %s%s\n",
+			printf("    %-10s%s%s\n",
+				Device::string_from_type(info.type).c_str(),
 				info.description.c_str(),
 				(info.display_device)? " (display)": "");
 		}
diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp
index 05e34387eb7..edea8cd0ec4 100644
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -20,6 +20,7 @@
 #include <algorithm>
 #include <iterator>
 
+#include "background.h"
 #include "camera.h"
 #include "film.h"
 #include "graph.h"
@@ -55,6 +56,16 @@ struct XMLReadState {
 	string base;		/* base path to current file*/
 	float dicing_rate;	/* current dicing rate */
 	Mesh::DisplacementMethod displacement_method;
+
+	XMLReadState()
+	  : scene(NULL),
+	    smooth(false),
+	    shader(0),
+	    dicing_rate(0.0f),
+	    displacement_method(Mesh::DISPLACE_BUMP)
+	{
+		tfm = transform_identity();
+	}
 };
 
 /* Attribute Reading */
@@ -225,21 +236,21 @@ static ShaderSocketType xml_read_socket_type(pugi::xml_node node, const char *na
 
 	if(attr) {
 		string value = attr.value();
-		if (string_iequals(value, "float"))
+		if(string_iequals(value, "float"))
 			return SHADER_SOCKET_FLOAT;
-		else if (string_iequals(value, "int"))
+		else if(string_iequals(value, "int"))
 			return SHADER_SOCKET_INT;
-		else if (string_iequals(value, "color"))
+		else if(string_iequals(value, "color"))
 			return SHADER_SOCKET_COLOR;
-		else if (string_iequals(value, "vector"))
+		else if(string_iequals(value, "vector"))
 			return SHADER_SOCKET_VECTOR;
-		else if (string_iequals(value, "point"))
+		else if(string_iequals(value, "point"))
 			return SHADER_SOCKET_POINT;
-		else if (string_iequals(value, "normal"))
+		else if(string_iequals(value, "normal"))
 			return SHADER_SOCKET_NORMAL;
-		else if (string_iequals(value, "closure color"))
+		else if(string_iequals(value, "closure color"))
 			return SHADER_SOCKET_CLOSURE;
-		else if (string_iequals(value, "string"))
+		else if(string_iequals(value, "string"))
 			return SHADER_SOCKET_STRING;
 		else
 			fprintf(stderr, "Unknown shader socket type \"%s\" for attribute \"%s\".\n", value.c_str(), name);
@@ -381,6 +392,10 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 	for(pugi::xml_node node = graph_node.first_child(); node; node = node.next_sibling()) {
 		ShaderNode *snode = NULL;
 
+		/* ToDo: Add missing nodes
+		 * RGBCurvesNode, VectorCurvesNode, RGBRampNode and ConvertNode (RGB -> BW).
+		 */
+
 		if(string_iequals(node.name(), "image_texture")) {
 			ImageTextureNode *img = new ImageTextureNode();
 
@@ -391,6 +406,8 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 			xml_read_enum(&img->projection, ImageTextureNode::projection_enum, node, "projection");
 			xml_read_float(&img->projection_blend, node, "projection_blend");
 
+			/* ToDo: Interpolation */
+
 			snode = img;
 		}
 		else if(string_iequals(node.name(), "environment_texture")) {
@@ -419,25 +436,25 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 			 * Socket names must be stored in the extra lists instead. */
 			/* read input values */
 			for(pugi::xml_node param = node.first_child(); param; param = param.next_sibling()) {
-				if (string_iequals(param.name(), "input")) {
+				if(string_iequals(param.name(), "input")) {
 					string name;
-					if (!xml_read_string(&name, param, "name"))
+					if(!xml_read_string(&name, param, "name"))
 						continue;
 					
 					ShaderSocketType type = xml_read_socket_type(param, "type");
-					if (type == SHADER_SOCKET_UNDEFINED)
+					if(type == SHADER_SOCKET_UNDEFINED)
 						continue;
 					
 					osl->input_names.push_back(ustring(name));
 					osl->add_input(osl->input_names.back().c_str(), type);
 				}
-				else if (string_iequals(param.name(), "output")) {
+				else if(string_iequals(param.name(), "output")) {
 					string name;
-					if (!xml_read_string(&name, param, "name"))
+					if(!xml_read_string(&name, param, "name"))
 						continue;
 					
 					ShaderSocketType type = xml_read_socket_type(param, "type");
-					if (type == SHADER_SOCKET_UNDEFINED)
+					if(type == SHADER_SOCKET_UNDEFINED)
 						continue;
 					
 					osl->output_names.push_back(ustring(name));
@@ -493,10 +510,6 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 			xml_read_int(&magic->depth, node, "depth");
 			snode = magic;
 		}
-		else if(string_iequals(node.name(), "noise_texture")) {
-			NoiseTextureNode *dist = new NoiseTextureNode();
-			snode = dist;
-		}
 		else if(string_iequals(node.name(), "wave_texture")) {
 			WaveTextureNode *wave = new WaveTextureNode();
 			xml_read_enum(&wave->type, WaveTextureNode::type_enum, node, "type");
@@ -507,6 +520,11 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 			xml_read_float3(&normal->direction, node, "direction");
 			snode = normal;
 		}
+		else if(string_iequals(node.name(), "bump")) {
+			BumpNode *bump = new BumpNode();
+			xml_read_bool(&bump->invert, node, "invert");
+			snode = bump;
+		}
 		else if(string_iequals(node.name(), "mapping")) {
 			snode = new MappingNode();
 		}
@@ -561,6 +579,9 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 		else if(string_iequals(node.name(), "background")) {
 			snode = new BackgroundNode();
 		}
+		else if(string_iequals(node.name(), "holdout")) {
+			snode = new HoldoutNode();
+		}
 		else if(string_iequals(node.name(), "absorption_volume")) {
 			snode = new AbsorptionVolumeNode();
 		}
@@ -569,7 +590,14 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 		}
 		else if(string_iequals(node.name(), "subsurface_scattering")) {
 			SubsurfaceScatteringNode *sss = new SubsurfaceScatteringNode();
-			//xml_read_enum(&sss->falloff, SubsurfaceScatteringNode::falloff_enum, node, "falloff");
+
+			string falloff;
+			xml_read_string(&falloff, node, "falloff");
+			if(falloff == "cubic")
+				sss->closure = CLOSURE_BSSRDF_CUBIC_ID;
+			else
+				sss->closure = CLOSURE_BSSRDF_GAUSSIAN_ID;
+
 			snode = sss;
 		}
 		else if(string_iequals(node.name(), "geometry")) {
@@ -613,6 +641,7 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 			snode = new InvertNode();
 		}
 		else if(string_iequals(node.name(), "mix")) {
+			/* ToDo: Tag Mix case for optimization */
 			MixNode *mix = new MixNode();
 			xml_read_enum(&mix->type, MixNode::type_enum, node, "type");
 			xml_read_bool(&mix->use_clamp, node, "use_clamp");
@@ -637,10 +666,10 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 			snode = new SeparateHSVNode();
 		}
 		else if(string_iequals(node.name(), "combine_xyz")) {
-			snode = new CombineHSVNode();
+			snode = new CombineXYZNode();
 		}
 		else if(string_iequals(node.name(), "separate_xyz")) {
-			snode = new SeparateHSVNode();
+			snode = new SeparateXYZNode();
 		}
 		else if(string_iequals(node.name(), "hsv")) {
 			snode = new HSVNode();
@@ -822,6 +851,15 @@ static void xml_read_shader(const XMLReadState& state, pugi::xml_node node)
 
 static void xml_read_background(const XMLReadState& state, pugi::xml_node node)
 {
+	/* Background Settings */
+	Background *bg = state.scene->background;
+
+	xml_read_float(&bg->ao_distance, node, "ao_distance");
+	xml_read_float(&bg->ao_factor, node, "ao_factor");
+
+	xml_read_bool(&bg->transparent, node, "transparent");
+
+	/* Background Shader */
 	Shader *shader = state.scene->shaders[state.scene->default_background];
 	
 	xml_read_bool(&shader->heterogeneous_volume, node, "heterogeneous_volume");
@@ -868,6 +906,7 @@ static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node)
 
 	/* read vertices and polygons, RIB style */
 	vector<float3> P;
+	vector<float> UV;
 	vector<int> verts, nverts;
 
 	xml_read_float3_array(P, node, "P");
@@ -939,6 +978,31 @@ static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node)
 
 			index_offset += nverts[i];
 		}
+
+		if(xml_read_float_array(UV, node, "UV")) {
+			ustring name = ustring("UVMap");
+			Attribute *attr = mesh->attributes.add(ATTR_STD_UV, name);
+			float3 *fdata = attr->data_float3();
+
+			/* loop over the triangles */
+			index_offset = 0;
+			for(size_t i = 0; i < nverts.size(); i++) {
+				for(int j = 0; j < nverts[i]-2; j++) {
+					int v0 = verts[index_offset];
+					int v1 = verts[index_offset + j + 1];
+					int v2 = verts[index_offset + j + 2];
+
+					assert(v0*2+1 < (int)UV.size());
+					assert(v1*2+1 < (int)UV.size());
+					assert(v2*2+1 < (int)UV.size());
+
+					fdata[0] = make_float3(UV[v0*2], UV[v0*2+1], 0.0);
+					fdata[1] = make_float3(UV[v1*2], UV[v1*2+1], 0.0);
+					fdata[2] = make_float3(UV[v2*2], UV[v2*2+1], 0.0);
+					fdata += 3;
+				}
+			}
+		}
 	}
 
 	/* temporary for test compatibility */
@@ -1028,13 +1092,28 @@ static void xml_read_light(const XMLReadState& state, pugi::xml_node node)
 	xml_read_float(&light->sizev, node, "sizev");
 	xml_read_float3(&light->axisu, node, "axisu");
 	xml_read_float3(&light->axisv, node, "axisv");
-	
+
+	/* Portal? (Area light only) */
+	xml_read_bool(&light->is_portal, node, "is_portal");
+
 	/* Generic */
 	xml_read_float(&light->size, node, "size");
 	xml_read_float3(&light->dir, node, "dir");
 	xml_read_float3(&light->co, node, "P");
 	light->co = transform_point(&state.tfm, light->co);
 
+	/* Settings */
+	xml_read_bool(&light->cast_shadow, node, "cast_shadow");
+	xml_read_bool(&light->use_mis, node, "use_mis");
+	xml_read_int(&light->samples, node, "samples");
+	xml_read_int(&light->max_bounces, node, "max_bounces");
+
+	/* Ray Visibility */
+	xml_read_bool(&light->use_diffuse, node, "use_diffuse");
+	xml_read_bool(&light->use_glossy, node, "use_glossy");
+	xml_read_bool(&light->use_transmission, node, "use_transmission");
+	xml_read_bool(&light->use_scatter, node, "use_scatter");
+
 	state.scene->lights.push_back(light);
 }
 
@@ -1178,7 +1257,8 @@ static void xml_read_include(const XMLReadState& state, const string& src)
 		XMLReadState substate = state;
 		substate.base = path_dirname(path);
 
-		xml_read_scene(substate, doc);
+		pugi::xml_node cycles = doc.child("cycles");
+		xml_read_scene(substate, cycles);
 	}
 	else {
 		fprintf(stderr, "%s read error: %s\n", src.c_str(), parse_result.description());
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py
index 5b0c6a84bb5..0783c1c4cba 100644
--- a/intern/cycles/blender/addon/__init__.py
+++ b/intern/cycles/blender/addon/__init__.py
@@ -30,8 +30,10 @@ bl_info = {
 
 import bpy
 
-from . import engine
-from . import version_update
+from . import (
+        engine,
+        version_update,
+        )
 
 
 class CyclesRender(bpy.types.RenderEngine):
@@ -65,8 +67,8 @@ class CyclesRender(bpy.types.RenderEngine):
     def render(self, scene):
         engine.render(self)
 
-    def bake(self, scene, obj, pass_type, pixel_array, num_pixels, depth, result):
-        engine.bake(self, obj, pass_type, pixel_array, num_pixels, depth, result)
+    def bake(self, scene, obj, pass_type, object_id, pixel_array, num_pixels, depth, result):
+        engine.bake(self, obj, pass_type, object_id, pixel_array, num_pixels, depth, result)
 
     # viewport render
     def view_update(self, context):
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index e50a8e45b52..4187e2381ac 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -59,11 +59,11 @@ def render(engine):
         _cycles.render(engine.session)
 
 
-def bake(engine, obj, pass_type, pixel_array, num_pixels, depth, result):
+def bake(engine, obj, pass_type, object_id, pixel_array, num_pixels, depth, result):
     import _cycles
     session = getattr(engine, "session", None)
     if session is not None:
-        _cycles.bake(engine.session, obj.as_pointer(), pass_type, pixel_array.as_pointer(), num_pixels, depth, result.as_pointer())
+        _cycles.bake(engine.session, obj.as_pointer(), pass_type, object_id, pixel_array.as_pointer(), num_pixels, depth, result.as_pointer())
 
 
 def reset(engine, data, scene):
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index a86f162ae5f..16a807b3af5 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -66,6 +66,7 @@ enum_panorama_types = (
     ('FISHEYE_EQUIDISTANT', "Fisheye Equidistant", "Ideal for fulldomes, ignore the sensor dimensions"),
     ('FISHEYE_EQUISOLID', "Fisheye Equisolid",
                           "Similar to most fisheye modern lens, takes sensor dimensions into consideration"),
+    ('MIRRORBALL', "Mirror Ball", "Uses the mirror ball mapping"),
     )
 
 enum_curve_primitives = (
@@ -393,6 +394,12 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 default=0,
                 )
 
+        cls.use_animated_seed = BoolProperty(
+                name="Use Animated Seed",
+                description="Use different seed values (and hence noise patterns) at different frames",
+                default=False,
+                )
+
         cls.sample_clamp_direct = FloatProperty(
                 name="Clamp Direct",
                 description="If non-zero, the maximum value for a direct sample, "
@@ -693,6 +700,12 @@ class CyclesLampSettings(bpy.types.PropertyGroup):
                             "reduces noise for area lamps and sharp glossy materials",
                 default=False,
                 )
+        cls.is_portal = BoolProperty(
+                name="Is Portal",
+                description="Use this area lamp to guide sampling of the background, "
+                            "note that this will make the lamp invisible",
+                default=False,
+                )
 
     @classmethod
     def unregister(cls):
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 63518d7fdb6..f6ff86ac30e 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -18,7 +18,11 @@
 
 import bpy
 
-from bpy.types import Panel, Menu, Operator
+from bpy.types import (
+        Panel,
+        Menu,
+        Operator,
+        )
 
 
 class CYCLES_MT_sampling_presets(Menu):
@@ -56,7 +60,15 @@ def use_cpu(context):
     return (device_type == 'NONE' or cscene.device == 'CPU')
 
 
-def draw_samples_info(layout, cscene):
+def use_branched_path(context):
+    cscene = context.scene.cycles
+    device_type = context.user_preferences.system.compute_device_type
+
+    return (cscene.progressive == 'BRANCHED_PATH' and device_type != 'OPENCL')
+
+
+def draw_samples_info(layout, context):
+    cscene = context.scene.cycles
     integrator = cscene.progressive
 
     # Calculate sample values
@@ -86,7 +98,7 @@ def draw_samples_info(layout, cscene):
 
     # Draw interface
     # Do not draw for progressive, when Square Samples are disabled
-    if (integrator == 'BRANCHED_PATH') or (cscene.use_square_samples and integrator == 'PATH'):
+    if use_branched_path(context) or (cscene.use_square_samples and integrator == 'PATH'):
         col = layout.column(align=True)
         col.scale_y = 0.6
         col.label("Total Samples:")
@@ -110,6 +122,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
 
         scene = context.scene
         cscene = scene.cycles
+        device_type = context.user_preferences.system.compute_device_type
 
         row = layout.row(align=True)
         row.menu("CYCLES_MT_sampling_presets", text=bpy.types.CYCLES_MT_sampling_presets.bl_label)
@@ -117,7 +130,9 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
         row.operator("render.cycles_sampling_preset_add", text="", icon="ZOOMOUT").remove_active = True
 
         row = layout.row()
-        row.prop(cscene, "progressive", text="")
+        sub = row.row()
+        sub.active = device_type != 'OPENCL'
+        sub.prop(cscene, "progressive", text="")
         row.prop(cscene, "use_square_samples")
 
         split = layout.split()
@@ -125,11 +140,15 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
         col = split.column()
         sub = col.column(align=True)
         sub.label("Settings:")
-        sub.prop(cscene, "seed")
+
+        seed_sub = sub.row(align=True)
+        seed_sub.prop(cscene, "seed")
+        seed_sub.prop(cscene, "use_animated_seed", text="", icon="TIME")
+
         sub.prop(cscene, "sample_clamp_direct")
         sub.prop(cscene, "sample_clamp_indirect")
 
-        if cscene.progressive == 'PATH':
+        if cscene.progressive == 'PATH' or use_branched_path(context) == False:
             col = split.column()
             sub = col.column(align=True)
             sub.label(text="Samples:")
@@ -163,7 +182,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
                 layout.row().prop(cscene, "use_layer_samples")
                 break
 
-        draw_samples_info(layout, cscene)
+        draw_samples_info(layout, context)
 
 
 class CyclesRender_PT_volume_sampling(CyclesButtonsPanel, Panel):
@@ -412,6 +431,51 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel):
         col.prop(rl, "use_pass_emit", text="Emission")
         col.prop(rl, "use_pass_environment")
 
+        if hasattr(rd, "debug_pass_type"):
+            layout.prop(rd, "debug_pass_type")
+
+
+class CyclesRender_PT_views(CyclesButtonsPanel, Panel):
+    bl_label = "Views"
+    bl_context = "render_layer"
+
+    def draw_header(self, context):
+        rd = context.scene.render
+        self.layout.prop(rd, "use_multiview", text="")
+
+    def draw(self, context):
+        layout = self.layout
+
+        scene = context.scene
+        rd = scene.render
+        rv = rd.views.active
+
+        layout.active = rd.use_multiview
+        basic_stereo = (rd.views_format == 'STEREO_3D')
+
+        row = layout.row()
+        row.prop(rd, "views_format", expand=True)
+
+        if basic_stereo:
+            row = layout.row()
+            row.template_list("RENDERLAYER_UL_renderviews", "name", rd, "stereo_views", rd.views, "active_index", rows=2)
+
+            row = layout.row()
+            row.label(text="File Suffix:")
+            row.prop(rv, "file_suffix", text="")
+
+        else:
+            row = layout.row()
+            row.template_list("RENDERLAYER_UL_renderviews", "name", rd, "views", rd.views, "active_index", rows=2)
+
+            col = row.column(align=True)
+            col.operator("scene.render_view_add", icon='ZOOMIN', text="")
+            col.operator("scene.render_view_remove", icon='ZOOMOUT', text="")
+
+            row = layout.row()
+            row.label(text="Camera Suffix:")
+            row.prop(rv, "camera_suffix", text="")
+
 
 class Cycles_PT_post_processing(CyclesButtonsPanel, Panel):
     bl_label = "Post Processing"
@@ -456,7 +520,16 @@ class CyclesCamera_PT_dof(CyclesButtonsPanel, Panel):
         sub = col.row()
         sub.active = cam.dof_object is None
         sub.prop(cam, "dof_distance", text="Distance")
-        col.prop(dof_options, "fstop")
+
+        hq_support = dof_options.is_hq_supported
+        sub = col.column(align=True)
+        sub.label("Viewport:")
+        subhq = sub.column()
+        subhq.active = hq_support
+        subhq.prop(dof_options, "use_high_quality")
+        sub.prop(dof_options, "fstop")
+        if dof_options.use_high_quality and hq_support:
+            sub.prop(dof_options, "blades")
 
         col = split.column()
 
@@ -490,11 +563,16 @@ class Cycles_PT_context_material(CyclesButtonsPanel, Panel):
         ob = context.object
         slot = context.material_slot
         space = context.space_data
+        is_sortable = len(ob.material_slots) > 1
 
         if ob:
+            rows = 1
+            if (is_sortable):
+                rows = 4
+
             row = layout.row()
 
-            row.template_list("MATERIAL_UL_matslots", "", ob, "material_slots", ob, "active_material_index", rows=1)
+            row.template_list("MATERIAL_UL_matslots", "", ob, "material_slots", ob, "active_material_index", rows=rows)
 
             col = row.column(align=True)
             col.operator("object.material_slot_add", icon='ZOOMIN', text="")
@@ -502,6 +580,12 @@ class Cycles_PT_context_material(CyclesButtonsPanel, Panel):
 
             col.menu("MATERIAL_MT_specials", icon='DOWNARROW_HLT', text="")
 
+            if is_sortable:
+                col.separator()
+
+                col.operator("object.material_slot_move", icon='TRIA_UP', text="").direction = 'UP'
+                col.operator("object.material_slot_move", icon='TRIA_DOWN', text="").direction = 'DOWN'
+
             if ob.mode == 'EDIT':
                 row = layout.row(align=True)
                 row.operator("object.material_slot_assign", text="Assign")
@@ -563,7 +647,13 @@ class CyclesObject_PT_motion_blur(CyclesButtonsPanel, Panel):
     @classmethod
     def poll(cls, context):
         ob = context.object
-        return CyclesButtonsPanel.poll(context) and ob and ob.type in {'MESH', 'CURVE', 'CURVE', 'SURFACE', 'FONT', 'META'}
+        if CyclesButtonsPanel.poll(context) and ob:
+            if ob.type in {'MESH', 'CURVE', 'CURVE', 'SURFACE', 'FONT', 'META'}:
+                return True
+            if ob.dupli_type == 'GROUP' and ob.dupli_group:
+                return True
+            # TODO(sergey): More duplicator types here?
+        return False
 
     def draw_header(self, context):
         layout = self.layout
@@ -606,8 +696,8 @@ class CyclesObject_PT_ray_visibility(CyclesButtonsPanel, Panel):
     def poll(cls, context):
         ob = context.object
         return (CyclesButtonsPanel.poll(context) and
-                ob and ob.type in {'MESH', 'CURVE', 'SURFACE', 'FONT', 'META', 'LAMP'} or
-                ob and ob.dupli_type == 'GROUP' and ob.dupli_group)
+                ob and ((ob.type in {'MESH', 'CURVE', 'SURFACE', 'FONT', 'META', 'LAMP'}) or
+                        (ob.dupli_type == 'GROUP' and ob.dupli_group)))
 
     def draw(self, context):
         layout = self.layout
@@ -652,9 +742,14 @@ def find_node(material, nodetype):
     if material and material.node_tree:
         ntree = material.node_tree
 
+        active_output_node = None
         for node in ntree.nodes:
             if getattr(node, "type", None) == nodetype:
-                return node
+                if getattr(node, "is_active_output", True):
+                    return node
+                if not active_output_node:
+                    active_output_node = node
+        return active_output_node
 
     return None
 
@@ -691,7 +786,10 @@ class CyclesLamp_PT_preview(CyclesButtonsPanel, Panel):
 
     @classmethod
     def poll(cls, context):
-        return context.lamp and CyclesButtonsPanel.poll(context)
+        return context.lamp and \
+               not (context.lamp.type == 'AREA' and
+                    context.lamp.cycles.is_portal) \
+               and CyclesButtonsPanel.poll(context)
 
     def draw(self, context):
         self.layout.template_preview(context.lamp)
@@ -729,13 +827,21 @@ class CyclesLamp_PT_lamp(CyclesButtonsPanel, Panel):
                 sub.prop(lamp, "size", text="Size X")
                 sub.prop(lamp, "size_y", text="Size Y")
 
-        if cscene.progressive == 'BRANCHED_PATH':
-            col.prop(clamp, "samples")
-        col.prop(clamp, "max_bounces")
+        if not (lamp.type == 'AREA' and clamp.is_portal):
+            sub = col.column(align=True)
+            if use_branched_path(context):
+                sub.prop(clamp, "samples")
+            sub.prop(clamp, "max_bounces")
 
         col = split.column()
-        col.prop(clamp, "cast_shadow")
-        col.prop(clamp, "use_multiple_importance_sampling", text="Multiple Importance")
+
+        sub = col.column(align=True)
+        sub.active = not (lamp.type == 'AREA' and clamp.is_portal)
+        sub.prop(clamp, "cast_shadow")
+        sub.prop(clamp, "use_multiple_importance_sampling", text="Multiple Importance")
+
+        if lamp.type == 'AREA':
+            col.prop(clamp, "is_portal", text="Portal")
 
         if lamp.type == 'HEMI':
             layout.label(text="Not supported, interpreted as sun lamp")
@@ -747,7 +853,9 @@ class CyclesLamp_PT_nodes(CyclesButtonsPanel, Panel):
 
     @classmethod
     def poll(cls, context):
-        return context.lamp and CyclesButtonsPanel.poll(context)
+        return context.lamp and not (context.lamp.type == 'AREA' and
+                                     context.lamp.cycles.is_portal) and \
+               CyclesButtonsPanel.poll(context)
 
     def draw(self, context):
         layout = self.layout
@@ -930,7 +1038,7 @@ class CyclesWorld_PT_settings(CyclesButtonsPanel, Panel):
         sub = col.column(align=True)
         sub.active = cworld.sample_as_light
         sub.prop(cworld, "sample_map_resolution")
-        if cscene.progressive == 'BRANCHED_PATH':
+        if use_branched_path(context):
             sub.prop(cworld, "samples")
 
         col = split.column()
@@ -1032,7 +1140,7 @@ class CyclesMaterial_PT_settings(CyclesButtonsPanel, Panel):
         sub = col.column()
         sub.active = use_cpu(context)
         sub.prop(cmat, "volume_sampling", text="")
-        col.prop(cmat, "volume_interpolation", text="")
+        sub.prop(cmat, "volume_interpolation", text="")
         col.prop(cmat, "homogeneous_volume", text="Homogeneous")
 
         layout.separator()
@@ -1344,10 +1452,17 @@ class CyclesScene_PT_simplify(CyclesButtonsPanel, Panel):
         rd = context.scene.render
 
         layout.active = rd.use_simplify
+        split = layout.split()
 
-        row = layout.row()
-        row.prop(rd, "simplify_subdivision", text="Subdivision")
-        row.prop(rd, "simplify_child_particles", text="Child Particles")
+        col = split.column()
+        col.label(text="Viewport:")
+        col.prop(rd, "simplify_subdivision", text="Subdivision")
+        col.prop(rd, "simplify_child_particles", text="Child Particles")
+
+        col = split.column()
+        col.label(text="Render:")
+        col.prop(rd, "simplify_subdivision_render", text="Subdivision")
+        col.prop(rd, "simplify_child_particles_render", text="Child Particles")
 
 
 def draw_device(self, context):
@@ -1419,6 +1534,7 @@ def get_panels():
         "DATA_PT_vertex_colors",
         "DATA_PT_camera",
         "DATA_PT_camera_display",
+        "DATA_PT_camera_stereoscopy",
         "DATA_PT_camera_safe_areas",
         "DATA_PT_lens",
         "DATA_PT_speaker",
diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp
index dee9ee09fc6..90b42ea4ec2 100644
--- a/intern/cycles/blender/blender_camera.cpp
+++ b/intern/cycles/blender/blender_camera.cpp
@@ -72,7 +72,7 @@ struct BlenderCamera {
 	Transform matrix;
 };
 
-static void blender_camera_init(BlenderCamera *bcam, BL::RenderSettings b_render, BL::Scene b_scene)
+static void blender_camera_init(BlenderCamera *bcam, BL::RenderSettings b_render)
 {
 	memset(bcam, 0, sizeof(BlenderCamera));
 
@@ -95,7 +95,7 @@ static void blender_camera_init(BlenderCamera *bcam, BL::RenderSettings b_render
 	bcam->full_height = render_resolution_y(b_render);
 }
 
-static float blender_camera_focal_distance(BL::Object b_ob, BL::Camera b_camera)
+static float blender_camera_focal_distance(BL::RenderEngine b_engine, BL::Object b_ob, BL::Camera b_camera)
 {
 	BL::Object b_dof_object = b_camera.dof_object();
 
@@ -103,14 +103,16 @@ static float blender_camera_focal_distance(BL::Object b_ob, BL::Camera b_camera)
 		return b_camera.dof_distance();
 	
 	/* for dof object, return distance along camera Z direction */
-	Transform obmat = transform_clear_scale(get_transform(b_ob.matrix_world()));
+	BL::Array<float, 16> b_ob_matrix;
+	b_engine.camera_model_matrix(b_ob, b_ob_matrix);
+	Transform obmat = get_transform(b_ob_matrix);
 	Transform dofmat = get_transform(b_dof_object.matrix_world());
 	Transform mat = transform_inverse(obmat) * dofmat;
 
 	return fabsf(transform_get_column(&mat, 3).z);
 }
 
-static void blender_camera_from_object(BlenderCamera *bcam, BL::Object b_ob, bool skip_panorama = false)
+static void blender_camera_from_object(BlenderCamera *bcam, BL::RenderEngine b_engine, BL::Object b_ob, bool skip_panorama = false)
 {
 	BL::ID b_ob_data = b_ob.data();
 
@@ -146,6 +148,9 @@ static void blender_camera_from_object(BlenderCamera *bcam, BL::Object b_ob, boo
 			case 2:
 				bcam->panorama_type = PANORAMA_FISHEYE_EQUISOLID;
 				break;
+			case 3:
+				bcam->panorama_type = PANORAMA_MIRRORBALL;
+				break;
 			case 0:
 			default:
 				bcam->panorama_type = PANORAMA_EQUIRECTANGULAR;
@@ -181,10 +186,10 @@ static void blender_camera_from_object(BlenderCamera *bcam, BL::Object b_ob, boo
 
 		bcam->apertureblades = RNA_int_get(&ccamera, "aperture_blades");
 		bcam->aperturerotation = RNA_float_get(&ccamera, "aperture_rotation");
-		bcam->focaldistance = blender_camera_focal_distance(b_ob, b_camera);
+		bcam->focaldistance = blender_camera_focal_distance(b_engine, b_ob, b_camera);
 		bcam->aperture_ratio = RNA_float_get(&ccamera, "aperture_ratio");
 
-		bcam->shift.x = b_camera.shift_x();
+		bcam->shift.x = b_engine.camera_shift_x(b_ob);
 		bcam->shift.y = b_camera.shift_y();
 
 		bcam->sensor_width = b_camera.sensor_width();
@@ -202,19 +207,34 @@ static void blender_camera_from_object(BlenderCamera *bcam, BL::Object b_ob, boo
 	}
 }
 
-static Transform blender_camera_matrix(const Transform& tfm, CameraType type)
+static Transform blender_camera_matrix(const Transform& tfm,
+                                       const CameraType type,
+                                       const PanoramaType panorama_type)
 {
 	Transform result;
 
 	if(type == CAMERA_PANORAMA) {
-		/* make it so environment camera needs to be pointed in the direction
-		 * of the positive x-axis to match an environment texture, this way
-		 * it is looking at the center of the texture */
-		result = tfm *
-			make_transform( 0.0f, -1.0f, 0.0f, 0.0f,
-			                0.0f,  0.0f, 1.0f, 0.0f,
-			               -1.0f,  0.0f, 0.0f, 0.0f,
-			                0.0f,  0.0f, 0.0f, 1.0f);
+		if(panorama_type == PANORAMA_MIRRORBALL) {
+			/* Mirror ball camera is looking into the negative Y direction
+			 * which matches texture mirror ball mapping.
+			 */
+			result = tfm *
+				make_transform(1.0f, 0.0f, 0.0f, 0.0f,
+				               0.0f, 0.0f, 1.0f, 0.0f,
+				               0.0f, 1.0f, 0.0f, 0.0f,
+				               0.0f, 0.0f, 0.0f, 1.0f);
+		}
+		else {
+			/* Make it so environment camera needs to be pointed in the direction
+			 * of the positive x-axis to match an environment texture, this way
+			 * it is looking at the center of the texture
+			 */
+			result = tfm *
+				make_transform( 0.0f, -1.0f, 0.0f, 0.0f,
+				                0.0f,  0.0f, 1.0f, 0.0f,
+				               -1.0f,  0.0f, 0.0f, 0.0f,
+				                0.0f,  0.0f, 0.0f, 1.0f);
+		}
 	}
 	else {
 		/* note the blender camera points along the negative z-axis */
@@ -303,7 +323,7 @@ static void blender_camera_sync(Camera *cam, BlenderCamera *bcam, int width, int
 		&cam->viewplane, &aspectratio, &sensor_size);
 
 	/* panorama sensor */
-	if (bcam->type == CAMERA_PANORAMA && bcam->panorama_type == PANORAMA_FISHEYE_EQUISOLID) {
+	if(bcam->type == CAMERA_PANORAMA && bcam->panorama_type == PANORAMA_FISHEYE_EQUISOLID) {
 		float fit_xratio = (float)bcam->full_width*bcam->pixelaspect.x;
 		float fit_yratio = (float)bcam->full_height*bcam->pixelaspect.y;
 		bool horizontal_fit;
@@ -360,7 +380,9 @@ static void blender_camera_sync(Camera *cam, BlenderCamera *bcam, int width, int
 	cam->bladesrotation = bcam->aperturerotation;
 
 	/* transform */
-	cam->matrix = blender_camera_matrix(bcam->matrix, bcam->type);
+	cam->matrix = blender_camera_matrix(bcam->matrix,
+	                                    bcam->type,
+	                                    bcam->panorama_type);
 	cam->motion.pre = cam->matrix;
 	cam->motion.post = cam->matrix;
 	cam->use_motion = false;
@@ -380,7 +402,7 @@ static void blender_camera_sync(Camera *cam, BlenderCamera *bcam, int width, int
 void BlenderSync::sync_camera(BL::RenderSettings b_render, BL::Object b_override, int width, int height)
 {
 	BlenderCamera bcam;
-	blender_camera_init(&bcam, b_render, b_scene);
+	blender_camera_init(&bcam, b_render);
 
 	/* pixel aspect */
 	bcam.pixelaspect.x = b_render.pixel_aspect_x();
@@ -402,8 +424,10 @@ void BlenderSync::sync_camera(BL::RenderSettings b_render, BL::Object b_override
 		b_ob = b_override;
 
 	if(b_ob) {
-		blender_camera_from_object(&bcam, b_ob);
-		bcam.matrix = get_transform(b_ob.matrix_world());
+		BL::Array<float, 16> b_ob_matrix;
+		blender_camera_from_object(&bcam, b_engine, b_ob);
+		b_engine.camera_model_matrix(b_ob, b_ob_matrix);
+		bcam.matrix = get_transform(b_ob_matrix);
 	}
 
 	/* sync */
@@ -414,9 +438,10 @@ void BlenderSync::sync_camera(BL::RenderSettings b_render, BL::Object b_override
 void BlenderSync::sync_camera_motion(BL::Object b_ob, float motion_time)
 {
 	Camera *cam = scene->camera;
-
-	Transform tfm = get_transform(b_ob.matrix_world());
-	tfm = blender_camera_matrix(tfm, cam->type);
+	BL::Array<float, 16> b_ob_matrix;
+	b_engine.camera_model_matrix(b_ob, b_ob_matrix);
+	Transform tfm = get_transform(b_ob_matrix);
+	tfm = blender_camera_matrix(tfm, cam->type, cam->panorama_type);
 
 	if(tfm != cam->matrix) {
 		VLOG(1) << "Camera " << b_ob.name() << " motion detected.";
@@ -433,10 +458,10 @@ void BlenderSync::sync_camera_motion(BL::Object b_ob, float motion_time)
 
 /* Sync 3D View Camera */
 
-static void blender_camera_view_subset(BL::RenderSettings b_render, BL::Scene b_scene, BL::Object b_ob, BL::SpaceView3D b_v3d,
+static void blender_camera_view_subset(BL::RenderEngine b_engine, BL::RenderSettings b_render, BL::Scene b_scene, BL::Object b_ob, BL::SpaceView3D b_v3d,
 	BL::RegionView3D b_rv3d, int width, int height, BoundBox2D *view_box, BoundBox2D *cam_box);
 
-static void blender_camera_from_view(BlenderCamera *bcam, BL::Scene b_scene, BL::SpaceView3D b_v3d, BL::RegionView3D b_rv3d, int width, int height, bool skip_panorama = false)
+static void blender_camera_from_view(BlenderCamera *bcam, BL::RenderEngine b_engine, BL::Scene b_scene, BL::SpaceView3D b_v3d, BL::RegionView3D b_rv3d, int width, int height, bool skip_panorama = false)
 {
 	/* 3d view parameters */
 	bcam->nearclip = b_v3d.clip_start();
@@ -449,13 +474,13 @@ static void blender_camera_from_view(BlenderCamera *bcam, BL::Scene b_scene, BL:
 		BL::Object b_ob = (b_v3d.lock_camera_and_layers())? b_scene.camera(): b_v3d.camera();
 
 		if(b_ob) {
-			blender_camera_from_object(bcam, b_ob, skip_panorama);
+			blender_camera_from_object(bcam, b_engine, b_ob, skip_panorama);
 
 			if(!skip_panorama && bcam->type == CAMERA_PANORAMA) {
 				/* in panorama camera view, we map viewplane to camera border */
 				BoundBox2D view_box, cam_box;
 
-				blender_camera_view_subset(b_scene.render(), b_scene, b_ob, b_v3d, b_rv3d, width, height,
+				blender_camera_view_subset(b_engine, b_scene.render(), b_scene, b_ob, b_v3d, b_rv3d, width, height,
 					&view_box, &cam_box);
 
 				bcam->pano_viewplane = view_box.make_relative_to(cam_box);
@@ -493,7 +518,7 @@ static void blender_camera_from_view(BlenderCamera *bcam, BL::Scene b_scene, BL:
 	bcam->matrix = transform_inverse(get_transform(b_rv3d.view_matrix()));
 }
 
-static void blender_camera_view_subset(BL::RenderSettings b_render, BL::Scene b_scene, BL::Object b_ob, BL::SpaceView3D b_v3d,
+static void blender_camera_view_subset(BL::RenderEngine b_engine, BL::RenderSettings b_render, BL::Scene b_scene, BL::Object b_ob, BL::SpaceView3D b_v3d,
 	BL::RegionView3D b_rv3d, int width, int height, BoundBox2D *view_box, BoundBox2D *cam_box)
 {
 	BoundBox2D cam, view;
@@ -501,16 +526,16 @@ static void blender_camera_view_subset(BL::RenderSettings b_render, BL::Scene b_
 
 	/* get viewport viewplane */
 	BlenderCamera view_bcam;
-	blender_camera_init(&view_bcam, b_render, b_scene);
-	blender_camera_from_view(&view_bcam, b_scene, b_v3d, b_rv3d, width, height, true);
+	blender_camera_init(&view_bcam, b_render);
+	blender_camera_from_view(&view_bcam, b_engine, b_scene, b_v3d, b_rv3d, width, height, true);
 
 	blender_camera_viewplane(&view_bcam, width, height,
 		&view, &view_aspect, &sensor_size);
 
 	/* get camera viewplane */
 	BlenderCamera cam_bcam;
-	blender_camera_init(&cam_bcam, b_render, b_scene);
-	blender_camera_from_object(&cam_bcam, b_ob, true);
+	blender_camera_init(&cam_bcam, b_render);
+	blender_camera_from_object(&cam_bcam, b_engine, b_ob, true);
 
 	blender_camera_viewplane(&cam_bcam, cam_bcam.full_width, cam_bcam.full_height,
 		&cam, &cam_aspect, &sensor_size);
@@ -520,7 +545,8 @@ static void blender_camera_view_subset(BL::RenderSettings b_render, BL::Scene b_
 	*cam_box = cam * (1.0f/cam_aspect);
 }
 
-static void blender_camera_border_subset(BL::RenderSettings b_render,
+static void blender_camera_border_subset(BL::RenderEngine b_engine,
+                                         BL::RenderSettings b_render,
                                          BL::Scene b_scene,
                                          BL::SpaceView3D b_v3d,
                                          BL::RegionView3D b_rv3d,
@@ -531,7 +557,7 @@ static void blender_camera_border_subset(BL::RenderSettings b_render,
 {
 	/* Determine camera viewport subset. */
 	BoundBox2D view_box, cam_box;
-	blender_camera_view_subset(b_render, b_scene, b_ob, b_v3d, b_rv3d, width, height,
+	blender_camera_view_subset(b_engine, b_render, b_scene, b_ob, b_v3d, b_rv3d, width, height,
 	                           &view_box, &cam_box);
 
 	/* Determine viewport subset matching given border. */
@@ -539,7 +565,7 @@ static void blender_camera_border_subset(BL::RenderSettings b_render,
 	*result = cam_box.subset(border);
 }
 
-static void blender_camera_border(BlenderCamera *bcam, BL::RenderSettings b_render, BL::Scene b_scene, BL::SpaceView3D b_v3d,
+static void blender_camera_border(BlenderCamera *bcam, BL::RenderEngine b_engine, BL::RenderSettings b_render, BL::Scene b_scene, BL::SpaceView3D b_v3d,
 	BL::RegionView3D b_rv3d, int width, int height)
 {
 	bool is_camera_view;
@@ -568,7 +594,8 @@ static void blender_camera_border(BlenderCamera *bcam, BL::RenderSettings b_rend
 
 	/* Determine camera border inside the viewport. */
 	BoundBox2D full_border;
-	blender_camera_border_subset(b_render,
+	blender_camera_border_subset(b_engine,
+	                             b_render,
 	                             b_scene,
 	                             b_v3d,
 	                             b_rv3d,
@@ -587,7 +614,8 @@ static void blender_camera_border(BlenderCamera *bcam, BL::RenderSettings b_rend
 	bcam->border.top = b_render.border_max_y();
 
 	/* Determine viewport subset matching camera border. */
-	blender_camera_border_subset(b_render,
+	blender_camera_border_subset(b_engine,
+	                             b_render,
 	                             b_scene,
 	                             b_v3d,
 	                             b_rv3d,
@@ -601,14 +629,14 @@ static void blender_camera_border(BlenderCamera *bcam, BL::RenderSettings b_rend
 void BlenderSync::sync_view(BL::SpaceView3D b_v3d, BL::RegionView3D b_rv3d, int width, int height)
 {
 	BlenderCamera bcam;
-	blender_camera_init(&bcam, b_scene.render(), b_scene);
-	blender_camera_from_view(&bcam, b_scene, b_v3d, b_rv3d, width, height);
-	blender_camera_border(&bcam, b_scene.render(), b_scene, b_v3d, b_rv3d, width, height);
+	blender_camera_init(&bcam, b_scene.render());
+	blender_camera_from_view(&bcam, b_engine, b_scene, b_v3d, b_rv3d, width, height);
+	blender_camera_border(&bcam, b_engine, b_scene.render(), b_scene, b_v3d, b_rv3d, width, height);
 
 	blender_camera_sync(scene->camera, &bcam, width, height);
 }
 
-BufferParams BlenderSync::get_buffer_params(BL::RenderSettings b_render, BL::Scene b_scene, BL::SpaceView3D b_v3d, BL::RegionView3D b_rv3d, Camera *cam, int width, int height)
+BufferParams BlenderSync::get_buffer_params(BL::RenderSettings b_render, BL::SpaceView3D b_v3d, BL::RegionView3D b_rv3d, Camera *cam, int width, int height)
 {
 	BufferParams params;
 	bool use_border = false;
diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp
index 72023d7e69b..dba801fc4df 100644
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -44,8 +44,8 @@ void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData);
 void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
                                float3 RotCam, bool is_ortho);
 void ExportCurveTriangleGeometry(Mesh *mesh, ParticleCurveData *CData, int resolution);
-void ExportCurveTriangleUV(Mesh *mesh, ParticleCurveData *CData, int vert_offset, int resol, float3 *uvdata);
-void ExportCurveTriangleVcol(Mesh *mesh, ParticleCurveData *CData, int vert_offset, int resol, uchar4 *cdata);
+void ExportCurveTriangleUV(ParticleCurveData *CData, int vert_offset, int resol, float3 *uvdata);
+void ExportCurveTriangleVcol(ParticleCurveData *CData, int vert_offset, int resol, uchar4 *cdata);
 
 ParticleCurveData::ParticleCurveData()
 {
@@ -152,7 +152,7 @@ bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par
 					continue;
 
 				int ren_step = (1 << draw_step) + 1;
-				if (b_part.kink() == BL::ParticleSettings::kink_SPIRAL)
+				if(b_part.kink() == BL::ParticleSettings::kink_SPIRAL)
 					ren_step += b_part.kink_extra_steps();
 
 				PointerRNA cpsys = RNA_pointer_get(&b_part.ptr, "cycles");
@@ -233,10 +233,10 @@ bool ObtainCacheParticleUV(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Parti
 				int totchild = background ? b_psys.child_particles.length() : (int)((float)b_psys.child_particles.length() * (float)b_part.draw_percentage() / 100.0f);
 				int totcurves = totchild;
 				
-				if (b_part.child_type() == 0)
+				if(b_part.child_type() == 0)
 					totcurves += totparts;
 
-				if (totcurves == 0)
+				if(totcurves == 0)
 					continue;
 
 				int pa_no = 0;
@@ -287,10 +287,10 @@ bool ObtainCacheParticleVcol(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par
 				int totchild = background ? b_psys.child_particles.length() : (int)((float)b_psys.child_particles.length() * (float)b_part.draw_percentage() / 100.0f);
 				int totcurves = totchild;
 				
-				if (b_part.child_type() == 0)
+				if(b_part.child_type() == 0)
 					totcurves += totparts;
 
-				if (totcurves == 0)
+				if(totcurves == 0)
 					continue;
 
 				int pa_no = 0;
@@ -322,11 +322,11 @@ bool ObtainCacheParticleVcol(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par
 	return true;
 }
 
-static void set_resolution(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, BL::Scene *scene, bool render)
+static void set_resolution(BL::Object *b_ob, BL::Scene *scene, bool render)
 {
 	BL::Object::modifiers_iterator b_mod;
 	for(b_ob->modifiers.begin(b_mod); b_mod != b_ob->modifiers.end(); ++b_mod) {
-		if ((b_mod->type() == b_mod->type_PARTICLE_SYSTEM) && ((b_mod->show_viewport()) || (b_mod->show_render()))) {
+		if((b_mod->type() == b_mod->type_PARTICLE_SYSTEM) && ((b_mod->show_viewport()) || (b_mod->show_render()))) {
 			BL::ParticleSystemModifier psmd((const PointerRNA)b_mod->ptr);
 			BL::ParticleSystem b_psys((const PointerRNA)psmd.particle_system().ptr);
 			b_psys.set_resolution(*scene, *b_ob, (render)? 2: 1);
@@ -513,7 +513,7 @@ void ExportCurveTriangleGeometry(Mesh *mesh, ParticleCurveData *CData, int resol
 
 				ybasis = normalize(cross(xbasis, v2));
 
-				for (; subv <= 1; subv++) {
+				for(; subv <= 1; subv++) {
 					float3 ickey_loc = make_float3(0.0f,0.0f,0.0f);
 					float time = 0.0f;
 
@@ -581,7 +581,7 @@ void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
 		}
 	}
 
-	if (num_curves > 0) {
+	if(num_curves > 0) {
 		VLOG(1) << "Exporting curve segments for mesh " << mesh->name;
 	}
 
@@ -621,7 +621,7 @@ void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
 	}
 
 	/* check allocation */
-	if((mesh->curve_keys.size() !=  num_keys) || (mesh->curves.size() !=  num_curves)) {
+	if((mesh->curve_keys.size() != num_keys) || (mesh->curves.size() != num_curves)) {
 		VLOG(1) << "Allocation failed, clearing data";
 		mesh->curve_keys.clear();
 		mesh->curves.clear();
@@ -629,7 +629,7 @@ void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
 	}
 }
 
-static void ExportCurveSegmentsMotion(Scene *scene, Mesh *mesh, ParticleCurveData *CData, int time_index)
+static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int time_index)
 {
 	VLOG(1) << "Exporting curve motion segments for mesh " << mesh->name
 	        << ", time index " << time_index;
@@ -705,7 +705,7 @@ static void ExportCurveSegmentsMotion(Scene *scene, Mesh *mesh, ParticleCurveDat
 	}
 }
 
-void ExportCurveTriangleUV(Mesh *mesh, ParticleCurveData *CData, int vert_offset, int resol, float3 *uvdata)
+void ExportCurveTriangleUV(ParticleCurveData *CData, int vert_offset, int resol, float3 *uvdata)
 {
 	if(uvdata == NULL)
 		return;
@@ -750,7 +750,7 @@ void ExportCurveTriangleUV(Mesh *mesh, ParticleCurveData *CData, int vert_offset
 	}
 }
 
-void ExportCurveTriangleVcol(Mesh *mesh, ParticleCurveData *CData, int vert_offset, int resol, uchar4 *cdata)
+void ExportCurveTriangleVcol(ParticleCurveData *CData, int vert_offset, int resol, uchar4 *cdata)
 {
 	if(cdata == NULL)
 		return;
@@ -886,7 +886,7 @@ void BlenderSync::sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, bool
 	ParticleCurveData CData;
 
 	if(!preview)
-		set_resolution(mesh, &b_mesh, &b_ob, &b_scene, true);
+		set_resolution(&b_ob, &b_scene, true);
 
 	ObtainCacheParticleData(mesh, &b_mesh, &b_ob, &CData, !preview);
 
@@ -917,7 +917,7 @@ void BlenderSync::sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, bool
 	}
 	else {
 		if(motion)
-			ExportCurveSegmentsMotion(scene, mesh, &CData, time_index);
+			ExportCurveSegmentsMotion(mesh, &CData, time_index);
 		else
 			ExportCurveSegments(scene, mesh, &CData);
 	}
@@ -966,7 +966,7 @@ void BlenderSync::sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, bool
 
 				uchar4 *cdata = attr_vcol->data_uchar4();
 
-				ExportCurveTriangleVcol(mesh, &CData, tri_num * 3, used_res, cdata);
+				ExportCurveTriangleVcol(&CData, tri_num * 3, used_res, cdata);
 			}
 			else {
 				Attribute *attr_vcol = mesh->curve_attributes.add(
@@ -1009,7 +1009,7 @@ void BlenderSync::sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, bool
 
 					float3 *uv = attr_uv->data_float3();
 
-					ExportCurveTriangleUV(mesh, &CData, tri_num * 3, used_res, uv);
+					ExportCurveTriangleUV(&CData, tri_num * 3, used_res, uv);
 				}
 				else {
 					if(active_render)
@@ -1032,7 +1032,7 @@ void BlenderSync::sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, bool
 	}
 
 	if(!preview)
-		set_resolution(mesh, &b_mesh, &b_ob, &b_scene, false);
+		set_resolution(&b_ob, &b_scene, false);
 
 	mesh->compute_bounds();
 }
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index c70ffea39c3..d88ebb854d2 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -302,7 +302,7 @@ static void attr_create_uv_map(Scene *scene,
                                BL::Mesh b_mesh,
                                const vector<int>& nverts)
 {
-	if (b_mesh.tessface_uv_textures.length() != 0) {
+	if(b_mesh.tessface_uv_textures.length() != 0) {
 		BL::Mesh::tessface_uv_textures_iterator l;
 
 		for(b_mesh.tessface_uv_textures.begin(l); l != b_mesh.tessface_uv_textures.end(); ++l) {
@@ -324,15 +324,15 @@ static void attr_create_uv_map(Scene *scene,
 				size_t i = 0;
 
 				for(l->data.begin(t); t != l->data.end(); ++t, ++i) {
-					fdata[0] =  get_float3(t->uv1());
-					fdata[1] =  get_float3(t->uv2());
-					fdata[2] =  get_float3(t->uv3());
+					fdata[0] = get_float3(t->uv1());
+					fdata[1] = get_float3(t->uv2());
+					fdata[2] = get_float3(t->uv3());
 					fdata += 3;
 
 					if(nverts[i] == 4) {
-						fdata[0] =  get_float3(t->uv1());
-						fdata[1] =  get_float3(t->uv3());
-						fdata[2] =  get_float3(t->uv4());
+						fdata[0] = get_float3(t->uv1());
+						fdata[1] = get_float3(t->uv3());
+						fdata[2] = get_float3(t->uv4());
 						fdata += 3;
 					}
 				}
@@ -621,7 +621,13 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri
 	}
 	
 	/* test if we need to sync */
-	bool use_mesh_geometry = render_layer.use_surfaces || render_layer.use_hair;
+	int requested_geometry_flags = Mesh::GEOMETRY_NONE;
+	if(render_layer.use_surfaces) {
+		requested_geometry_flags |= Mesh::GEOMETRY_TRIANGLES;
+	}
+	if(render_layer.use_hair) {
+		requested_geometry_flags |= Mesh::GEOMETRY_CURVES;
+	}
 	Mesh *mesh;
 
 	if(!mesh_map.sync(&mesh, key)) {
@@ -630,7 +636,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri
 		/* test if shaders changed, these can be object level so mesh
 		 * does not get tagged for recalc */
 		else if(mesh->used_shaders != used_shaders);
-		else if(use_mesh_geometry != mesh->geometry_synced);
+		else if(requested_geometry_flags != mesh->geometry_flags);
 		else {
 			/* even if not tagged for recalc, we may need to sync anyway
 			 * because the shader needs different mesh attributes */
@@ -664,7 +670,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri
 	mesh->used_shaders = used_shaders;
 	mesh->name = ustring(b_ob_data.name().c_str());
 
-	if(use_mesh_geometry) {
+	if(requested_geometry_flags != Mesh::GEOMETRY_NONE) {
 		/* mesh objects does have special handle in the dependency graph,
 		 * they're ensured to have properly updated.
 		 *
@@ -697,8 +703,8 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri
 			/* free derived mesh */
 			b_data.meshes.remove(b_mesh);
 		}
-		mesh->geometry_synced = true;
 	}
+	mesh->geometry_flags = requested_geometry_flags;
 
 	/* displacement method */
 	if(cmesh.data) {
diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp
index e827d46223b..1e17d306fd0 100644
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -90,14 +90,17 @@ static uint object_ray_visibility(BL::Object b_ob)
 
 /* Light */
 
-void BlenderSync::sync_light(BL::Object b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::Object b_ob, Transform& tfm)
+void BlenderSync::sync_light(BL::Object b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::Object b_ob, Transform& tfm, bool *use_portal)
 {
 	/* test if we need to sync */
 	Light *light;
 	ObjectKey key(b_parent, persistent_id, b_ob);
 
-	if(!light_map.sync(&light, b_ob, b_parent, key))
+	if(!light_map.sync(&light, b_ob, b_parent, key)) {
+		if(light->is_portal)
+			*use_portal = true;
 		return;
+	}
 	
 	BL::Lamp b_lamp(b_ob.data());
 
@@ -171,6 +174,14 @@ void BlenderSync::sync_light(BL::Object b_parent, int persistent_id[OBJECT_PERSI
 
 	light->max_bounces = get_int(clamp, "max_bounces");
 
+	if(light->type == LIGHT_AREA)
+		light->is_portal = get_boolean(clamp, "is_portal");
+	else
+		light->is_portal = false;
+
+	if(light->is_portal)
+		*use_portal = true;
+
 	/* visibility */
 	uint visibility = object_ray_visibility(b_ob);
 	light->use_diffuse = (visibility & PATH_RAY_DIFFUSE) != 0;
@@ -182,7 +193,7 @@ void BlenderSync::sync_light(BL::Object b_parent, int persistent_id[OBJECT_PERSI
 	light->tag_update(scene);
 }
 
-void BlenderSync::sync_background_light()
+void BlenderSync::sync_background_light(bool use_portal)
 {
 	BL::World b_world = b_scene.world();
 
@@ -191,19 +202,20 @@ void BlenderSync::sync_background_light()
 		PointerRNA cworld = RNA_pointer_get(&b_world.ptr, "cycles");
 		bool sample_as_light = get_boolean(cworld, "sample_as_light");
 
-		if(sample_as_light) {
+		if(sample_as_light || use_portal) {
 			/* test if we need to sync */
 			Light *light;
 			ObjectKey key(b_world, 0, b_world);
 
 			if(light_map.sync(&light, b_world, b_world, key) ||
-			   world_recalc ||
-			   b_world.ptr.data != world_map)
+				world_recalc ||
+				b_world.ptr.data != world_map)
 			{
 				light->type = LIGHT_BACKGROUND;
 				light->map_resolution  = get_int(cworld, "sample_map_resolution");
 				light->shader = scene->default_background;
-				
+				light->use_mis = sample_as_light;
+
 				int samples = get_int(cworld, "samples");
 				if(get_boolean(cscene, "use_square_samples"))
 					light->samples = samples * samples;
@@ -223,7 +235,7 @@ void BlenderSync::sync_background_light()
 /* Object */
 
 Object *BlenderSync::sync_object(BL::Object b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::DupliObject b_dupli_ob,
-                                 Transform& tfm, uint layer_flag, float motion_time, bool hide_tris)
+                                 Transform& tfm, uint layer_flag, float motion_time, bool hide_tris, bool *use_portal)
 {
 	BL::Object b_ob = (b_dupli_ob ? b_dupli_ob.object() : b_parent);
 	bool motion = motion_time != 0.0f;
@@ -232,7 +244,7 @@ Object *BlenderSync::sync_object(BL::Object b_parent, int persistent_id[OBJECT_P
 	if(object_is_light(b_ob)) {
 		/* don't use lamps for excluded layers used as mask layer */
 		if(!motion && !((layer_flag & render_layer.holdout_layer) && (layer_flag & render_layer.exclude_layer)))
-			sync_light(b_parent, persistent_id, b_ob, tfm);
+			sync_light(b_parent, persistent_id, b_ob, tfm, use_portal);
 
 		return NULL;
 	}
@@ -249,7 +261,9 @@ Object *BlenderSync::sync_object(BL::Object b_parent, int persistent_id[OBJECT_P
 	if(motion) {
 		object = object_map.find(key);
 
-		if(object && (scene->need_motion() == Scene::MOTION_PASS || object_use_motion(b_ob))) {
+		if(object && (scene->need_motion() == Scene::MOTION_PASS ||
+		              object_use_motion(b_parent, b_ob)))
+		{
 			/* object transformation */
 			if(tfm != object->tfm) {
 				VLOG(1) << "Object " << b_ob.name() << " motion detected.";
@@ -330,8 +344,8 @@ Object *BlenderSync::sync_object(BL::Object b_parent, int persistent_id[OBJECT_P
 
 			mesh->use_motion_blur = false;
 
-			if(object_use_motion(b_ob)) {
-				if(object_use_deform_motion(b_ob)) {
+			if(object_use_motion(b_parent, b_ob)) {
+				if(object_use_deform_motion(b_parent, b_ob)) {
 					mesh->motion_steps = object_motion_steps(b_ob);
 					mesh->use_motion_blur = true;
 				}
@@ -356,7 +370,7 @@ Object *BlenderSync::sync_object(BL::Object b_parent, int persistent_id[OBJECT_P
 			object->random_id ^= hash_int(hash_string(b_parent.name().c_str()));
 
 		/* dupli texture coordinates */
-		if (b_dupli_ob) {
+		if(b_dupli_ob) {
 			object->dupli_generated = 0.5f*get_float3(b_dupli_ob.orco()) - make_float3(0.5f, 0.5f, 0.5f);
 			object->dupli_uv = get_float2(b_dupli_ob.uv());
 		}
@@ -476,6 +490,7 @@ void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, float motion_time)
 	int dupli_settings = preview ? 1 : 2;
 
 	bool cancel = false;
+	bool use_portal = false;
 
 	for(; b_sce && !cancel; b_sce = b_sce.background_set()) {
 		for(b_sce.object_bases.begin(b_base); b_base != b_sce.object_bases.end() && !cancel; ++b_base) {
@@ -506,7 +521,7 @@ void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, float motion_time)
 							BL::Array<int, OBJECT_PERSISTENT_ID_SIZE> persistent_id = b_dup->persistent_id();
 
 							/* sync object and mesh or light data */
-							Object *object = sync_object(b_ob, persistent_id.data, *b_dup, tfm, ob_layer, motion_time, hide_tris);
+							Object *object = sync_object(b_ob, persistent_id.data, *b_dup, tfm, ob_layer, motion_time, hide_tris, &use_portal);
 
 							/* sync possible particle data, note particle_id
 							 * starts counting at 1, first is dummy particle */
@@ -526,7 +541,7 @@ void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, float motion_time)
 				if(!object_render_hide(b_ob, true, true, hide_tris)) {
 					/* object itself */
 					Transform tfm = get_transform(b_ob.matrix_world());
-					sync_object(b_ob, NULL, PointerRNA_NULL, tfm, ob_layer, motion_time, hide_tris);
+					sync_object(b_ob, NULL, PointerRNA_NULL, tfm, ob_layer, motion_time, hide_tris, &use_portal);
 				}
 			}
 
@@ -537,7 +552,7 @@ void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, float motion_time)
 	progress.set_sync_status("");
 
 	if(!cancel && !motion) {
-		sync_background_light();
+		sync_background_light(use_portal);
 
 		/* handle removed data and modified pointers */
 		if(light_map.post_sync())
diff --git a/intern/cycles/blender/blender_particles.cpp b/intern/cycles/blender/blender_particles.cpp
index 2785cfa9634..6d799e6e10e 100644
--- a/intern/cycles/blender/blender_particles.cpp
+++ b/intern/cycles/blender/blender_particles.cpp
@@ -76,7 +76,7 @@ bool BlenderSync::sync_dupli_particle(BL::Object b_ob, BL::DupliObject b_dup, Ob
 
 	psys->particles.push_back(pa);
 
-	if (object->particle_index != psys->particles.size() - 1)
+	if(object->particle_index != psys->particles.size() - 1)
 		scene->object_manager->tag_update(scene);
 	object->particle_system = psys;
 	object->particle_index = psys->particles.size() - 1;
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 292af14c63a..200003fbf70 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -25,6 +25,7 @@
 #include "util_md5.h"
 #include "util_opengl.h"
 #include "util_path.h"
+#include "util_types.h"
 
 #ifdef WITH_OSL
 #include "osl.h"
@@ -70,7 +71,7 @@ static const char *PyC_UnicodeAsByte(PyObject *py_str, PyObject **coerce)
 	return "";
 }
 
-static PyObject *init_func(PyObject *self, PyObject *args)
+static PyObject *init_func(PyObject * /*self*/, PyObject *args)
 {
 	PyObject *path, *user_path;
 	int headless;
@@ -90,7 +91,7 @@ static PyObject *init_func(PyObject *self, PyObject *args)
 	Py_RETURN_NONE;
 }
 
-static PyObject *create_func(PyObject *self, PyObject *args)
+static PyObject *create_func(PyObject * /*self*/, PyObject *args)
 {
 	PyObject *pyengine, *pyuserpref, *pydata, *pyscene, *pyregion, *pyv3d, *pyrv3d;
 	int preview_osl;
@@ -162,14 +163,14 @@ static PyObject *create_func(PyObject *self, PyObject *args)
 	return PyLong_FromVoidPtr(session);
 }
 
-static PyObject *free_func(PyObject *self, PyObject *value)
+static PyObject *free_func(PyObject * /*self*/, PyObject *value)
 {
 	delete (BlenderSession*)PyLong_AsVoidPtr(value);
 
 	Py_RETURN_NONE;
 }
 
-static PyObject *render_func(PyObject *self, PyObject *value)
+static PyObject *render_func(PyObject * /*self*/, PyObject *value)
 {
 	BlenderSession *session = (BlenderSession*)PyLong_AsVoidPtr(value);
 
@@ -183,14 +184,14 @@ static PyObject *render_func(PyObject *self, PyObject *value)
 }
 
 /* pixel_array and result passed as pointers */
-static PyObject *bake_func(PyObject *self, PyObject *args)
+static PyObject *bake_func(PyObject * /*self*/, PyObject *args)
 {
 	PyObject *pysession, *pyobject;
 	PyObject *pypixel_array, *pyresult;
 	const char *pass_type;
-	int num_pixels, depth;
+	int num_pixels, depth, object_id;
 
-	if(!PyArg_ParseTuple(args, "OOsOiiO", &pysession, &pyobject, &pass_type, &pypixel_array,  &num_pixels, &depth, &pyresult))
+	if(!PyArg_ParseTuple(args, "OOsiOiiO", &pysession, &pyobject, &pass_type, &object_id, &pypixel_array, &num_pixels, &depth, &pyresult))
 		return NULL;
 
 	BlenderSession *session = (BlenderSession*)PyLong_AsVoidPtr(pysession);
@@ -207,14 +208,14 @@ static PyObject *bake_func(PyObject *self, PyObject *args)
 
 	python_thread_state_save(&session->python_thread_state);
 
-	session->bake(b_object, pass_type, b_bake_pixel, (size_t)num_pixels, depth, (float *)b_result);
+	session->bake(b_object, pass_type, object_id, b_bake_pixel, (size_t)num_pixels, depth, (float *)b_result);
 
 	python_thread_state_restore(&session->python_thread_state);
 
 	Py_RETURN_NONE;
 }
 
-static PyObject *draw_func(PyObject *self, PyObject *args)
+static PyObject *draw_func(PyObject * /*self*/, PyObject *args)
 {
 	PyObject *pysession, *pyv3d, *pyrv3d;
 
@@ -234,7 +235,7 @@ static PyObject *draw_func(PyObject *self, PyObject *args)
 	Py_RETURN_NONE;
 }
 
-static PyObject *reset_func(PyObject *self, PyObject *args)
+static PyObject *reset_func(PyObject * /*self*/, PyObject *args)
 {
 	PyObject *pysession, *pydata, *pyscene;
 
@@ -260,7 +261,7 @@ static PyObject *reset_func(PyObject *self, PyObject *args)
 	Py_RETURN_NONE;
 }
 
-static PyObject *sync_func(PyObject *self, PyObject *value)
+static PyObject *sync_func(PyObject * /*self*/, PyObject *value)
 {
 	BlenderSession *session = (BlenderSession*)PyLong_AsVoidPtr(value);
 
@@ -273,7 +274,7 @@ static PyObject *sync_func(PyObject *self, PyObject *value)
 	Py_RETURN_NONE;
 }
 
-static PyObject *available_devices_func(PyObject *self, PyObject *args)
+static PyObject *available_devices_func(PyObject * /*self*/, PyObject * /*args*/)
 {
 	vector<DeviceInfo>& devices = Device::available_devices();
 	PyObject *ret = PyTuple_New(devices.size());
@@ -288,7 +289,7 @@ static PyObject *available_devices_func(PyObject *self, PyObject *args)
 
 #ifdef WITH_OSL
 
-static PyObject *osl_update_node_func(PyObject *self, PyObject *args)
+static PyObject *osl_update_node_func(PyObject * /*self*/, PyObject *args)
 {
 	PyObject *pynodegroup, *pynode;
 	const char *filepath = NULL;
@@ -390,7 +391,7 @@ static PyObject *osl_update_node_func(PyObject *self, PyObject *args)
 
 		/* find socket socket */
 		BL::NodeSocket b_sock(PointerRNA_NULL);
-		if (param->isoutput) {
+		if(param->isoutput) {
 			b_sock = b_node.outputs[param->name.string()];
 			/* remove if type no longer matches */
 			if(b_sock && b_sock.bl_idname() != socket_type) {
@@ -444,7 +445,7 @@ static PyObject *osl_update_node_func(PyObject *self, PyObject *args)
 
 		removed = false;
 
-		for (b_node.inputs.begin(b_input); b_input != b_node.inputs.end(); ++b_input) {
+		for(b_node.inputs.begin(b_input); b_input != b_node.inputs.end(); ++b_input) {
 			if(used_sockets.find(b_input->ptr.data) == used_sockets.end()) {
 				b_node.inputs.remove(*b_input);
 				removed = true;
@@ -452,7 +453,7 @@ static PyObject *osl_update_node_func(PyObject *self, PyObject *args)
 			}
 		}
 
-		for (b_node.outputs.begin(b_output); b_output != b_node.outputs.end(); ++b_output) {
+		for(b_node.outputs.begin(b_output); b_output != b_node.outputs.end(); ++b_output) {
 			if(used_sockets.find(b_output->ptr.data) == used_sockets.end()) {
 				b_node.outputs.remove(*b_output);
 				removed = true;
@@ -464,7 +465,7 @@ static PyObject *osl_update_node_func(PyObject *self, PyObject *args)
 	Py_RETURN_TRUE;
 }
 
-static PyObject *osl_compile_func(PyObject *self, PyObject *args)
+static PyObject *osl_compile_func(PyObject * /*self*/, PyObject *args)
 {
 	const char *inputfile = NULL, *outputfile = NULL;
 
@@ -479,7 +480,7 @@ static PyObject *osl_compile_func(PyObject *self, PyObject *args)
 }
 #endif
 
-static PyObject *system_info_func(PyObject *self, PyObject *value)
+static PyObject *system_info_func(PyObject * /*self*/, PyObject * /*value*/)
 {
 	string system_info = Device::device_capabilities();
 	return PyUnicode_FromString(system_info.c_str());
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index e61203d807a..b82289e007b 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -32,6 +32,7 @@
 #include "util_color.h"
 #include "util_foreach.h"
 #include "util_function.h"
+#include "util_logging.h"
 #include "util_progress.h"
 #include "util_time.h"
 
@@ -133,7 +134,7 @@ void BlenderSession::create_session()
 	}
 
 	/* set buffer parameters */
-	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height);
+	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height);
 	session->reset(buffer_params, session_params.samples);
 
 	b_engine.use_highlight_tiles(session_params.progressive_refine == false);
@@ -186,7 +187,7 @@ void BlenderSession::reset_session(BL::BlendData b_data_, BL::Scene b_scene_)
 	sync->sync_integrator();
 	sync->sync_camera(b_render, b_engine.camera_override(), width, height);
 
-	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, PointerRNA_NULL, PointerRNA_NULL, scene->camera, width, height);
+	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, PointerRNA_NULL, PointerRNA_NULL, scene->camera, width, height);
 	session->reset(buffer_params, session_params.samples);
 
 	b_engine.use_highlight_tiles(session_params.progressive_refine == false);
@@ -271,6 +272,10 @@ static PassType get_pass_type(BL::RenderPass b_pass)
 		{
 			if(b_pass.debug_type() == BL::RenderPass::debug_type_BVH_TRAVERSAL_STEPS)
 				return PASS_BVH_TRAVERSAL_STEPS;
+			if(b_pass.debug_type() == BL::RenderPass::debug_type_BVH_TRAVERSED_INSTANCES)
+				return PASS_BVH_TRAVERSED_INSTANCES;
+			if(b_pass.debug_type() == BL::RenderPass::debug_type_RAY_BOUNCES)
+				return PASS_RAY_BOUNCES;
 			break;
 		}
 #endif
@@ -331,9 +336,9 @@ static ShaderEvalType get_shader_type(const string& pass_type)
 		return SHADER_EVAL_BAKE;
 }
 
-static BL::RenderResult begin_render_result(BL::RenderEngine b_engine, int x, int y, int w, int h, const char *layername)
+static BL::RenderResult begin_render_result(BL::RenderEngine b_engine, int x, int y, int w, int h, const char *layername, const char *viewname)
 {
-	return b_engine.begin_result(x, y, w, h, layername);
+	return b_engine.begin_result(x, y, w, h, layername, viewname);
 }
 
 static void end_render_result(BL::RenderEngine b_engine, BL::RenderResult b_rr, bool cancel, bool do_merge_results)
@@ -350,10 +355,10 @@ void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_upda
 	int h = params.height;
 
 	/* get render result */
-	BL::RenderResult b_rr = begin_render_result(b_engine, x, y, w, h, b_rlay_name.c_str());
+	BL::RenderResult b_rr = begin_render_result(b_engine, x, y, w, h, b_rlay_name.c_str(), b_rview_name.c_str());
 
 	/* can happen if the intersected rectangle gives 0 width or height */
-	if (b_rr.ptr.data == NULL) {
+	if(b_rr.ptr.data == NULL) {
 		return;
 	}
 
@@ -366,10 +371,10 @@ void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_upda
 
 	BL::RenderLayer b_rlay = *b_single_rlay;
 
-	if (do_update_only) {
+	if(do_update_only) {
 		/* update only needed */
 
-		if (rtile.sample != 0) {
+		if(rtile.sample != 0) {
 			/* sample would be zero at initial tile update, which is only needed
 			 * to tag tile form blender side as IN PROGRESS for proper highlight
 			 * no buffers should be sent to blender yet
@@ -397,7 +402,7 @@ void BlenderSession::update_render_tile(RenderTile& rtile)
 	 * be updated in blender side
 	 * would need to be investigated a bit further, but for now shall be fine
 	 */
-	if (!b_engine.is_preview())
+	if(!b_engine.is_preview())
 		do_write_update_render_tile(rtile, true);
 	else
 		do_write_update_render_tile(rtile, false);
@@ -411,17 +416,18 @@ void BlenderSession::render()
 
 	/* get buffer parameters */
 	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
-	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height);
+	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height);
 
 	/* render each layer */
 	BL::RenderSettings r = b_scene.render();
-	BL::RenderSettings::layers_iterator b_iter;
+	BL::RenderSettings::layers_iterator b_layer_iter;
+	BL::RenderResult::views_iterator b_view_iter;
 	
-	for(r.layers.begin(b_iter); b_iter != r.layers.end(); ++b_iter) {
-		b_rlay_name = b_iter->name();
+	for(r.layers.begin(b_layer_iter); b_layer_iter != r.layers.end(); ++b_layer_iter) {
+		b_rlay_name = b_layer_iter->name();
 
-		/* temporary render result to find needed passes */
-		BL::RenderResult b_rr = begin_render_result(b_engine, 0, 0, 1, 1, b_rlay_name.c_str());
+		/* temporary render result to find needed passes and views */
+		BL::RenderResult b_rr = begin_render_result(b_engine, 0, 0, 1, 1, b_rlay_name.c_str(), NULL);
 		BL::RenderResult::layers_iterator b_single_rlay;
 		b_rr.layers.begin(b_single_rlay);
 
@@ -438,6 +444,7 @@ void BlenderSession::render()
 		Pass::add(PASS_COMBINED, passes);
 #ifdef WITH_CYCLES_DEBUG
 		Pass::add(PASS_BVH_TRAVERSAL_STEPS, passes);
+		/* Pass::add(PASS_RAY_BOUNCES, passes); */
 #endif
 
 		if(session_params.device.advanced_shading) {
@@ -456,39 +463,54 @@ void BlenderSession::render()
 			}
 		}
 
-		/* free result without merging */
-		end_render_result(b_engine, b_rr, true, false);
-
 		buffer_params.passes = passes;
-		scene->film->pass_alpha_threshold = b_iter->pass_alpha_threshold();
+		scene->film->pass_alpha_threshold = b_layer_iter->pass_alpha_threshold();
 		scene->film->tag_passes_update(scene, passes);
 		scene->film->tag_update(scene);
 		scene->integrator->tag_update(scene);
 
-		/* update scene */
-		sync->sync_camera(b_render, b_engine.camera_override(), width, height);
-		sync->sync_data(b_v3d, b_engine.camera_override(), &python_thread_state, b_rlay_name.c_str());
+		for(b_rr.views.begin(b_view_iter); b_view_iter != b_rr.views.end(); ++b_view_iter) {
+			b_rview_name = b_view_iter->name();
 
-		/* update number of samples per layer */
-		int samples = sync->get_layer_samples();
-		bool bound_samples = sync->get_layer_bound_samples();
+			/* set the current view */
+			b_engine.active_view_set(b_rview_name.c_str());
 
-		if(samples != 0 && (!bound_samples || (samples < session_params.samples)))
-			session->reset(buffer_params, samples);
-		else
-			session->reset(buffer_params, session_params.samples);
+			/* update scene */
+			sync->sync_camera(b_render, b_engine.camera_override(), width, height);
+			sync->sync_data(b_v3d, b_engine.camera_override(), &python_thread_state, b_rlay_name.c_str());
 
-		/* render */
-		session->start();
-		session->wait();
+			/* update number of samples per layer */
+			int samples = sync->get_layer_samples();
+			bool bound_samples = sync->get_layer_bound_samples();
+
+			if(samples != 0 && (!bound_samples || (samples < session_params.samples)))
+				session->reset(buffer_params, samples);
+			else
+				session->reset(buffer_params, session_params.samples);
+
+			/* render */
+			session->start();
+			session->wait();
+
+			if(session->progress.get_cancel())
+				break;
+		}
+
+		/* free result without merging */
+		end_render_result(b_engine, b_rr, true, false);
 
 		if(session->progress.get_cancel())
 			break;
 	}
 
+	double total_time, render_time;
+	session->progress.get_time(total_time, render_time);
+	VLOG(1) << "Total render time: " << total_time;
+	VLOG(1) << "Render time (without synchronization): " << render_time;
+
 	/* clear callback */
-	session->write_render_tile_cb = NULL;
-	session->update_render_tile_cb = NULL;
+	session->write_render_tile_cb = function_null;
+	session->update_render_tile_cb = function_null;
 
 	/* free all memory used (host and device), so we wouldn't leave render
 	 * engine with extra memory allocated
@@ -500,18 +522,22 @@ void BlenderSession::render()
 	sync = NULL;
 }
 
-static void populate_bake_data(BakeData *data, BL::BakePixel pixel_array, const int num_pixels)
+static void populate_bake_data(BakeData *data, const int object_id, BL::BakePixel pixel_array, const int num_pixels)
 {
 	BL::BakePixel bp = pixel_array;
 
 	int i;
 	for(i=0; i < num_pixels; i++) {
-		data->set(i, bp.primitive_id(), bp.uv(), bp.du_dx(), bp.du_dy(), bp.dv_dx(), bp.dv_dy());
+		if(bp.object_id() == object_id) {
+			data->set(i, bp.primitive_id(), bp.uv(), bp.du_dx(), bp.du_dy(), bp.dv_dx(), bp.dv_dy());
+		} else {
+			data->set_null(i);
+		}
 		bp = bp.next();
 	}
 }
 
-void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::BakePixel pixel_array, const size_t num_pixels, const int depth, float result[])
+void BlenderSession::bake(BL::Object b_object, const string& pass_type, const int object_id, BL::BakePixel pixel_array, const size_t num_pixels, const int /*depth*/, float result[])
 {
 	ShaderEvalType shader_type = get_shader_type(pass_type);
 	size_t object_index = OBJECT_NONE;
@@ -543,7 +569,7 @@ void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::Bake
 
 	/* get buffer parameters */
 	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
-	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height);
+	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height);
 
 	scene->bake_manager->set_shader_limit((size_t)b_engine.tile_x(), (size_t)b_engine.tile_y());
 	scene->bake_manager->set_baking(true);
@@ -567,7 +593,7 @@ void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::Bake
 
 	BakeData *bake_data = scene->bake_manager->init(object, tri_offset, num_pixels);
 
-	populate_bake_data(bake_data, pixel_array, num_pixels);
+	populate_bake_data(bake_data, object_id, pixel_array, num_pixels);
 
 	/* set number of samples */
 	session->tile_manager.set_samples(session_params.samples);
@@ -601,7 +627,7 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult b_rr, BL::Re
 
 	vector<float> pixels(params.width*params.height*4);
 
-	if (!do_update_only) {
+	if(!do_update_only) {
 		/* copy each pass */
 		BL::RenderLayer::passes_iterator b_iter;
 
@@ -619,10 +645,12 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult b_rr, BL::Re
 			b_pass.rect(&pixels[0]);
 		}
 	}
-
-	/* copy combined pass */
-	if(buffers->get_pass_rect(PASS_COMBINED, exposure, rtile.sample, 4, &pixels[0]))
-		b_rlay.rect(&pixels[0]);
+	else {
+		/* copy combined pass */
+		BL::RenderPass b_combined_pass(b_rlay.passes.find_by_type(BL::RenderPass::type_COMBINED, b_rview_name.c_str()));
+		if(buffers->get_pass_rect(PASS_COMBINED, exposure, rtile.sample, 4, &pixels[0]))
+			b_combined_pass.rect(&pixels[0]);
+	}
 
 	/* tag result as updated */
 	b_engine.update_result(b_rr);
@@ -692,7 +720,7 @@ void BlenderSession::synchronize()
 
 	/* reset if needed */
 	if(scene->need_reset()) {
-		BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height);
+		BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height);
 		session->reset(buffer_params, session_params.samples);
 
 		/* reset time */
@@ -747,7 +775,7 @@ bool BlenderSession::draw(int w, int h)
 		/* reset if requested */
 		if(reset) {
 			SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
-			BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height);
+			BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height);
 			bool session_pause = BlenderSync::get_session_pause(b_scene, background);
 
 			if(session_pause == false) {
@@ -764,7 +792,7 @@ bool BlenderSession::draw(int w, int h)
 	update_status_progress();
 
 	/* draw */
-	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height);
+	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height);
 	DeviceDrawParams draw_params;
 
 	if(session->params.display_buffer_linear) {
@@ -841,6 +869,9 @@ void BlenderSession::update_status_progress()
 		scene += " | " + b_scene.name();
 		if(b_rlay_name != "")
 			scene += ", "  + b_rlay_name;
+
+		if(b_rview_name != "")
+			scene += ", " + b_rview_name;
 	}
 	else {
 		BLI_timestr(total_time, time_str, sizeof(time_str));
@@ -869,7 +900,7 @@ void BlenderSession::update_status_progress()
 		last_progress = progress;
 	}
 
-	if (session->progress.get_error()) {
+	if(session->progress.get_error()) {
 		string error = session->progress.get_error_message();
 		if(error != last_error) {
 			/* TODO(sergey): Currently C++ RNA API doesn't let us to
@@ -1000,18 +1031,19 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name, void *buil
 
 	unsigned char *image_pixels;
 	image_pixels = image_get_pixels_for_frame(b_image, frame);
+	size_t num_pixels = ((size_t)width) * height;
 
 	if(image_pixels) {
-		memcpy(pixels, image_pixels, width * height * channels * sizeof(unsigned char));
+		memcpy(pixels, image_pixels, num_pixels * channels * sizeof(unsigned char));
 		MEM_freeN(image_pixels);
 	}
 	else {
 		if(channels == 1) {
-			memset(pixels, 0, width * height * sizeof(unsigned char));
+			memset(pixels, 0, num_pixels * sizeof(unsigned char));
 		}
 		else {
 			unsigned char *cp = pixels;
-			for(int i = 0; i < width * height; i++, cp += channels) {
+			for(size_t i = 0; i < num_pixels; i++, cp += channels) {
 				cp[0] = 255;
 				cp[1] = 0;
 				cp[2] = 255;
@@ -1023,7 +1055,7 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name, void *buil
 
 	/* premultiply, byte images are always straight for blender */
 	unsigned char *cp = pixels;
-	for(int i = 0; i < width * height; i++, cp += channels) {
+	for(size_t i = 0; i < num_pixels; i++, cp += channels) {
 		cp[0] = (cp[0] * cp[3]) >> 8;
 		cp[1] = (cp[1] * cp[3]) >> 8;
 		cp[2] = (cp[2] * cp[3]) >> 8;
@@ -1052,18 +1084,19 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void
 
 		float *image_pixels;
 		image_pixels = image_get_float_pixels_for_frame(b_image, frame);
+		size_t num_pixels = ((size_t)width) * height;
 
 		if(image_pixels) {
-			memcpy(pixels, image_pixels, width * height * channels * sizeof(float));
+			memcpy(pixels, image_pixels, num_pixels * channels * sizeof(float));
 			MEM_freeN(image_pixels);
 		}
 		else {
 			if(channels == 1) {
-				memset(pixels, 0, width * height * sizeof(float));
+				memset(pixels, 0, num_pixels * sizeof(float));
 			}
 			else {
 				float *fp = pixels;
-				for(int i = 0; i < width * height; i++, fp += channels) {
+				for(int i = 0; i < num_pixels; i++, fp += channels) {
 					fp[0] = 1.0f;
 					fp[1] = 0.0f;
 					fp[2] = 1.0f;
@@ -1089,11 +1122,12 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void
 		int width = resolution.x * amplify;
 		int height = resolution.y * amplify;
 		int depth = resolution.z * amplify;
+		size_t num_pixels = ((size_t)width) * height * depth;
 
 		if(builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_DENSITY)) {
 			SmokeDomainSettings_density_grid_get_length(&b_domain.ptr, &length);
 
-			if(length == width*height*depth) {
+			if(length == num_pixels) {
 				SmokeDomainSettings_density_grid_get(&b_domain.ptr, pixels);
 				return true;
 			}
@@ -1103,7 +1137,7 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void
 			 * as 1500..3000 K with the first part faded to zero density */
 			SmokeDomainSettings_flame_grid_get_length(&b_domain.ptr, &length);
 
-			if(length == width*height*depth) {
+			if(length == num_pixels) {
 				SmokeDomainSettings_flame_grid_get(&b_domain.ptr, pixels);
 				return true;
 			}
@@ -1112,7 +1146,7 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void
 			/* the RGB is "premultiplied" by density for better interpolation results */
 			SmokeDomainSettings_color_grid_get_length(&b_domain.ptr, &length);
 
-			if(length == width*height*depth*4) {
+			if(length == num_pixels*4) {
 				SmokeDomainSettings_color_grid_get(&b_domain.ptr, pixels);
 				return true;
 			}
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index c8070286006..708776dc8ca 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -52,7 +52,7 @@ public:
 	/* offline render */
 	void render();
 
-	void bake(BL::Object b_object, const string& pass_type, BL::BakePixel pixel_array, const size_t num_pixels, const int depth, float pixels[]);
+	void bake(BL::Object b_object, const string& pass_type, const int object_id, BL::BakePixel pixel_array, const size_t num_pixels, const int depth, float pixels[]);
 
 	void write_render_result(BL::RenderResult b_rr, BL::RenderLayer b_rlay, RenderTile& rtile);
 	void write_render_tile(RenderTile& rtile);
@@ -90,6 +90,7 @@ public:
 	BL::SpaceView3D b_v3d;
 	BL::RegionView3D b_rv3d;
 	string b_rlay_name;
+	string b_rview_name;
 
 	string last_status;
 	string last_error;
diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp
index baf79a78987..2b0e8acae38 100644
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -106,7 +106,7 @@ static ShaderSocketType convert_socket_type(BL::NodeSocket b_socket)
 	}
 }
 
-static void set_default_value(ShaderInput *input, BL::Node b_node, BL::NodeSocket b_sock, BL::BlendData b_data, BL::ID b_id)
+static void set_default_value(ShaderInput *input, BL::NodeSocket b_sock, BL::BlendData b_data, BL::ID b_id)
 {
 	/* copy values for non linked inputs */
 	switch(input->type) {
@@ -179,53 +179,59 @@ static bool is_output_node(BL::Node b_node)
 		    || b_node.is_a(&RNA_ShaderNodeOutputLamp));
 }
 
-static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, ShaderGraph *graph, BL::ShaderNodeTree b_ntree, BL::ShaderNode b_node)
+static ShaderNode *add_node(Scene *scene,
+                            BL::RenderEngine b_engine,
+                            BL::BlendData b_data,
+                            BL::Scene b_scene,
+                            ShaderGraph *graph,
+                            BL::ShaderNodeTree b_ntree,
+                            BL::ShaderNode b_node)
 {
 	ShaderNode *node = NULL;
 
 	/* existing blender nodes */
-	if (b_node.is_a(&RNA_ShaderNodeRGBCurve)) {
+	if(b_node.is_a(&RNA_ShaderNodeRGBCurve)) {
 		BL::ShaderNodeRGBCurve b_curve_node(b_node);
 		RGBCurvesNode *curves = new RGBCurvesNode();
 		curvemapping_color_to_array(b_curve_node.mapping(), curves->curves, RAMP_TABLE_SIZE, true);
 		node = curves;
 	}
-	if (b_node.is_a(&RNA_ShaderNodeVectorCurve)) {
+	if(b_node.is_a(&RNA_ShaderNodeVectorCurve)) {
 		BL::ShaderNodeVectorCurve b_curve_node(b_node);
 		VectorCurvesNode *curves = new VectorCurvesNode();
 		curvemapping_color_to_array(b_curve_node.mapping(), curves->curves, RAMP_TABLE_SIZE, false);
 		node = curves;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeValToRGB)) {
+	else if(b_node.is_a(&RNA_ShaderNodeValToRGB)) {
 		RGBRampNode *ramp = new RGBRampNode();
 		BL::ShaderNodeValToRGB b_ramp_node(b_node);
 		colorramp_to_array(b_ramp_node.color_ramp(), ramp->ramp, RAMP_TABLE_SIZE);
 		ramp->interpolate = b_ramp_node.color_ramp().interpolation() != BL::ColorRamp::interpolation_CONSTANT;
 		node = ramp;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeRGB)) {
+	else if(b_node.is_a(&RNA_ShaderNodeRGB)) {
 		ColorNode *color = new ColorNode();
 		color->value = get_node_output_rgba(b_node, "Color");
 		node = color;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeValue)) {
+	else if(b_node.is_a(&RNA_ShaderNodeValue)) {
 		ValueNode *value = new ValueNode();
 		value->value = get_node_output_value(b_node, "Value");
 		node = value;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeCameraData)) {
+	else if(b_node.is_a(&RNA_ShaderNodeCameraData)) {
 		node = new CameraNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeInvert)) {
+	else if(b_node.is_a(&RNA_ShaderNodeInvert)) {
 		node = new InvertNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeGamma)) {
+	else if(b_node.is_a(&RNA_ShaderNodeGamma)) {
 		node = new GammaNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBrightContrast)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBrightContrast)) {
 		node = new BrightContrastNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeMixRGB)) {
+	else if(b_node.is_a(&RNA_ShaderNodeMixRGB)) {
 		BL::ShaderNodeMixRGB b_mix_node(b_node);
 		MixNode *mix = new MixNode();
 		mix->type = MixNode::type_enum[b_mix_node.blend_type()];
@@ -236,44 +242,44 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		mix->use_clamp = b_mix_node.use_clamp();
 		node = mix;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeSeparateRGB)) {
+	else if(b_node.is_a(&RNA_ShaderNodeSeparateRGB)) {
 		node = new SeparateRGBNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeCombineRGB)) {
+	else if(b_node.is_a(&RNA_ShaderNodeCombineRGB)) {
 		node = new CombineRGBNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeSeparateHSV)) {
+	else if(b_node.is_a(&RNA_ShaderNodeSeparateHSV)) {
 		node = new SeparateHSVNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeCombineHSV)) {
+	else if(b_node.is_a(&RNA_ShaderNodeCombineHSV)) {
 		node = new CombineHSVNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeSeparateXYZ)) {
+	else if(b_node.is_a(&RNA_ShaderNodeSeparateXYZ)) {
 		node = new SeparateXYZNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeCombineXYZ)) {
+	else if(b_node.is_a(&RNA_ShaderNodeCombineXYZ)) {
 		node = new CombineXYZNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeHueSaturation)) {
+	else if(b_node.is_a(&RNA_ShaderNodeHueSaturation)) {
 		node = new HSVNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeRGBToBW)) {
+	else if(b_node.is_a(&RNA_ShaderNodeRGBToBW)) {
 		node = new ConvertNode(SHADER_SOCKET_COLOR, SHADER_SOCKET_FLOAT);
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeMath)) {
+	else if(b_node.is_a(&RNA_ShaderNodeMath)) {
 		BL::ShaderNodeMath b_math_node(b_node);
 		MathNode *math = new MathNode();
 		math->type = MathNode::type_enum[b_math_node.operation()];
 		math->use_clamp = b_math_node.use_clamp();
 		node = math;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeVectorMath)) {
+	else if(b_node.is_a(&RNA_ShaderNodeVectorMath)) {
 		BL::ShaderNodeVectorMath b_vector_math_node(b_node);
 		VectorMathNode *vmath = new VectorMathNode();
 		vmath->type = VectorMathNode::type_enum[b_vector_math_node.operation()];
 		node = vmath;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeVectorTransform)) {
+	else if(b_node.is_a(&RNA_ShaderNodeVectorTransform)) {
 		BL::ShaderNodeVectorTransform b_vector_transform_node(b_node);
 		VectorTransformNode *vtransform = new VectorTransformNode();
 		vtransform->type = VectorTransformNode::type_enum[b_vector_transform_node.type()];
@@ -281,7 +287,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		vtransform->convert_to = VectorTransformNode::convert_space_enum[b_vector_transform_node.convert_to()];
 		node = vtransform;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeNormal)) {
+	else if(b_node.is_a(&RNA_ShaderNodeNormal)) {
 		BL::Node::outputs_iterator out_it;
 		b_node.outputs.begin(out_it);
 
@@ -289,7 +295,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		norm->direction = get_node_output_vector(b_node, "Normal");
 		node = norm;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeMapping)) {
+	else if(b_node.is_a(&RNA_ShaderNodeMapping)) {
 		BL::ShaderNodeMapping b_mapping_node(b_node);
 		MappingNode *mapping = new MappingNode();
 
@@ -297,31 +303,31 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 
 		node = mapping;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeFresnel)) {
+	else if(b_node.is_a(&RNA_ShaderNodeFresnel)) {
 		node = new FresnelNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeLayerWeight)) {
+	else if(b_node.is_a(&RNA_ShaderNodeLayerWeight)) {
 		node = new LayerWeightNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeAddShader)) {
+	else if(b_node.is_a(&RNA_ShaderNodeAddShader)) {
 		node = new AddClosureNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeMixShader)) {
+	else if(b_node.is_a(&RNA_ShaderNodeMixShader)) {
 		node = new MixClosureNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeAttribute)) {
+	else if(b_node.is_a(&RNA_ShaderNodeAttribute)) {
 		BL::ShaderNodeAttribute b_attr_node(b_node);
 		AttributeNode *attr = new AttributeNode();
 		attr->attribute = b_attr_node.attribute_name();
 		node = attr;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBackground)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBackground)) {
 		node = new BackgroundNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeHoldout)) {
+	else if(b_node.is_a(&RNA_ShaderNodeHoldout)) {
 		node = new HoldoutNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfAnisotropic)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfAnisotropic)) {
 		BL::ShaderNodeBsdfAnisotropic b_aniso_node(b_node);
 		AnisotropicBsdfNode *aniso = new AnisotropicBsdfNode();
 
@@ -340,10 +346,10 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 
 		node = aniso;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfDiffuse)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfDiffuse)) {
 		node = new DiffuseBsdfNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeSubsurfaceScattering)) {
+	else if(b_node.is_a(&RNA_ShaderNodeSubsurfaceScattering)) {
 		BL::ShaderNodeSubsurfaceScattering b_subsurface_node(b_node);
 
 		SubsurfaceScatteringNode *subsurface = new SubsurfaceScatteringNode();
@@ -359,7 +365,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 
 		node = subsurface;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfGlossy)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfGlossy)) {
 		BL::ShaderNodeBsdfGlossy b_glossy_node(b_node);
 		GlossyBsdfNode *glossy = new GlossyBsdfNode();
 		
@@ -379,7 +385,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		}
 		node = glossy;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfGlass)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfGlass)) {
 		BL::ShaderNodeBsdfGlass b_glass_node(b_node);
 		GlassBsdfNode *glass = new GlassBsdfNode();
 		switch(b_glass_node.distribution()) {
@@ -395,7 +401,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		}
 		node = glass;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfRefraction)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfRefraction)) {
 		BL::ShaderNodeBsdfRefraction b_refraction_node(b_node);
 		RefractionBsdfNode *refraction = new RefractionBsdfNode();
 		switch(b_refraction_node.distribution()) {
@@ -411,7 +417,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		}
 		node = refraction;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfToon)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfToon)) {
 		BL::ShaderNodeBsdfToon b_toon_node(b_node);
 		ToonBsdfNode *toon = new ToonBsdfNode();
 		switch(b_toon_node.component()) {
@@ -424,7 +430,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		}
 		node = toon;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfHair)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfHair)) {
 		BL::ShaderNodeBsdfHair b_hair_node(b_node);
 		HairBsdfNode *hair = new HairBsdfNode();
 		switch(b_hair_node.component()) {
@@ -437,64 +443,64 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		}
 		node = hair;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfTranslucent)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfTranslucent)) {
 		node = new TranslucentBsdfNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfTransparent)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfTransparent)) {
 		node = new TransparentBsdfNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBsdfVelvet)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBsdfVelvet)) {
 		node = new VelvetBsdfNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeEmission)) {
+	else if(b_node.is_a(&RNA_ShaderNodeEmission)) {
 		node = new EmissionNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeAmbientOcclusion)) {
+	else if(b_node.is_a(&RNA_ShaderNodeAmbientOcclusion)) {
 		node = new AmbientOcclusionNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeVolumeScatter)) {
+	else if(b_node.is_a(&RNA_ShaderNodeVolumeScatter)) {
 		node = new ScatterVolumeNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeVolumeAbsorption)) {
+	else if(b_node.is_a(&RNA_ShaderNodeVolumeAbsorption)) {
 		node = new AbsorptionVolumeNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeNewGeometry)) {
+	else if(b_node.is_a(&RNA_ShaderNodeNewGeometry)) {
 		node = new GeometryNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeWireframe)) {
+	else if(b_node.is_a(&RNA_ShaderNodeWireframe)) {
 		BL::ShaderNodeWireframe b_wireframe_node(b_node);
 		WireframeNode *wire = new WireframeNode();
 		wire->use_pixel_size = b_wireframe_node.use_pixel_size();
 		node = wire;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeWavelength)) {
+	else if(b_node.is_a(&RNA_ShaderNodeWavelength)) {
 		node = new WavelengthNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBlackbody)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBlackbody)) {
 		node = new BlackbodyNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeLightPath)) {
+	else if(b_node.is_a(&RNA_ShaderNodeLightPath)) {
 		node = new LightPathNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeLightFalloff)) {
+	else if(b_node.is_a(&RNA_ShaderNodeLightFalloff)) {
 		node = new LightFalloffNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeObjectInfo)) {
+	else if(b_node.is_a(&RNA_ShaderNodeObjectInfo)) {
 		node = new ObjectInfoNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeParticleInfo)) {
+	else if(b_node.is_a(&RNA_ShaderNodeParticleInfo)) {
 		node = new ParticleInfoNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeHairInfo)) {
+	else if(b_node.is_a(&RNA_ShaderNodeHairInfo)) {
 		node = new HairInfoNode();
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeBump)) {
+	else if(b_node.is_a(&RNA_ShaderNodeBump)) {
 		BL::ShaderNodeBump b_bump_node(b_node);
 		BumpNode *bump = new BumpNode();
 		bump->invert = b_bump_node.invert();
 		node = bump;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeScript)) {
+	else if(b_node.is_a(&RNA_ShaderNodeScript)) {
 #ifdef WITH_OSL
 		if(scene->shader_manager->use_osl()) {
 			/* create script node */
@@ -510,16 +516,16 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 			 * Socket names must be stored in the extra lists instead. */
 			BL::Node::inputs_iterator b_input;
 
-			for (b_script_node.inputs.begin(b_input); b_input != b_script_node.inputs.end(); ++b_input) {
+			for(b_script_node.inputs.begin(b_input); b_input != b_script_node.inputs.end(); ++b_input) {
 				script_node->input_names.push_back(ustring(b_input->name()));
 				ShaderInput *input = script_node->add_input(script_node->input_names.back().c_str(),
 				                                            convert_socket_type(*b_input));
-				set_default_value(input, b_node, *b_input, b_data, b_ntree);
+				set_default_value(input, *b_input, b_data, b_ntree);
 			}
 
 			BL::Node::outputs_iterator b_output;
 
-			for (b_script_node.outputs.begin(b_output); b_output != b_script_node.outputs.end(); ++b_output) {
+			for(b_script_node.outputs.begin(b_output); b_output != b_script_node.outputs.end(); ++b_output) {
 				script_node->output_names.push_back(ustring(b_output->name()));
 				script_node->add_output(script_node->output_names.back().c_str(),
 				                        convert_socket_type(*b_output));
@@ -543,9 +549,12 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 
 			node = script_node;
 		}
+#else
+		(void)b_data;
+		(void)b_ntree;
 #endif
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexImage)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexImage)) {
 		BL::ShaderNodeTexImage b_image_node(b_node);
 		BL::Image b_image(b_image_node.image());
 		ImageTextureNode *image = new ImageTextureNode();
@@ -555,7 +564,8 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 			 */
 			bool is_builtin = b_image.packed_file() ||
 			                  b_image.source() == BL::Image::source_GENERATED ||
-			                  b_image.source() == BL::Image::source_MOVIE;
+			                  b_image.source() == BL::Image::source_MOVIE ||
+			                  b_engine.is_preview();
 
 			if(is_builtin) {
 				/* for builtin images we're using image datablock name to find an image to
@@ -578,7 +588,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 			image->use_alpha = b_image.use_alpha();
 
 			/* TODO(sergey): Does not work properly when we change builtin type. */
-			if (b_image.is_updated()) {
+			if(b_image.is_updated()) {
 				scene->image_manager->tag_reload_image(image->filename,
 				                                       image->builtin_data,
 				                                       (InterpolationType)b_image_node.interpolation());
@@ -591,14 +601,15 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		get_tex_mapping(&image->tex_mapping, b_image_node.texture_mapping());
 		node = image;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexEnvironment)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexEnvironment)) {
 		BL::ShaderNodeTexEnvironment b_env_node(b_node);
 		BL::Image b_image(b_env_node.image());
 		EnvironmentTextureNode *env = new EnvironmentTextureNode();
 		if(b_image) {
 			bool is_builtin = b_image.packed_file() ||
 			                  b_image.source() == BL::Image::source_GENERATED ||
-			                  b_image.source() == BL::Image::source_MOVIE;
+			                  b_image.source() == BL::Image::source_MOVIE ||
+			                  b_engine.is_preview();
 
 			if(is_builtin) {
 				int scene_frame = b_scene.frame_current();
@@ -615,7 +626,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 			env->use_alpha = b_image.use_alpha();
 
 			/* TODO(sergey): Does not work properly when we change builtin type. */
-			if (b_image.is_updated()) {
+			if(b_image.is_updated()) {
 				scene->image_manager->tag_reload_image(env->filename,
 				                                       env->builtin_data,
 				                                       INTERPOLATION_LINEAR);
@@ -626,41 +637,41 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		get_tex_mapping(&env->tex_mapping, b_env_node.texture_mapping());
 		node = env;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexGradient)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexGradient)) {
 		BL::ShaderNodeTexGradient b_gradient_node(b_node);
 		GradientTextureNode *gradient = new GradientTextureNode();
 		gradient->type = GradientTextureNode::type_enum[(int)b_gradient_node.gradient_type()];
 		get_tex_mapping(&gradient->tex_mapping, b_gradient_node.texture_mapping());
 		node = gradient;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexVoronoi)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexVoronoi)) {
 		BL::ShaderNodeTexVoronoi b_voronoi_node(b_node);
 		VoronoiTextureNode *voronoi = new VoronoiTextureNode();
 		voronoi->coloring = VoronoiTextureNode::coloring_enum[(int)b_voronoi_node.coloring()];
 		get_tex_mapping(&voronoi->tex_mapping, b_voronoi_node.texture_mapping());
 		node = voronoi;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexMagic)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexMagic)) {
 		BL::ShaderNodeTexMagic b_magic_node(b_node);
 		MagicTextureNode *magic = new MagicTextureNode();
 		magic->depth = b_magic_node.turbulence_depth();
 		get_tex_mapping(&magic->tex_mapping, b_magic_node.texture_mapping());
 		node = magic;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexWave)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexWave)) {
 		BL::ShaderNodeTexWave b_wave_node(b_node);
 		WaveTextureNode *wave = new WaveTextureNode();
 		wave->type = WaveTextureNode::type_enum[(int)b_wave_node.wave_type()];
 		get_tex_mapping(&wave->tex_mapping, b_wave_node.texture_mapping());
 		node = wave;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexChecker)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexChecker)) {
 		BL::ShaderNodeTexChecker b_checker_node(b_node);
 		CheckerTextureNode *checker = new CheckerTextureNode();
 		get_tex_mapping(&checker->tex_mapping, b_checker_node.texture_mapping());
 		node = checker;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexBrick)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexBrick)) {
 		BL::ShaderNodeTexBrick b_brick_node(b_node);
 		BrickTextureNode *brick = new BrickTextureNode();
 		brick->offset = b_brick_node.offset();
@@ -670,20 +681,20 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		get_tex_mapping(&brick->tex_mapping, b_brick_node.texture_mapping());
 		node = brick;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexNoise)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexNoise)) {
 		BL::ShaderNodeTexNoise b_noise_node(b_node);
 		NoiseTextureNode *noise = new NoiseTextureNode();
 		get_tex_mapping(&noise->tex_mapping, b_noise_node.texture_mapping());
 		node = noise;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexMusgrave)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexMusgrave)) {
 		BL::ShaderNodeTexMusgrave b_musgrave_node(b_node);
 		MusgraveTextureNode *musgrave = new MusgraveTextureNode();
 		musgrave->type = MusgraveTextureNode::type_enum[(int)b_musgrave_node.musgrave_type()];
 		get_tex_mapping(&musgrave->tex_mapping, b_musgrave_node.texture_mapping());
 		node = musgrave;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexCoord)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexCoord)) {
 		BL::ShaderNodeTexCoord b_tex_coord_node(b_node);
 		TextureCoordinateNode *tex_coord = new TextureCoordinateNode();
 		tex_coord->from_dupli = b_tex_coord_node.from_dupli();
@@ -693,7 +704,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		}
 		node = tex_coord;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTexSky)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTexSky)) {
 		BL::ShaderNodeTexSky b_sky_node(b_node);
 		SkyTextureNode *sky = new SkyTextureNode();
 		sky->type = SkyTextureNode::type_enum[(int)b_sky_node.sky_type()];
@@ -703,14 +714,14 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		get_tex_mapping(&sky->tex_mapping, b_sky_node.texture_mapping());
 		node = sky;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeNormalMap)) {
+	else if(b_node.is_a(&RNA_ShaderNodeNormalMap)) {
 		BL::ShaderNodeNormalMap b_normal_map_node(b_node);
 		NormalMapNode *nmap = new NormalMapNode();
 		nmap->space = NormalMapNode::space_enum[(int)b_normal_map_node.space()];
 		nmap->attribute = b_normal_map_node.uv_map();
 		node = nmap;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeTangent)) {
+	else if(b_node.is_a(&RNA_ShaderNodeTangent)) {
 		BL::ShaderNodeTangent b_tangent_node(b_node);
 		TangentNode *tangent = new TangentNode();
 		tangent->direction_type = TangentNode::direction_type_enum[(int)b_tangent_node.direction_type()];
@@ -718,7 +729,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		tangent->attribute = b_tangent_node.uv_map();
 		node = tangent;
 	}
-	else if (b_node.is_a(&RNA_ShaderNodeUVMap)) {
+	else if(b_node.is_a(&RNA_ShaderNodeUVMap)) {
 		BL::ShaderNodeUVMap b_uvmap_node(b_node);
 		UVMapNode *uvm = new UVMapNode();
 		uvm->attribute = b_uvmap_node.uv_map();
@@ -734,7 +745,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 
 static bool node_use_modified_socket_name(ShaderNode *node)
 {
-	if (node->special_type == SHADER_SPECIAL_TYPE_SCRIPT)
+	if(node->special_type == SHADER_SPECIAL_TYPE_SCRIPT)
 		return false;
 
 	return true;
@@ -744,14 +755,14 @@ static ShaderInput *node_find_input_by_name(ShaderNode *node, BL::Node b_node, B
 {
 	string name = b_socket.name();
 	
-	if (node_use_modified_socket_name(node)) {
+	if(node_use_modified_socket_name(node)) {
 		BL::Node::inputs_iterator b_input;
 		bool found = false;
 		int counter = 0, total = 0;
 
-		for (b_node.inputs.begin(b_input); b_input != b_node.inputs.end(); ++b_input) {
-			if (b_input->name() == name) {
-				if (!found)
+		for(b_node.inputs.begin(b_input); b_input != b_node.inputs.end(); ++b_input) {
+			if(b_input->name() == name) {
+				if(!found)
 					counter++;
 				total++;
 			}
@@ -761,10 +772,10 @@ static ShaderInput *node_find_input_by_name(ShaderNode *node, BL::Node b_node, B
 		}
 
 		/* rename if needed */
-		if (name == "Shader")
+		if(name == "Shader")
 			name = "Closure";
 
-		if (total > 1)
+		if(total > 1)
 			name = string_printf("%s%d", name.c_str(), counter);
 	}
 
@@ -775,14 +786,14 @@ static ShaderOutput *node_find_output_by_name(ShaderNode *node, BL::Node b_node,
 {
 	string name = b_socket.name();
 
-	if (node_use_modified_socket_name(node)) {
+	if(node_use_modified_socket_name(node)) {
 		BL::Node::outputs_iterator b_output;
 		bool found = false;
 		int counter = 0, total = 0;
 
-		for (b_node.outputs.begin(b_output); b_output != b_node.outputs.end(); ++b_output) {
-			if (b_output->name() == name) {
-				if (!found)
+		for(b_node.outputs.begin(b_output); b_output != b_node.outputs.end(); ++b_output) {
+			if(b_output->name() == name) {
+				if(!found)
 					counter++;
 				total++;
 			}
@@ -792,18 +803,24 @@ static ShaderOutput *node_find_output_by_name(ShaderNode *node, BL::Node b_node,
 		}
 
 		/* rename if needed */
-		if (name == "Shader")
+		if(name == "Shader")
 			name = "Closure";
 
-		if (total > 1)
+		if(total > 1)
 			name = string_printf("%s%d", name.c_str(), counter);
 	}
 
 	return node->output(name.c_str());
 }
 
-static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, ShaderGraph *graph, BL::ShaderNodeTree b_ntree,
-                      const ProxyMap &proxy_input_map, const ProxyMap &proxy_output_map)
+static void add_nodes(Scene *scene,
+                      BL::RenderEngine b_engine,
+                      BL::BlendData b_data,
+                      BL::Scene b_scene,
+                      ShaderGraph *graph,
+                      BL::ShaderNodeTree b_ntree,
+                      const ProxyMap &proxy_input_map,
+                      const ProxyMap &proxy_output_map)
 {
 	/* add nodes */
 	BL::ShaderNodeTree::nodes_iterator b_node;
@@ -818,7 +835,7 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
 	BL::ShaderNode output_node(PointerRNA_NULL);
 
 	for(b_ntree.nodes.begin(b_node); b_node != b_ntree.nodes.end(); ++b_node) {
-		if (is_output_node(*b_node)) {
+		if(is_output_node(*b_node)) {
 			BL::ShaderNodeOutputMaterial b_output_node(*b_node);
 
 			if(b_output_node.is_active_output()) {
@@ -834,10 +851,10 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
 
 	/* add nodes */
 	for(b_ntree.nodes.begin(b_node); b_node != b_ntree.nodes.end(); ++b_node) {
-		if (b_node->mute() || b_node->is_a(&RNA_NodeReroute)) {
+		if(b_node->mute() || b_node->is_a(&RNA_NodeReroute)) {
 			/* replace muted node with internal links */
 			BL::Node::internal_links_iterator b_link;
-			for (b_node->internal_links.begin(b_link); b_link != b_node->internal_links.end(); ++b_link) {
+			for(b_node->internal_links.begin(b_link); b_link != b_node->internal_links.end(); ++b_link) {
 				ProxyNode *proxy = new ProxyNode(convert_socket_type(b_link->to_socket()));
 
 				input_map[b_link->from_socket().ptr.data] = proxy->inputs[0];
@@ -846,10 +863,10 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
 				graph->add(proxy);
 			}
 		}
-		else if (b_node->is_a(&RNA_ShaderNodeGroup) || b_node->is_a(&RNA_NodeCustomGroup)) {
+		else if(b_node->is_a(&RNA_ShaderNodeGroup) || b_node->is_a(&RNA_NodeCustomGroup)) {
 			
 			BL::ShaderNodeTree b_group_ntree(PointerRNA_NULL);
-			if (b_node->is_a(&RNA_ShaderNodeGroup))
+			if(b_node->is_a(&RNA_ShaderNodeGroup))
 				b_group_ntree = BL::ShaderNodeTree(((BL::NodeGroup)(*b_node)).node_tree());
 			else
 				b_group_ntree = BL::ShaderNodeTree(((BL::NodeCustomGroup)(*b_node)).node_tree());
@@ -868,7 +885,7 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
 
 				input_map[b_input->ptr.data] = proxy->inputs[0];
 
-				set_default_value(proxy->inputs[0], *b_node, *b_input, b_data, b_ntree);
+				set_default_value(proxy->inputs[0], *b_input, b_data, b_ntree);
 			}
 			for(b_node->outputs.begin(b_output); b_output != b_node->outputs.end(); ++b_output) {
 				ProxyNode *proxy = new ProxyNode(convert_socket_type(*b_output));
@@ -880,33 +897,41 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
 				output_map[b_output->ptr.data] = proxy->outputs[0];
 			}
 			
-			if (b_group_ntree)
-				add_nodes(scene, b_data, b_scene, graph, b_group_ntree, group_proxy_input_map, group_proxy_output_map);
+			if(b_group_ntree) {
+				add_nodes(scene,
+				          b_engine,
+				          b_data,
+				          b_scene,
+				          graph,
+				          b_group_ntree,
+				          group_proxy_input_map,
+				          group_proxy_output_map);
+			}
 		}
-		else if (b_node->is_a(&RNA_NodeGroupInput)) {
+		else if(b_node->is_a(&RNA_NodeGroupInput)) {
 			/* map each socket to a proxy node */
 			for(b_node->outputs.begin(b_output); b_output != b_node->outputs.end(); ++b_output) {
 				ProxyMap::const_iterator proxy_it = proxy_input_map.find(b_output->identifier());
-				if (proxy_it != proxy_input_map.end()) {
+				if(proxy_it != proxy_input_map.end()) {
 					ProxyNode *proxy = proxy_it->second;
 
 					output_map[b_output->ptr.data] = proxy->outputs[0];
 				}
 			}
 		}
-		else if (b_node->is_a(&RNA_NodeGroupOutput)) {
+		else if(b_node->is_a(&RNA_NodeGroupOutput)) {
 			BL::NodeGroupOutput b_output_node(*b_node);
 			/* only the active group output is used */
-			if (b_output_node.is_active_output()) {
+			if(b_output_node.is_active_output()) {
 				/* map each socket to a proxy node */
 				for(b_node->inputs.begin(b_input); b_input != b_node->inputs.end(); ++b_input) {
 					ProxyMap::const_iterator proxy_it = proxy_output_map.find(b_input->identifier());
-					if (proxy_it != proxy_output_map.end()) {
+					if(proxy_it != proxy_output_map.end()) {
 						ProxyNode *proxy = proxy_it->second;
 
 						input_map[b_input->ptr.data] = proxy->inputs[0];
 
-						set_default_value(proxy->inputs[0], *b_node, *b_input, b_data, b_ntree);
+						set_default_value(proxy->inputs[0], *b_input, b_data, b_ntree);
 					}
 				}
 			}
@@ -914,30 +939,36 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
 		else {
 			ShaderNode *node = NULL;
 
-			if (is_output_node(*b_node)) {
-				if (b_node->ptr.data == output_node.ptr.data) {
+			if(is_output_node(*b_node)) {
+				if(b_node->ptr.data == output_node.ptr.data) {
 					node = graph->output();
 				}
 			}
 			else {
-				node = add_node(scene, b_data, b_scene, graph, b_ntree, BL::ShaderNode(*b_node));
+				node = add_node(scene,
+				                b_engine,
+				                b_data,
+				                b_scene,
+				                graph,
+				                b_ntree,
+				                BL::ShaderNode(*b_node));
 			}
 
 			if(node) {
 				/* map node sockets for linking */
 				for(b_node->inputs.begin(b_input); b_input != b_node->inputs.end(); ++b_input) {
 					ShaderInput *input = node_find_input_by_name(node, *b_node, *b_input);
-					if (!input) {
+					if(!input) {
 						/* XXX should not happen, report error? */
 						continue;
 					}
 					input_map[b_input->ptr.data] = input;
 
-					set_default_value(input, *b_node, *b_input, b_data, b_ntree);
+					set_default_value(input, *b_input, b_data, b_ntree);
 				}
 				for(b_node->outputs.begin(b_output); b_output != b_node->outputs.end(); ++b_output) {
 					ShaderOutput *output = node_find_output_by_name(node, *b_node, *b_output);
-					if (!output) {
+					if(!output) {
 						/* XXX should not happen, report error? */
 						continue;
 					}
@@ -951,6 +982,10 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
 	BL::NodeTree::links_iterator b_link;
 
 	for(b_ntree.links.begin(b_link); b_link != b_ntree.links.end(); ++b_link) {
+		/* Ignore invalid links to avoid unwanted cycles created in graph. */
+		if(!b_link->is_valid()) {
+			continue;
+		}
 		/* get blender link data */
 		BL::NodeSocket b_from_sock = b_link->from_socket();
 		BL::NodeSocket b_to_sock = b_link->to_socket();
@@ -959,10 +994,10 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
 		ShaderInput *input = 0;
 
 		PtrOutputMap::iterator output_it = output_map.find(b_from_sock.ptr.data);
-		if (output_it != output_map.end())
+		if(output_it != output_map.end())
 			output = output_it->second;
 		PtrInputMap::iterator input_it = input_map.find(b_to_sock.ptr.data);
-		if (input_it != input_map.end())
+		if(input_it != input_map.end())
 			input = input_it->second;
 
 		/* either node may be NULL when the node was not exported, typically
@@ -972,10 +1007,22 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
 	}
 }
 
-static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, ShaderGraph *graph, BL::ShaderNodeTree b_ntree)
+static void add_nodes(Scene *scene,
+                      BL::RenderEngine b_engine,
+                      BL::BlendData b_data,
+                      BL::Scene b_scene,
+                      ShaderGraph *graph,
+                      BL::ShaderNodeTree b_ntree)
 {
 	static const ProxyMap empty_proxy_map;
-	add_nodes(scene, b_data, b_scene, graph, b_ntree, empty_proxy_map, empty_proxy_map);
+	add_nodes(scene,
+	          b_engine,
+	          b_data,
+	          b_scene,
+	          graph,
+	          b_ntree,
+	          empty_proxy_map,
+	          empty_proxy_map);
 }
 
 /* Sync Materials */
@@ -1001,7 +1048,7 @@ void BlenderSync::sync_materials(bool update_all)
 			if(b_mat->use_nodes() && b_mat->node_tree()) {
 				BL::ShaderNodeTree b_ntree(b_mat->node_tree());
 
-				add_nodes(scene, b_data, b_scene, graph, b_ntree);
+				add_nodes(scene, b_engine, b_data, b_scene, graph, b_ntree);
 			}
 			else {
 				ShaderNode *closure, *out;
@@ -1044,7 +1091,7 @@ void BlenderSync::sync_world(bool update_all)
 		if(b_world && b_world.use_nodes() && b_world.node_tree()) {
 			BL::ShaderNodeTree b_ntree(b_world.node_tree());
 
-			add_nodes(scene, b_data, b_scene, graph, b_ntree);
+			add_nodes(scene, b_engine, b_data, b_scene, graph, b_ntree);
 
 			/* volume */
 			PointerRNA cworld = RNA_pointer_get(&b_world.ptr, "cycles");
@@ -1130,7 +1177,7 @@ void BlenderSync::sync_lamps(bool update_all)
 
 				BL::ShaderNodeTree b_ntree(b_lamp->node_tree());
 
-				add_nodes(scene, b_data, b_scene, graph, b_ntree);
+				add_nodes(scene, b_engine, b_data, b_scene, graph, b_ntree);
 			}
 			else {
 				ShaderNode *closure, *out;
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 985a3cd3d5a..c5fb7925306 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -36,6 +36,7 @@
 #include "util_debug.h"
 #include "util_foreach.h"
 #include "util_opengl.h"
+#include "util_hash.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -111,7 +112,7 @@ bool BlenderSync::sync_recalc()
 		
 		if(b_ob->is_updated_data()) {
 			BL::Object::particle_systems_iterator b_psys;
-			for (b_ob->particle_systems.begin(b_psys); b_psys != b_ob->particle_systems.end(); ++b_psys)
+			for(b_ob->particle_systems.begin(b_psys); b_psys != b_ob->particle_systems.end(); ++b_psys)
 				particle_system_map.set_recalc(*b_ob);
 		}
 	}
@@ -150,6 +151,7 @@ void BlenderSync::sync_data(BL::SpaceView3D b_v3d, BL::Object b_override, void *
 	sync_integrator();
 	sync_film();
 	sync_shaders();
+	sync_images();
 	sync_curve_settings();
 
 	mesh_synced.clear(); /* use for objects and motion sync */
@@ -194,6 +196,9 @@ void BlenderSync::sync_integrator()
 	integrator->filter_glossy = get_float(cscene, "blur_glossy");
 
 	integrator->seed = get_int(cscene, "seed");
+	if(get_boolean(cscene, "use_animated_seed"))
+		integrator->seed = hash_int_2d(b_scene.frame_current(), get_int(cscene, "seed"));
+
 	integrator->sampling_pattern = (SamplingPattern)RNA_enum_get(&cscene, "sampling_pattern");
 
 	integrator->layer_flag = render_layer.layer;
@@ -360,6 +365,39 @@ void BlenderSync::sync_render_layers(BL::SpaceView3D b_v3d, const char *layer)
 	}
 }
 
+/* Images */
+void BlenderSync::sync_images()
+{
+	/* Sync is a convention for this API, but currently it frees unused buffers. */
+
+	const bool is_interface_locked = b_engine.render() &&
+	                                 b_engine.render().use_lock_interface();
+	if(is_interface_locked == false && BlenderSession::headless == false) {
+		/* If interface is not locked, it's possible image is needed for
+		 * the display.
+		 */
+		return;
+	}
+	/* Free buffers used by images which are not needed for render. */
+	BL::BlendData::images_iterator b_image;
+	for(b_data.images.begin(b_image);
+	    b_image != b_data.images.end();
+	    ++b_image)
+	{
+		/* TODO(sergey): Consider making it an utility function to check
+		 * whether image is considered builtin.
+		 */
+		const bool is_builtin = b_image->packed_file() ||
+		                        b_image->source() == BL::Image::source_GENERATED ||
+		                        b_image->source() == BL::Image::source_MOVIE ||
+		                        b_engine.is_preview();
+		if(is_builtin == false) {
+			b_image->buffers_free();
+		}
+		/* TODO(sergey): Free builtin images not used by any shader. */
+	}
+}
+
 /* Scene Parameters */
 
 SceneParams BlenderSync::get_scene_params(BL::Scene b_scene, bool background, bool is_cpu)
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index 6a320ac8085..89d93e19e9f 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -68,7 +68,7 @@ public:
 	                                        BL::Scene b_scene,
 	                                        bool background);
 	static bool get_session_pause(BL::Scene b_scene, bool background);
-	static BufferParams get_buffer_params(BL::RenderSettings b_render, BL::Scene b_scene, BL::SpaceView3D b_v3d, BL::RegionView3D b_rv3d, Camera *cam, int width, int height);
+	static BufferParams get_buffer_params(BL::RenderSettings b_render, BL::SpaceView3D b_v3d, BL::RegionView3D b_rv3d, Camera *cam, int width, int height);
 
 private:
 	/* sync */
@@ -86,15 +86,18 @@ private:
 	Mesh *sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tris);
 	void sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, bool motion, int time_index = 0);
 	Object *sync_object(BL::Object b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::DupliObject b_dupli_ob,
-	                                 Transform& tfm, uint layer_flag, float motion_time, bool hide_tris);
-	void sync_light(BL::Object b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::Object b_ob, Transform& tfm);
-	void sync_background_light();
+	                                 Transform& tfm, uint layer_flag, float motion_time, bool hide_tris, bool *use_portal);
+	void sync_light(BL::Object b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::Object b_ob, Transform& tfm, bool *use_portal);
+	void sync_background_light(bool use_portal);
 	void sync_mesh_motion(BL::Object b_ob, Object *object, float motion_time);
 	void sync_camera_motion(BL::Object b_ob, float motion_time);
 
 	/* particles */
 	bool sync_dupli_particle(BL::Object b_ob, BL::DupliObject b_dup, Object *object);
 
+	/* Images. */
+	void sync_images();
+
 	/* util */
 	void find_shader(BL::ID id, vector<uint>& used_shaders, int default_shader);
 	bool BKE_object_is_modified(BL::Object b_ob);
diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h
index 9f7181cc564..78a0adbef00 100644
--- a/intern/cycles/blender/blender_util.h
+++ b/intern/cycles/blender/blender_util.h
@@ -43,8 +43,8 @@ void python_thread_state_restore(void **python_thread_state);
 static inline BL::Mesh object_to_mesh(BL::BlendData data, BL::Object object, BL::Scene scene, bool apply_modifiers, bool render, bool calc_undeformed)
 {
 	BL::Mesh me = data.meshes.new_from_object(scene, object, apply_modifiers, (render)? 2: 1, false, calc_undeformed);
-	if ((bool)me) {
-		if (me.use_auto_smooth()) {
+	if((bool)me) {
+		if(me.use_auto_smooth()) {
 			me.calc_normals_split();
 		}
 		me.calc_tessface(true);
@@ -310,7 +310,7 @@ static inline string get_string(PointerRNA& ptr, const char *name)
 	char cstrbuf[1024];
 	char *cstr = RNA_string_get_alloc(&ptr, name, cstrbuf, sizeof(cstrbuf));
 	string str(cstr);
-	if (cstr != cstrbuf)
+	if(cstr != cstrbuf)
 		MEM_freeN(cstr);
 	
 	return str;
@@ -354,11 +354,20 @@ static inline void mesh_texture_space(BL::Mesh b_mesh, float3& loc, float3& size
 }
 
 /* object used for motion blur */
-static inline bool object_use_motion(BL::Object b_ob)
+static inline bool object_use_motion(BL::Object b_parent, BL::Object b_ob)
 {
 	PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles");
 	bool use_motion = get_boolean(cobject, "use_motion_blur");
-	
+	/* If motion blur is enabled for the object we also check
+	 * whether it's enabled for the parent object as well.
+	 *
+	 * This way we can control motion blur from the dupligroup
+	 * duplicator much easier.
+	 */
+	if(use_motion && b_parent.ptr.data != b_ob.ptr.data) {
+		PointerRNA parent_cobject = RNA_pointer_get(&b_parent.ptr, "cycles");
+		use_motion &= get_boolean(parent_cobject, "use_motion_blur");
+	}
 	return use_motion;
 }
 
@@ -375,11 +384,20 @@ static inline uint object_motion_steps(BL::Object b_ob)
 }
 
 /* object uses deformation motion blur */
-static inline bool object_use_deform_motion(BL::Object b_ob)
+static inline bool object_use_deform_motion(BL::Object b_parent, BL::Object b_ob)
 {
 	PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles");
 	bool use_deform_motion = get_boolean(cobject, "use_deform_motion");
-	
+	/* If motion blur is enabled for the object we also check
+	 * whether it's enabled for the parent object as well.
+	 *
+	 * This way we can control motion blur from the dupligroup
+	 * duplicator much easier.
+	 */
+	if(use_deform_motion && b_parent.ptr.data != b_ob.ptr.data) {
+		PointerRNA parent_cobject = RNA_pointer_get(&b_parent.ptr, "cycles");
+		use_deform_motion &= get_boolean(parent_cobject, "use_deform_motion");
+	}
 	return use_deform_motion;
 }
 
@@ -388,7 +406,7 @@ static inline BL::SmokeDomainSettings object_smoke_domain_find(BL::Object b_ob)
 	BL::Object::modifiers_iterator b_mod;
 
 	for(b_ob.modifiers.begin(b_mod); b_mod != b_ob.modifiers.end(); ++b_mod) {
-		if (b_mod->is_a(&RNA_SmokeModifier)) {
+		if(b_mod->is_a(&RNA_SmokeModifier)) {
 			BL::SmokeModifier b_smd(*b_mod);
 
 			if(b_smd.smoke_type() == BL::SmokeModifier::smoke_type_DOMAIN)
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 9fa602f0952..350ca16f6e2 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -28,6 +28,7 @@
 #include "util_cache.h"
 #include "util_debug.h"
 #include "util_foreach.h"
+#include "util_logging.h"
 #include "util_map.h"
 #include "util_progress.h"
 #include "util_system.h"
@@ -106,25 +107,25 @@ bool BVH::cache_read(CacheData& key)
 		if(!(value.read(pack.root_index) &&
 		     value.read(pack.SAH) &&
 		     value.read(pack.nodes) &&
+		     value.read(pack.leaf_nodes) &&
 		     value.read(pack.object_node) &&
 		     value.read(pack.tri_woop) &&
 		     value.read(pack.prim_type) &&
 		     value.read(pack.prim_visibility) &&
 		     value.read(pack.prim_index) &&
-		     value.read(pack.prim_object) &&
-		     value.read(pack.is_leaf)))
+		     value.read(pack.prim_object)))
 		{
 			/* Clear the pack if load failed. */
 			pack.root_index = 0;
 			pack.SAH = 0.0f;
 			pack.nodes.clear();
+			pack.leaf_nodes.clear();
 			pack.object_node.clear();
 			pack.tri_woop.clear();
 			pack.prim_type.clear();
 			pack.prim_visibility.clear();
 			pack.prim_index.clear();
 			pack.prim_object.clear();
-			pack.is_leaf.clear();
 			return false;
 		}
 		return true;
@@ -141,13 +142,13 @@ void BVH::cache_write(CacheData& key)
 	value.add(pack.SAH);
 
 	value.add(pack.nodes);
+	value.add(pack.leaf_nodes);
 	value.add(pack.object_node);
 	value.add(pack.tri_woop);
 	value.add(pack.prim_type);
 	value.add(pack.prim_visibility);
 	value.add(pack.prim_index);
 	value.add(pack.prim_object);
-	value.add(pack.is_leaf);
 
 	Cache::global.insert(key, value);
 
@@ -189,11 +190,12 @@ void BVH::build(Progress& progress)
 	}
 
 	/* build nodes */
-	vector<int> prim_type;
-	vector<int> prim_index;
-	vector<int> prim_object;
-
-	BVHBuild bvh_build(objects, prim_type, prim_index, prim_object, params, progress);
+	BVHBuild bvh_build(objects,
+	                   pack.prim_type,
+	                   pack.prim_index,
+	                   pack.prim_object,
+	                   params,
+	                   progress);
 	BVHNode *root = bvh_build.run();
 
 	if(progress.get_cancel()) {
@@ -201,14 +203,6 @@ void BVH::build(Progress& progress)
 		return;
 	}
 
-	/* todo: get rid of this copy */
-	pack.prim_type = prim_type;
-	pack.prim_index = prim_index;
-	pack.prim_object = prim_object;
-	prim_type.free_memory();
-	prim_index.free_memory();
-	prim_object.free_memory();
-
 	/* compute SAH */
 	if(!params.top_level)
 		pack.SAH = root->computeSubtreeSAHCost(params);
@@ -322,13 +316,14 @@ void BVH::pack_primitives()
 
 /* Pack Instances */
 
-void BVH::pack_instances(size_t nodes_size)
+void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 {
 	/* The BVH's for instances are built separately, but for traversal all
 	 * BVH's are stored in global arrays. This function merges them into the
 	 * top level BVH, adjusting indexes and offsets where appropriate. */
 	bool use_qbvh = params.use_qbvh;
 	size_t nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE;
+	size_t nsize_leaf = (use_qbvh)? BVH_QNODE_LEAF_SIZE: BVH_NODE_LEAF_SIZE;
 
 	/* adjust primitive index to point to the triangle in the global array, for
 	 * meshes with transform applied and already in the top level BVH */
@@ -343,6 +338,7 @@ void BVH::pack_instances(size_t nodes_size)
 	/* track offsets of instanced BVH data in global array */
 	size_t prim_offset = pack.prim_index.size();
 	size_t nodes_offset = nodes_size;
+	size_t nodes_leaf_offset = leaf_nodes_size;
 
 	/* clear array that gives the node indexes for instanced objects */
 	pack.object_node.clear();
@@ -354,6 +350,7 @@ void BVH::pack_instances(size_t nodes_size)
 	size_t pack_prim_index_offset = prim_index_size;
 	size_t pack_tri_woop_offset = tri_woop_size;
 	size_t pack_nodes_offset = nodes_size;
+	size_t pack_leaf_nodes_offset = leaf_nodes_size;
 	size_t object_offset = 0;
 
 	map<Mesh*, int> mesh_map;
@@ -367,6 +364,7 @@ void BVH::pack_instances(size_t nodes_size)
 				prim_index_size += bvh->pack.prim_index.size();
 				tri_woop_size += bvh->pack.tri_woop.size();
 				nodes_size += bvh->pack.nodes.size();
+				leaf_nodes_size += bvh->pack.leaf_nodes.size();
 
 				mesh_map[mesh] = 1;
 			}
@@ -381,6 +379,7 @@ void BVH::pack_instances(size_t nodes_size)
 	pack.prim_visibility.resize(prim_index_size);
 	pack.tri_woop.resize(tri_woop_size);
 	pack.nodes.resize(nodes_size);
+	pack.leaf_nodes.resize(leaf_nodes_size);
 	pack.object_node.resize(objects.size());
 
 	int *pack_prim_index = (pack.prim_index.size())? &pack.prim_index[0]: NULL;
@@ -389,6 +388,7 @@ void BVH::pack_instances(size_t nodes_size)
 	uint *pack_prim_visibility = (pack.prim_visibility.size())? &pack.prim_visibility[0]: NULL;
 	float4 *pack_tri_woop = (pack.tri_woop.size())? &pack.tri_woop[0]: NULL;
 	int4 *pack_nodes = (pack.nodes.size())? &pack.nodes[0]: NULL;
+	int4 *pack_leaf_nodes = (pack.leaf_nodes.size())? &pack.leaf_nodes[0]: NULL;
 
 	/* merge */
 	foreach(Object *ob, objects) {
@@ -414,12 +414,13 @@ void BVH::pack_instances(size_t nodes_size)
 		BVH *bvh = mesh->bvh;
 
 		int noffset = nodes_offset/nsize;
+		int noffset_leaf = nodes_leaf_offset/nsize_leaf;
 		int mesh_tri_offset = mesh->tri_offset;
 		int mesh_curve_offset = mesh->curve_offset;
 
 		/* fill in node indexes for instances */
-		if((bvh->pack.is_leaf.size() != 0) && bvh->pack.is_leaf[0])
-			pack.object_node[object_offset++] = -noffset-1;
+		if(bvh->pack.root_index == -1)
+			pack.object_node[object_offset++] = -noffset_leaf-1;
 		else
 			pack.object_node[object_offset++] = noffset;
 
@@ -453,6 +454,18 @@ void BVH::pack_instances(size_t nodes_size)
 		}
 
 		/* merge nodes */
+		if(bvh->pack.leaf_nodes.size()) {
+			int4 *leaf_nodes_offset = &bvh->pack.leaf_nodes[0];
+			size_t leaf_nodes_offset_size = bvh->pack.leaf_nodes.size();
+			for(size_t i = 0, j = 0; i < leaf_nodes_offset_size; i+=nsize_leaf, j++) {
+				int4 data = leaf_nodes_offset[i];
+				data.x += prim_offset;
+				data.y += prim_offset;
+				pack_leaf_nodes[pack_leaf_nodes_offset] = data;
+				pack_leaf_nodes_offset += nsize_leaf;
+			}
+		}
+
 		if(bvh->pack.nodes.size()) {
 			/* For QBVH we're packing a child bbox into 6 float4,
 			 * and for regular BVH they're packed into 3 float4.
@@ -460,7 +473,6 @@ void BVH::pack_instances(size_t nodes_size)
 			size_t nsize_bbox = (use_qbvh)? 6: 3;
 			int4 *bvh_nodes = &bvh->pack.nodes[0];
 			size_t bvh_nodes_size = bvh->pack.nodes.size(); 
-			bool *bvh_is_leaf = (bvh->pack.is_leaf.size() != 0) ? &bvh->pack.is_leaf[0] : NULL;
 
 			for(size_t i = 0, j = 0; i < bvh_nodes_size; i+=nsize, j++) {
 				memcpy(pack_nodes + pack_nodes_offset, bvh_nodes + i, nsize_bbox*sizeof(int4));
@@ -468,18 +480,12 @@ void BVH::pack_instances(size_t nodes_size)
 				/* modify offsets into arrays */
 				int4 data = bvh_nodes[i + nsize_bbox];
 
-				if(bvh_is_leaf && bvh_is_leaf[j]) {
-					data.x += prim_offset;
-					data.y += prim_offset;
-				}
-				else {
-					data.x += (data.x < 0)? -noffset: noffset;
-					data.y += (data.y < 0)? -noffset: noffset;
+				data.x += (data.x < 0)? -noffset_leaf: noffset;
+				data.y += (data.y < 0)? -noffset_leaf: noffset;
 
-					if(use_qbvh) {
-						data.z += (data.z < 0)? -noffset: noffset;
-						data.w += (data.w < 0)? -noffset: noffset;
-					}
+				if(use_qbvh) {
+					data.z += (data.z < 0)? -noffset_leaf: noffset;
+					data.w += (data.w < 0)? -noffset_leaf: noffset;
 				}
 
 				pack_nodes[pack_nodes_offset + nsize_bbox] = data;
@@ -496,6 +502,7 @@ void BVH::pack_instances(size_t nodes_size)
 		}
 
 		nodes_offset += bvh->pack.nodes.size();
+		nodes_leaf_offset += bvh->pack.leaf_nodes.size();
 		prim_offset += bvh->pack.prim_index.size();
 	}
 }
@@ -509,20 +516,24 @@ RegularBVH::RegularBVH(const BVHParams& params_, const vector<Object*>& objects_
 
 void RegularBVH::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf)
 {
+	float4 data[BVH_NODE_LEAF_SIZE];
+	memset(data, 0, sizeof(data));
 	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->m_lo] == -1) {
 		/* object */
-		pack_node(e.idx, leaf->m_bounds, leaf->m_bounds, ~(leaf->m_lo), 0,
-		          leaf->m_visibility, leaf->m_visibility);
+		data[0].x = __int_as_float(~(leaf->m_lo));
+		data[0].y = __int_as_float(0);
 	}
 	else {
-		int prim_type = leaf->num_triangles() ? pack.prim_type[leaf->m_lo] : 0;
-		/* Triangle/curve primitive leaf.  */
-		pack_node(e.idx, leaf->m_bounds, leaf->m_bounds,
-		          leaf->m_lo, leaf->m_hi,
-		          leaf->m_visibility,
-		          prim_type);
+		/* triangle */
+		data[0].x = __int_as_float(leaf->m_lo);
+		data[0].y = __int_as_float(leaf->m_hi);
+	}
+	data[0].z = __uint_as_float(leaf->m_visibility);
+	if(leaf->num_triangles() != 0) {
+		data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]);
 	}
 
+	memcpy(&pack.leaf_nodes[e.idx * BVH_NODE_LEAF_SIZE], data, sizeof(float4)*BVH_NODE_LEAF_SIZE);
 }
 
 void RegularBVH::pack_inner(const BVHStackEntry& e, const BVHStackEntry& e0, const BVHStackEntry& e1)
@@ -545,31 +556,36 @@ void RegularBVH::pack_node(int idx, const BoundBox& b0, const BoundBox& b1, int
 
 void RegularBVH::pack_nodes(const BVHNode *root)
 {
-	size_t node_size = root->getSubtreeSize(BVH_STAT_NODE_COUNT);
+	size_t tot_node_size = root->getSubtreeSize(BVH_STAT_NODE_COUNT);
+	size_t leaf_node_size = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
+	size_t node_size = tot_node_size - leaf_node_size;
 
 	/* resize arrays */
 	pack.nodes.clear();
-	pack.is_leaf.clear();
-	pack.is_leaf.resize(node_size);
 
 	/* for top level BVH, first merge existing BVH's so we know the offsets */
-	if(params.top_level)
-		pack_instances(node_size*BVH_NODE_SIZE);
-	else
+	if(params.top_level) {
+		pack_instances(node_size*BVH_NODE_SIZE,
+		               leaf_node_size*BVH_NODE_LEAF_SIZE);
+	}
+	else {
 		pack.nodes.resize(node_size*BVH_NODE_SIZE);
+		pack.leaf_nodes.resize(leaf_node_size*BVH_NODE_LEAF_SIZE);
+	}
 
-	int nextNodeIdx = 0;
+	int nextNodeIdx = 0, nextLeafNodeIdx = 0;
 
 	vector<BVHStackEntry> stack;
 	stack.reserve(BVHParams::MAX_DEPTH*2);
-	stack.push_back(BVHStackEntry(root, nextNodeIdx++));
+	if(root->is_leaf())
+		stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
+	else
+		stack.push_back(BVHStackEntry(root, nextNodeIdx++));
 
 	while(stack.size()) {
 		BVHStackEntry e = stack.back();
 		stack.pop_back();
 
-		pack.is_leaf[e.idx] = e.node->is_leaf();
-
 		if(e.node->is_leaf()) {
 			/* leaf node */
 			const LeafNode* leaf = reinterpret_cast<const LeafNode*>(e.node);
@@ -577,15 +593,17 @@ void RegularBVH::pack_nodes(const BVHNode *root)
 		}
 		else {
 			/* innner node */
-			stack.push_back(BVHStackEntry(e.node->get_child(0), nextNodeIdx++));
-			stack.push_back(BVHStackEntry(e.node->get_child(1), nextNodeIdx++));
+			int idx0 = (e.node->get_child(0)->is_leaf())? (nextLeafNodeIdx++) : (nextNodeIdx++);
+			int idx1 = (e.node->get_child(1)->is_leaf())? (nextLeafNodeIdx++) : (nextNodeIdx++);
+			stack.push_back(BVHStackEntry(e.node->get_child(0), idx0));
+			stack.push_back(BVHStackEntry(e.node->get_child(1), idx1));
 
 			pack_inner(e, stack[stack.size()-2], stack[stack.size()-1]);
 		}
 	}
 
 	/* root index to start traversal at, to handle case of single leaf node */
-	pack.root_index = (pack.is_leaf[0])? -1: 0;
+	pack.root_index = (root->is_leaf())? -1: 0;
 }
 
 void RegularBVH::refit_nodes()
@@ -594,17 +612,15 @@ void RegularBVH::refit_nodes()
 
 	BoundBox bbox = BoundBox::empty;
 	uint visibility = 0;
-	refit_node(0, (pack.is_leaf[0])? true: false, bbox, visibility);
+	refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
 }
 
 void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
 {
-	int4 *data = &pack.nodes[idx*BVH_NODE_SIZE];
-
-	int c0 = data[3].x;
-	int c1 = data[3].y;
-
 	if(leaf) {
+		int4 *data = &pack.leaf_nodes[idx*BVH_NODE_LEAF_SIZE];
+		int c0 = data[0].x;
+		int c1 = data[0].y;
 		/* refit leaf node */
 		for(int prim = c0; prim < c1; prim++) {
 			int pidx = pack.prim_index[prim];
@@ -638,7 +654,7 @@ void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility
 							size_t steps = mesh->motion_steps - 1;
 							float4 *key_steps = attr->data_float4();
 
-							for (size_t i = 0; i < steps; i++)
+							for(size_t i = 0; i < steps; i++)
 								curve.bounds_grow(k, key_steps + i*mesh_size, bbox);
 						}
 					}
@@ -660,7 +676,7 @@ void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility
 							size_t steps = mesh->motion_steps - 1;
 							float3 *vert_steps = attr->data_float3();
 
-							for (size_t i = 0; i < steps; i++)
+							for(size_t i = 0; i < steps; i++)
 								triangle.bounds_grow(vert_steps + i*mesh_size, bbox);
 						}
 					}
@@ -670,9 +686,20 @@ void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility
 			visibility |= ob->visibility;
 		}
 
-		pack_node(idx, bbox, bbox, c0, c1, visibility, data[3].w);
+		/* TODO(sergey): De-duplicate with pack_leaf(). */
+		float4 leaf_data[BVH_NODE_LEAF_SIZE];
+		leaf_data[0].x = __int_as_float(c0);
+		leaf_data[0].y = __int_as_float(c1);
+		leaf_data[0].z = __uint_as_float(visibility);
+		leaf_data[0].w = __uint_as_float(data[0].w);
+		memcpy(&pack.leaf_nodes[idx * BVH_NODE_LEAF_SIZE],
+		       leaf_data,
+		       sizeof(float4)*BVH_NODE_LEAF_SIZE);
 	}
 	else {
+		int4 *data = &pack.nodes[idx*BVH_NODE_SIZE];
+		int c0 = data[3].x;
+		int c1 = data[3].y;
 		/* refit inner node, set bbox from children */
 		BoundBox bbox0 = BoundBox::empty, bbox1 = BoundBox::empty;
 		uint visibility0 = 0, visibility1 = 0;
@@ -698,26 +725,24 @@ QBVH::QBVH(const BVHParams& params_, const vector<Object*>& objects_)
 
 void QBVH::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf)
 {
-	float4 data[BVH_QNODE_SIZE];
-
+	float4 data[BVH_QNODE_LEAF_SIZE];
 	memset(data, 0, sizeof(data));
-
 	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->m_lo] == -1) {
 		/* object */
-		data[6].x = __int_as_float(~(leaf->m_lo));
-		data[6].y = __int_as_float(0);
+		data[0].x = __int_as_float(~(leaf->m_lo));
+		data[0].y = __int_as_float(0);
 	}
 	else {
 		/* triangle */
-		data[6].x = __int_as_float(leaf->m_lo);
-		data[6].y = __int_as_float(leaf->m_hi);
+		data[0].x = __int_as_float(leaf->m_lo);
+		data[0].y = __int_as_float(leaf->m_hi);
 	}
-	data[6].z = __uint_as_float(leaf->m_visibility);
+	data[0].z = __uint_as_float(leaf->m_visibility);
 	if(leaf->num_triangles() != 0) {
-		data[6].w = __uint_as_float(pack.prim_type[leaf->m_lo]);
+		data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]);
 	}
 
-	memcpy(&pack.nodes[e.idx * BVH_QNODE_SIZE], data, sizeof(float4)*BVH_QNODE_SIZE);
+	memcpy(&pack.leaf_nodes[e.idx * BVH_QNODE_LEAF_SIZE], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE);
 }
 
 void QBVH::pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num)
@@ -761,31 +786,39 @@ void QBVH::pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num)
 
 void QBVH::pack_nodes(const BVHNode *root)
 {
-	size_t node_size = root->getSubtreeSize(BVH_STAT_QNODE_COUNT);
+	size_t tot_node_size = root->getSubtreeSize(BVH_STAT_QNODE_COUNT);
+	size_t leaf_node_size = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
+	size_t node_size = tot_node_size - leaf_node_size;
 
 	/* resize arrays */
 	pack.nodes.clear();
-	pack.is_leaf.clear();
-	pack.is_leaf.resize(node_size);
+	pack.leaf_nodes.clear();
 
 	/* for top level BVH, first merge existing BVH's so we know the offsets */
-	if(params.top_level)
-		pack_instances(node_size*BVH_QNODE_SIZE);
-	else
+	if(params.top_level) {
+		pack_instances(node_size*BVH_QNODE_SIZE,
+		               leaf_node_size*BVH_QNODE_LEAF_SIZE);
+	}
+	else {
 		pack.nodes.resize(node_size*BVH_QNODE_SIZE);
+		pack.leaf_nodes.resize(leaf_node_size*BVH_QNODE_LEAF_SIZE);
+	}
 
-	int nextNodeIdx = 0;
+	int nextNodeIdx = 0, nextLeafNodeIdx = 0;
 
 	vector<BVHStackEntry> stack;
 	stack.reserve(BVHParams::MAX_DEPTH*2);
-	stack.push_back(BVHStackEntry(root, nextNodeIdx++));
+	if(root->is_leaf()) {
+		stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
+	}
+	else {
+		stack.push_back(BVHStackEntry(root, nextNodeIdx++));
+	}
 
 	while(stack.size()) {
 		BVHStackEntry e = stack.back();
 		stack.pop_back();
 
-		pack.is_leaf[e.idx] = e.node->is_leaf();
-
 		if(e.node->is_leaf()) {
 			/* leaf node */
 			const LeafNode* leaf = reinterpret_cast<const LeafNode*>(e.node);
@@ -818,8 +851,16 @@ void QBVH::pack_nodes(const BVHNode *root)
 			}
 
 			/* push entries on the stack */
-			for(int i = 0; i < numnodes; i++)
-				stack.push_back(BVHStackEntry(nodes[i], nextNodeIdx++));
+			for(int i = 0; i < numnodes; i++) {
+				int idx;
+				if(nodes[i]->is_leaf()) {
+					idx = nextLeafNodeIdx++;
+				}
+				else {
+					idx = nextNodeIdx++;
+				}
+				stack.push_back(BVHStackEntry(nodes[i], idx));
+			}
 
 			/* set node */
 			pack_inner(e, &stack[stack.size()-numnodes], numnodes);
@@ -827,7 +868,7 @@ void QBVH::pack_nodes(const BVHNode *root)
 	}
 
 	/* root index to start traversal at, to handle case of single leaf node */
-	pack.root_index = (pack.is_leaf[0])? -1: 0;
+	pack.root_index = (root->is_leaf())? -1: 0;
 }
 
 void QBVH::refit_nodes()
@@ -836,14 +877,14 @@ void QBVH::refit_nodes()
 
 	BoundBox bbox = BoundBox::empty;
 	uint visibility = 0;
-	refit_node(0, (pack.is_leaf[0])? true: false, bbox, visibility);
+	refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
 }
 
 void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
 {
-	int4 *data = &pack.nodes[idx*BVH_QNODE_SIZE];
-	int4 c = data[6];
 	if(leaf) {
+		int4 *data = &pack.leaf_nodes[idx*BVH_QNODE_LEAF_SIZE];
+		int4 c = data[0];
 		/* Refit leaf node. */
 		for(int prim = c.x; prim < c.y; prim++) {
 			int pidx = pack.prim_index[prim];
@@ -877,7 +918,7 @@ void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
 							size_t steps = mesh->motion_steps - 1;
 							float4 *key_steps = attr->data_float4();
 
-							for (size_t i = 0; i < steps; i++)
+							for(size_t i = 0; i < steps; i++)
 								curve.bounds_grow(k, key_steps + i*mesh_size, bbox);
 						}
 					}
@@ -899,7 +940,7 @@ void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
 							size_t steps = mesh->motion_steps - 1;
 							float3 *vert_steps = attr->data_float3();
 
-							for (size_t i = 0; i < steps; i++)
+							for(size_t i = 0; i < steps; i++)
 								triangle.bounds_grow(vert_steps + i*mesh_size, bbox);
 						}
 					}
@@ -919,17 +960,18 @@ void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
 		 *
 		 * Same applies to the inner nodes case below.
 		 */
-		float4 leaf_data[BVH_QNODE_SIZE];
-		memset(leaf_data, 0, sizeof(leaf_data));
-		leaf_data[6].x = __int_as_float(c.x);
-		leaf_data[6].y = __int_as_float(c.y);
-		leaf_data[6].z = __uint_as_float(visibility);
-		leaf_data[6].w = __uint_as_float(c.w);
-		memcpy(&pack.nodes[idx * BVH_QNODE_SIZE],
+		float4 leaf_data[BVH_QNODE_LEAF_SIZE];
+		leaf_data[0].x = __int_as_float(c.x);
+		leaf_data[0].y = __int_as_float(c.y);
+		leaf_data[0].z = __uint_as_float(visibility);
+		leaf_data[0].w = __uint_as_float(c.w);
+		memcpy(&pack.leaf_nodes[idx * BVH_QNODE_LEAF_SIZE],
 		       leaf_data,
-		       sizeof(float4)*BVH_QNODE_SIZE);
+		       sizeof(float4)*BVH_QNODE_LEAF_SIZE);
 	}
 	else {
+		int4 *data = &pack.nodes[idx*BVH_QNODE_SIZE];
+		int4 c = data[6];
 		/* Refit inner node, set bbox from children. */
 		BoundBox child_bbox[4] = {BoundBox::empty,
 		                          BoundBox::empty,
diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h
index 40f039541eb..669d2ccdcd5 100644
--- a/intern/cycles/bvh/bvh.h
+++ b/intern/cycles/bvh/bvh.h
@@ -36,7 +36,9 @@ class Object;
 class Progress;
 
 #define BVH_NODE_SIZE	4
+#define BVH_NODE_LEAF_SIZE	1
 #define BVH_QNODE_SIZE	7
+#define BVH_QNODE_LEAF_SIZE	1
 #define BVH_ALIGN		4096
 #define TRI_NODE_SIZE	3
 
@@ -47,7 +49,9 @@ class Progress;
 struct PackedBVH {
 	/* BVH nodes storage, one node is 4x int4, and contains two bounding boxes,
 	 * and child, triangle or object indexes depending on the node type */
-	array<int4> nodes; 
+	array<int4> nodes;
+	/* BVH leaf nodes storage. */
+	array<int4> leaf_nodes;
 	/* object index to BVH node index mapping for instances */
 	array<int> object_node; 
 	/* precomputed triangle intersection data, one triangle is 4x float4 */
@@ -61,9 +65,6 @@ struct PackedBVH {
 	array<int> prim_index;
 	/* mapping from BVH primitive index, to the object id of that primitive. */
 	array<int> prim_object;
-	/* quick array to lookup if a node is a leaf, not used for traversal, only
-	 * for instance BVH merging  */
-	array<bool> is_leaf;
 
 	/* index of the root node. */
 	int root_index;
@@ -108,7 +109,7 @@ protected:
 	void pack_triangle(int idx, float4 woop[3]);
 
 	/* merge instance BVH's */
-	void pack_instances(size_t nodes_size);
+	void pack_instances(size_t nodes_size, size_t leaf_nodes_size);
 
 	/* for subclasses to implement */
 	virtual void pack_nodes(const BVHNode *root) = 0;
diff --git a/intern/cycles/bvh/bvh_binning.cpp b/intern/cycles/bvh/bvh_binning.cpp
index bd37ffbcf38..db96490a36f 100644
--- a/intern/cycles/bvh/bvh_binning.cpp
+++ b/intern/cycles/bvh/bvh_binning.cpp
@@ -29,10 +29,10 @@ CCL_NAMESPACE_BEGIN
 
 /* SSE replacements */
 
-__forceinline void prefetch_L1 (const void* ptr) { }
-__forceinline void prefetch_L2 (const void* ptr) { }
-__forceinline void prefetch_L3 (const void* ptr) { }
-__forceinline void prefetch_NTA(const void* ptr) { }
+__forceinline void prefetch_L1 (const void* /*ptr*/) { }
+__forceinline void prefetch_L2 (const void* /*ptr*/) { }
+__forceinline void prefetch_L3 (const void* /*ptr*/) { }
+__forceinline void prefetch_NTA(const void* /*ptr*/) { }
 
 template<size_t src> __forceinline float extract(const int4& b)
 { return b[src]; }
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index 4ce8f787169..a44ad656316 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -42,7 +42,7 @@ ccl_device_inline int bitscan(int value)
 {
 	assert(value != 0);
 	int bit = 0;
-	while (value >>= 1) {
+	while(value >>= 1) {
 		++bit;
 	}
 	return bit;
@@ -65,15 +65,18 @@ public:
 /* Constructor / Destructor */
 
 BVHBuild::BVHBuild(const vector<Object*>& objects_,
-	vector<int>& prim_type_, vector<int>& prim_index_, vector<int>& prim_object_,
-	const BVHParams& params_, Progress& progress_)
-: objects(objects_),
-  prim_type(prim_type_),
-  prim_index(prim_index_),
-  prim_object(prim_object_),
-  params(params_),
-  progress(progress_),
-  progress_start_time(0.0)
+                   array<int>& prim_type_,
+                   array<int>& prim_index_,
+                   array<int>& prim_object_,
+                   const BVHParams& params_,
+                   Progress& progress_)
+ : objects(objects_),
+   prim_type(prim_type_),
+   prim_index(prim_index_),
+   prim_object(prim_object_),
+   params(params_),
+   progress(progress_),
+   progress_start_time(0.0)
 {
 	spatial_min_overlap = 0.0f;
 }
@@ -136,7 +139,7 @@ void BVHBuild::add_reference_mesh(BoundBox& root, BoundBox& center, Mesh *mesh,
 				size_t steps = mesh->motion_steps - 1;
 				float4 *key_steps = curve_attr_mP->data_float4();
 
-				for (size_t i = 0; i < steps; i++)
+				for(size_t i = 0; i < steps; i++)
 					curve.bounds_grow(k, key_steps + i*mesh_size, bounds);
 
 				type = PRIMITIVE_MOTION_CURVE;
@@ -446,18 +449,10 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start,
 		return new LeafNode(bounds, 0, 0, 0);
 	}
 	else if(num == 1) {
-		if(start == prim_index.size()) {
-			assert(params.use_spatial_split);
-
-			prim_type.push_back(ref->prim_type());
-			prim_index.push_back(ref->prim_index());
-			prim_object.push_back(ref->prim_object());
-		}
-		else {
-			prim_type[start] = ref->prim_type();
-			prim_index[start] = ref->prim_index();
-			prim_object[start] = ref->prim_object();
-		}
+		assert(start < prim_type.size());
+		prim_type[start] = ref->prim_type();
+		prim_index[start] = ref->prim_index();
+		prim_object[start] = ref->prim_object();
 
 		uint visibility = objects[ref->prim_object()]->visibility;
 		return new LeafNode(ref->bounds(), visibility, start, start+1);
@@ -484,17 +479,9 @@ BVHNode *BVHBuild::create_primitive_leaf_node(const int *p_type,
                                               int num)
 {
 	for(int i = 0; i < num; ++i) {
-		if(start + i == prim_index.size()) {
-			assert(params.use_spatial_split);
-			prim_type.push_back(p_type[i]);
-			prim_index.push_back(p_index[i]);
-			prim_object.push_back(p_object[i]);
-		}
-		else {
-			prim_type[start + i] = p_type[i];
-			prim_index[start + i] = p_index[i];
-			prim_object[start + i] = p_object[i];
-		}
+		prim_type[start + i] = p_type[i];
+		prim_index[start + i] = p_index[i];
+		prim_object[start + i] = p_object[i];
 	}
 	return new LeafNode(bounds, visibility, start, start + num);
 }
@@ -535,6 +522,19 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range)
 		}
 	}
 
+	/* Extend an array when needed. */
+	if(prim_type.size() < range.end()) {
+		assert(params.use_spatial_split);
+		/* TODO(sergey): We might want to look into different policies of
+		 * re-allocation here, so on the one hand we would not do as much
+		 * re-allocations and on the other hand will have small memory
+		 * overhead.
+		 */
+		prim_type.resize(range.end());
+		prim_index.resize(range.end());
+		prim_object.resize(range.end());
+	}
+
 	/* Create leaf nodes for every existing primitive. */
 	BVHNode *leaves[PRIMITIVE_NUM_TOTAL + 1] = {NULL};
 	int num_leaves = 0;
@@ -577,17 +577,22 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range)
 		return new InnerNode(range.bounds(), leaves[0], leaves[1]);
 	}
 	else if(num_leaves == 3) {
-		BoundBox inner_bounds = merge(bounds[1], bounds[2]);
+		BoundBox inner_bounds = merge(leaves[1]->m_bounds, leaves[2]->m_bounds);
 		BVHNode *inner = new InnerNode(inner_bounds, leaves[1], leaves[2]);
 		return new InnerNode(range.bounds(), leaves[0], inner);
-	} else /*if(num_leaves == 4)*/ {
+	} else {
 		/* Shpuld be doing more branches if more primitive types added. */
-		assert(num_leaves == 4);
-		BoundBox inner_bounds_a = merge(bounds[0], bounds[1]);
-		BoundBox inner_bounds_b = merge(bounds[2], bounds[3]);
+		assert(num_leaves <= 5);
+		BoundBox inner_bounds_a = merge(leaves[0]->m_bounds, leaves[1]->m_bounds);
+		BoundBox inner_bounds_b = merge(leaves[2]->m_bounds, leaves[3]->m_bounds);
 		BVHNode *inner_a = new InnerNode(inner_bounds_a, leaves[0], leaves[1]);
 		BVHNode *inner_b = new InnerNode(inner_bounds_b, leaves[2], leaves[3]);
-		return new InnerNode(range.bounds(), inner_a, inner_b);
+		BoundBox inner_bounds_c = merge(inner_a->m_bounds, inner_b->m_bounds);
+		BVHNode *inner_c = new InnerNode(inner_bounds_c, inner_a, inner_b);
+		if(num_leaves == 5) {
+			return new InnerNode(range.bounds(), inner_c, leaves[4]);
+		}
+		return inner_c;
 	}
 }
 
diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h
index 294c5a1758d..eefb7b60f7c 100644
--- a/intern/cycles/bvh/bvh_build.h
+++ b/intern/cycles/bvh/bvh_build.h
@@ -42,13 +42,12 @@ class BVHBuild
 {
 public:
 	/* Constructor/Destructor */
-	BVHBuild(
-		const vector<Object*>& objects,
-		vector<int>& prim_type,
-		vector<int>& prim_index,
-		vector<int>& prim_object,
-		const BVHParams& params,
-		Progress& progress);
+	BVHBuild(const vector<Object*>& objects,
+	         array<int>& prim_type,
+	         array<int>& prim_index,
+	         array<int>& prim_object,
+	         const BVHParams& params,
+	         Progress& progress);
 	~BVHBuild();
 
 	BVHNode *run();
@@ -99,9 +98,9 @@ protected:
 	int num_original_references;
 
 	/* output primitive indexes and objects */
-	vector<int>& prim_type;
-	vector<int>& prim_index;
-	vector<int>& prim_object;
+	array<int>& prim_type;
+	array<int>& prim_index;
+	array<int>& prim_object;
 
 	/* build parameters */
 	BVHParams params;
diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h
index 1656bb367a4..44f5518229b 100644
--- a/intern/cycles/bvh/bvh_node.h
+++ b/intern/cycles/bvh/bvh_node.h
@@ -24,8 +24,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-enum BVH_STAT
-{
+enum BVH_STAT {
 	BVH_STAT_NODE_COUNT,
 	BVH_STAT_INNER_COUNT,
 	BVH_STAT_LEAF_COUNT,
diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h
index 99bfd9449da..af8d8eeb3ee 100644
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@@ -113,7 +113,9 @@ public:
 	__forceinline int prim_type() const { return type; }
 
 	BVHReference& operator=(const BVHReference &arg) {
-		memcpy(this, &arg, sizeof(BVHReference));
+		if(&arg != this) {
+			memcpy(this, &arg, sizeof(BVHReference));
+		}
 		return *this;
 	}
 
diff --git a/intern/cycles/bvh/bvh_split.cpp b/intern/cycles/bvh/bvh_split.cpp
index 07c35c08c18..2290c4143ad 100644
--- a/intern/cycles/bvh/bvh_split.cpp
+++ b/intern/cycles/bvh/bvh_split.cpp
@@ -253,7 +253,7 @@ void BVHSpatialSplit::split_reference(BVHBuild *builder, BVHReference& left, BVH
 	Object *ob = builder->objects[ref.prim_object()];
 	const Mesh *mesh = ob->mesh;
 
-	if (ref.prim_type() & PRIMITIVE_ALL_TRIANGLE) {
+	if(ref.prim_type() & PRIMITIVE_ALL_TRIANGLE) {
 		const int *inds = mesh->triangles[ref.prim_index()].v;
 		const float3 *verts = &mesh->verts[0];
 		const float3* v1 = &verts[inds[2]];
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 3a33b8fb68b..5cad8e1b49c 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -33,6 +33,13 @@ CCL_NAMESPACE_BEGIN
 
 /* Device */
 
+Device::~Device()
+{
+	if(!background && vertex_buffer != 0) {
+		glDeleteBuffers(1, &vertex_buffer);
+	}
+}
+
 void Device::pixels_alloc(device_memory& mem)
 {
 	mem_alloc(mem, MEM_READ_WRITE);
@@ -51,7 +58,7 @@ void Device::pixels_free(device_memory& mem)
 	mem_free(mem);
 }
 
-void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int width, int height, bool transparent,
+void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dx, int dy, int width, int height, bool transparent,
 	const DeviceDrawParams &draw_params)
 {
 	pixels_copy_from(rgba, y, w, h);
@@ -67,6 +74,9 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int w
 		/* for multi devices, this assumes the inefficient method that we allocate
 		 * all pixels on the device even though we only render to a subset */
 		GLhalf *data_pointer = (GLhalf*)rgba.data_pointer;
+		float vbuffer[16], *basep;
+		float *vp = NULL;
+
 		data_pointer += 4*y*w;
 
 		/* draw half float texture, GLSL shader for display transform assumed to be bound */
@@ -83,23 +93,63 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int w
 			draw_params.bind_display_space_shader_cb();
 		}
 
-		glPushMatrix();
-		glTranslatef(0.0f, (float)dy, 0.0f);
+		if(GLEW_VERSION_1_5) {
+			if(!vertex_buffer)
+				glGenBuffers(1, &vertex_buffer);
+
+			glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
+			/* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */
+			glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
 
-		glBegin(GL_QUADS);
-		
-		glTexCoord2f(0.0f, 0.0f);
-		glVertex2f(0.0f, 0.0f);
-		glTexCoord2f(1.0f, 0.0f);
-		glVertex2f((float)width, 0.0f);
-		glTexCoord2f(1.0f, 1.0f);
-		glVertex2f((float)width, (float)height);
-		glTexCoord2f(0.0f, 1.0f);
-		glVertex2f(0.0f, (float)height);
+			vp = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
 
-		glEnd();
+			basep = NULL;
+		}
+		else {
+			basep = vbuffer;
+			vp = vbuffer;
+		}
 
-		glPopMatrix();
+		if(vp) {
+			/* texture coordinate - vertex pair */
+			vp[0] = 0.0f;
+			vp[1] = 0.0f;
+			vp[2] = dx;
+			vp[3] = dy;
+
+			vp[4] = 1.0f;
+			vp[5] = 0.0f;
+			vp[6] = (float)width + dx;
+			vp[7] = dy;
+
+			vp[8] = 1.0f;
+			vp[9] = 1.0f;
+			vp[10] = (float)width + dx;
+			vp[11] = (float)height + dy;
+
+			vp[12] = 0.0f;
+			vp[13] = 1.0f;
+			vp[14] = dx;
+			vp[15] = (float)height + dy;
+
+			if(vertex_buffer)
+				glUnmapBuffer(GL_ARRAY_BUFFER);
+		}
+
+		glTexCoordPointer(2, GL_FLOAT, 4 * sizeof(float), basep);
+		glVertexPointer(2, GL_FLOAT, 4 * sizeof(float), ((char *)basep) + 2 * sizeof(float));
+
+		glEnableClientState(GL_VERTEX_ARRAY);
+		glEnableClientState(GL_TEXTURE_COORD_ARRAY);
+
+		glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+		glDisableClientState(GL_TEXTURE_COORD_ARRAY);
+		glDisableClientState(GL_VERTEX_ARRAY);
+
+		if(vertex_buffer) {
+			glBindBuffer(GL_ARRAY_BUFFER, 0);
+		}
 
 		if(draw_params.unbind_display_space_shader_cb) {
 			draw_params.unbind_display_space_shader_cb();
@@ -113,7 +163,7 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int w
 		/* fallback for old graphics cards that don't support GLSL, half float,
 		 * and non-power-of-two textures */
 		glPixelZoom((float)width/(float)w, (float)height/(float)h);
-		glRasterPos2f(0, dy);
+		glRasterPos2f(dx, dy);
 
 		uint8_t *pixels = (uint8_t*)rgba.data_pointer;
 
@@ -277,13 +327,10 @@ string Device::device_capabilities()
 #endif
 
 #ifdef WITH_OPENCL
-	/* TODO(sergey): Needs proper usable implementation. */
-	/*
 	if(device_opencl_init()) {
 		capabilities += "\nOpenCL device capabilities:\n";
 		capabilities += device_opencl_capabilities();
 	}
-	*/
 #endif
 
 	return capabilities;
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 7c17f7f4112..6b4a190bbf0 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -55,6 +55,7 @@ public:
 	bool advanced_shading;
 	bool pack_images;
 	bool extended_images; /* flag for GPU and Multi device */
+	bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */
 	vector<DeviceInfo> multi_devices;
 
 	DeviceInfo()
@@ -66,25 +67,76 @@ public:
 		advanced_shading = true;
 		pack_images = false;
 		extended_images = false;
+		use_split_kernel = false;
+	}
+};
+
+class DeviceRequestedFeatures {
+public:
+	/* Use experimental feature set. */
+	bool experimental;
+
+	/* Maximum number of closures in shader trees. */
+	int max_closure;
+
+	/* Selective nodes compilation. */
+
+	/* Identifier of a node group up to which all the nodes needs to be
+	 * compiled in. Nodes from higher group indices will be ignores.
+	 */
+	int max_nodes_group;
+
+	/* Features bitfield indicating which features from the requested group
+	 * will be compiled in. Nodes which corresponds to features which are not
+	 * in this bitfield will be ignored even if they're in the requested group.
+	 */
+	int nodes_features;
+
+	/* BVH/sampling kernel features. */
+	bool use_hair;
+	bool use_object_motion;
+	bool use_camera_motion;
+
+	DeviceRequestedFeatures()
+	{
+		/* TODO(sergey): Find more meaningful defaults. */
+		experimental = false;
+		max_closure = 0;
+		max_nodes_group = 0;
+		nodes_features = 0;
+		use_hair = false;
+		use_object_motion = false;
+		use_camera_motion = false;
+	}
+
+	bool modified(const DeviceRequestedFeatures& requested_features)
+	{
+		return !(experimental == requested_features.experimental &&
+		         max_closure == requested_features.max_closure &&
+		         max_nodes_group == requested_features.max_nodes_group &&
+		         nodes_features == requested_features.nodes_features);
 	}
 };
 
 /* Device */
 
 struct DeviceDrawParams {
-	boost::function<void(void)> bind_display_space_shader_cb;
-	boost::function<void(void)> unbind_display_space_shader_cb;
+	function<void(void)> bind_display_space_shader_cb;
+	function<void(void)> unbind_display_space_shader_cb;
 };
 
 class Device {
 protected:
-	Device(DeviceInfo& info_, Stats &stats_, bool background) : background(background), info(info_), stats(stats_) {}
+	Device(DeviceInfo& info_, Stats &stats_, bool background) : background(background), vertex_buffer(0), info(info_), stats(stats_) {}
 
 	bool background;
 	string error_msg;
 
+	/* used for real time display */
+	unsigned int vertex_buffer;
+
 public:
-	virtual ~Device() {}
+	virtual ~Device();
 
 	/* info */
 	DeviceInfo info;
@@ -106,9 +158,15 @@ public:
 	virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
 
 	/* texture memory */
-	virtual void tex_alloc(const char *name, device_memory& mem,
-		InterpolationType interpolation = INTERPOLATION_NONE, bool periodic = false) {};
-	virtual void tex_free(device_memory& mem) {};
+	virtual void tex_alloc(const char * /*name*/,
+	                       device_memory& /*mem*/,
+	                       InterpolationType interpolation = INTERPOLATION_NONE,
+	                       bool periodic = false)
+	{
+		(void)interpolation;  /* Ignored. */
+		(void)periodic;  /* Ignored. */
+	};
+	virtual void tex_free(device_memory& /*mem*/) {};
 
 	/* pixel memory */
 	virtual void pixels_alloc(device_memory& mem);
@@ -119,7 +177,9 @@ public:
 	virtual void *osl_memory() { return NULL; }
 
 	/* load/compile kernels, must be called before adding tasks */ 
-	virtual bool load_kernels(bool experimental) { return true; }
+	virtual bool load_kernels(
+	        const DeviceRequestedFeatures& /*requested_features*/)
+	{ return true; }
 
 	/* tasks */
 	virtual int get_split_task_count(DeviceTask& task) = 0;
@@ -129,7 +189,7 @@ public:
 	
 	/* opengl drawing */
 	virtual void draw_pixels(device_memory& mem, int y, int w, int h,
-		int dy, int width, int height, bool transparent,
+		int dx, int dy, int width, int height, bool transparent,
 		const DeviceDrawParams &draw_params);
 
 #ifdef WITH_NETWORK
@@ -138,8 +198,8 @@ public:
 #endif
 
 	/* multi device */
-	virtual void map_tile(Device *sub_device, RenderTile& tile) {}
-	virtual int device_number(Device *sub_device) { return 0; }
+	virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {}
+	virtual int device_number(Device * /*sub_device*/) { return 0; }
 
 	/* static */
 	static Device *create(DeviceInfo& info, Stats &stats, bool background = true);
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 9abcf9167d5..013f656e31c 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -19,6 +19,15 @@
 
 /* So ImathMath is included before our kernel_cpu_compat. */
 #ifdef WITH_OSL
+#  if defined(_MSC_VER)
+/* Prevent OSL from polluting the context with weird macros from windows.h.
+ * TODO(sergey): Ideally it's only enough to have class/struct declarations in
+ * the header and skip header include here.
+ */
+#    define NOGDI
+#    define NOMINMAX
+#    define WIN32_LEAN_AND_MEAN
+#  endif
 #  include <OSL/oslexec.h>
 #endif
 
@@ -38,6 +47,7 @@
 #include "util_debug.h"
 #include "util_foreach.h"
 #include "util_function.h"
+#include "util_logging.h"
 #include "util_opengl.h"
 #include "util_progress.h"
 #include "util_system.h"
@@ -75,19 +85,21 @@ public:
 		task_pool.stop();
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType type)
+	void mem_alloc(device_memory& mem, MemoryType /*type*/)
 	{
 		mem.device_pointer = mem.data_pointer;
 		mem.device_size = mem.memory_size();
 		stats.mem_alloc(mem.device_size);
 	}
 
-	void mem_copy_to(device_memory& mem)
+	void mem_copy_to(device_memory& /*mem*/)
 	{
 		/* no-op */
 	}
 
-	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
+	void mem_copy_from(device_memory& /*mem*/,
+	                   int /*y*/, int /*w*/, int /*h*/,
+	                   int /*elem*/)
 	{
 		/* no-op */
 	}
@@ -111,8 +123,9 @@ public:
 		kernel_const_copy(&kernel_globals, name, host, size);
 	}
 
-	void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic)
+	void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool /*periodic*/)
 	{
+		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
 		kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height, mem.data_depth, interpolation);
 		mem.device_pointer = mem.data_pointer;
 		mem.device_size = mem.memory_size();
@@ -207,7 +220,7 @@ public:
 			int end_sample = tile.start_sample + tile.num_samples;
 
 			for(int sample = start_sample; sample < end_sample; sample++) {
-				if (task.get_cancel() || task_pool.canceled()) {
+				if(task.get_cancel() || task_pool.canceled()) {
 					if(task.need_finish_queue == false)
 						break;
 				}
@@ -368,7 +381,7 @@ public:
 
 	int get_split_task_count(DeviceTask& task)
 	{
-		if (task.type == DeviceTask::SHADER)
+		if(task.type == DeviceTask::SHADER)
 			return task.get_subtask_count(TaskScheduler::num_threads(), 256);
 		else
 			return task.get_subtask_count(TaskScheduler::num_threads());
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 79a1a2b7fe1..4a9c27f5429 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -185,7 +185,7 @@ public:
 		cuda_assert(cuCtxDestroy(cuContext));
 	}
 
-	bool support_device(bool experimental)
+	bool support_device(bool /*experimental*/)
 	{
 		int major, minor;
 		cuDeviceComputeCapability(&major, &minor, cuDevId);
@@ -266,7 +266,7 @@ public:
 			printf("CUDA version %d.%d detected, build may succeed but only CUDA 6.5 is officially supported.\n", cuda_version/10, cuda_version%10);
 
 		/* compile */
-		string kernel = path_join(kernel_path, "kernel.cu");
+		string kernel = path_join(kernel_path, path_join("kernels", path_join("cuda", "kernel.cu")));
 		string include = kernel_path;
 		const int machine = system_cpu_bits();
 
@@ -281,7 +281,7 @@ public:
 			nvcc, major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version);
 		
 		if(experimental)
-			command += " -D__KERNEL_CUDA_EXPERIMENTAL__";
+			command += " -D__KERNEL_EXPERIMENTAL__";
 
 		if(getenv("CYCLES_CUDA_EXTRA_CFLAGS")) {
 			command += string(" ") + getenv("CYCLES_CUDA_EXTRA_CFLAGS");
@@ -309,18 +309,18 @@ public:
 		return cubin;
 	}
 
-	bool load_kernels(bool experimental)
+	bool load_kernels(const DeviceRequestedFeatures& requested_features)
 	{
 		/* check if cuda init succeeded */
 		if(cuContext == 0)
 			return false;
 		
 		/* check if GPU is supported */
-		if(!support_device(experimental))
+		if(!support_device(requested_features.experimental))
 			return false;
 
 		/* get kernel */
-		string cubin = compile_kernel(experimental);
+		string cubin = compile_kernel(requested_features.experimental);
 
 		if(cubin == "")
 			return false;
@@ -331,7 +331,7 @@ public:
 		string cubin_data;
 		CUresult result;
 
-		if (path_read_text(cubin, cubin_data))
+		if(path_read_text(cubin, cubin_data))
 			result = cuModuleLoadData(&cuModule, cubin_data.c_str());
 		else
 			result = CUDA_ERROR_FILE_NOT_FOUND;
@@ -344,7 +344,7 @@ public:
 		return (result == CUDA_SUCCESS);
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType type)
+	void mem_alloc(device_memory& mem, MemoryType /*type*/)
 	{
 		cuda_push_context();
 		CUdeviceptr device_pointer;
@@ -419,6 +419,7 @@ public:
 	void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic)
 	{
 		/* todo: support 3D textures, only CPU for now */
+		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
 
 		/* determine format */
 		CUarray_format_enum format;
@@ -483,7 +484,7 @@ public:
 				if(interpolation == INTERPOLATION_CLOSEST) {
 					cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
 				}
-				else if (interpolation == INTERPOLATION_LINEAR) {
+				else if(interpolation == INTERPOLATION_LINEAR) {
 					cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
 				}
 				else {/* CUBIC and SMART are unsupported for CUDA */
@@ -879,11 +880,12 @@ public:
 		}
 	}
 
-	void draw_pixels(device_memory& mem, int y, int w, int h, int dy, int width, int height, bool transparent,
+	void draw_pixels(device_memory& mem, int y, int w, int h, int dx, int dy, int width, int height, bool transparent,
 		const DeviceDrawParams &draw_params)
 	{
 		if(!background) {
 			PixelMem pmem = pixel_mem_map[mem.device_pointer];
+			float *vpointer;
 
 			cuda_push_context();
 
@@ -917,23 +919,52 @@ public:
 				draw_params.bind_display_space_shader_cb();
 			}
 
-			glPushMatrix();
-			glTranslatef(0.0f, (float)dy, 0.0f);
-				
-			glBegin(GL_QUADS);
-			
-			glTexCoord2f(0.0f, 0.0f);
-			glVertex2f(0.0f, 0.0f);
-			glTexCoord2f((float)w/(float)pmem.w, 0.0f);
-			glVertex2f((float)width, 0.0f);
-			glTexCoord2f((float)w/(float)pmem.w, (float)h/(float)pmem.h);
-			glVertex2f((float)width, (float)height);
-			glTexCoord2f(0.0f, (float)h/(float)pmem.h);
-			glVertex2f(0.0f, (float)height);
+			if(!vertex_buffer)
+				glGenBuffers(1, &vertex_buffer);
+
+			glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
+			/* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */
+			glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
+
+			vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
+
+			if(vpointer) {
+				/* texture coordinate - vertex pair */
+				vpointer[0] = 0.0f;
+				vpointer[1] = 0.0f;
+				vpointer[2] = dx;
+				vpointer[3] = dy;
+
+				vpointer[4] = (float)w/(float)pmem.w;
+				vpointer[5] = 0.0f;
+				vpointer[6] = (float)width + dx;
+				vpointer[7] = dy;
+
+				vpointer[8] = (float)w/(float)pmem.w;
+				vpointer[9] = (float)h/(float)pmem.h;
+				vpointer[10] = (float)width + dx;
+				vpointer[11] = (float)height + dy;
+
+				vpointer[12] = 0.0f;
+				vpointer[13] = (float)h/(float)pmem.h;
+				vpointer[14] = dx;
+				vpointer[15] = (float)height + dy;
+
+				glUnmapBuffer(GL_ARRAY_BUFFER);
+			}
+
+			glTexCoordPointer(2, GL_FLOAT, 4 * sizeof(float), 0);
+			glVertexPointer(2, GL_FLOAT, 4 * sizeof(float), (char *)NULL + 2 * sizeof(float));
+
+			glEnableClientState(GL_VERTEX_ARRAY);
+			glEnableClientState(GL_TEXTURE_COORD_ARRAY);
+
+			glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
 
-			glEnd();
+			glDisableClientState(GL_TEXTURE_COORD_ARRAY);
+			glDisableClientState(GL_VERTEX_ARRAY);
 
-			glPopMatrix();
+			glBindBuffer(GL_ARRAY_BUFFER, 0);
 
 			if(draw_params.unbind_display_space_shader_cb) {
 				draw_params.unbind_display_space_shader_cb();
@@ -950,7 +981,7 @@ public:
 			return;
 		}
 
-		Device::draw_pixels(mem, y, w, h, dy, width, height, transparent, draw_params);
+		Device::draw_pixels(mem, y, w, h, dx, dy, width, height, transparent, draw_params);
 	}
 
 	void thread_run(DeviceTask *task)
@@ -966,7 +997,7 @@ public:
 				int end_sample = tile.start_sample + tile.num_samples;
 
 				for(int sample = start_sample; sample < end_sample; sample++) {
-					if (task->get_cancel()) {
+					if(task->get_cancel()) {
 						if(task->need_finish_queue == false)
 							break;
 					}
@@ -999,7 +1030,7 @@ public:
 		}
 	};
 
-	int get_split_task_count(DeviceTask& task)
+	int get_split_task_count(DeviceTask& /*task*/)
 	{
 		return 1;
 	}
@@ -1035,12 +1066,12 @@ bool device_cuda_init(void)
 	static bool initialized = false;
 	static bool result = false;
 
-	if (initialized)
+	if(initialized)
 		return result;
 
 	initialized = true;
 	int cuew_result = cuewInit();
-	if (cuew_result == CUEW_SUCCESS) {
+	if(cuew_result == CUEW_SUCCESS) {
 		VLOG(1) << "CUEW initialization succeeded";
 		if(CUDADevice::have_precompiled_kernels()) {
 			VLOG(1) << "Found precompiled  kernels";
@@ -1091,14 +1122,20 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 	}
 	
 	vector<DeviceInfo> display_devices;
-	
+
 	for(int num = 0; num < count; num++) {
 		char name[256];
 		int attr;
-		
+
 		if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS)
 			continue;
 
+		int major, minor;
+		cuDeviceComputeCapability(&major, &minor, num);
+		if(major < 2) {
+			continue;
+		}
+
 		DeviceInfo info;
 
 		info.type = DEVICE_CUDA;
@@ -1106,8 +1143,6 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 		info.id = string_printf("CUDA_%d", num);
 		info.num = num;
 
-		int major, minor;
-		cuDeviceComputeCapability(&major, &minor, num);
 		info.advanced_shading = (major >= 2);
 		info.extended_images = (major >= 3);
 		info.pack_images = false;
@@ -1132,7 +1167,7 @@ string device_cuda_capabilities(void)
 		if(result != CUDA_ERROR_NO_DEVICE) {
 			return string("Error initializing CUDA: ") + cuewErrorString(result);
 		}
-		return "No CUDA device found";
+		return "No CUDA device found\n";
 	}
 
 	int count;
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 9aac86daa1d..c61e550151f 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -25,6 +25,7 @@
 
 #include "util_foreach.h"
 #include "util_list.h"
+#include "util_logging.h"
 #include "util_map.h"
 #include "util_time.h"
 
@@ -88,10 +89,10 @@ public:
 		return error_msg;
 	}
 
-	bool load_kernels(bool experimental)
+	bool load_kernels(const DeviceRequestedFeatures& requested_features)
 	{
 		foreach(SubDevice& sub, devices)
-			if(!sub.device->load_kernels(experimental))
+			if(!sub.device->load_kernels(requested_features))
 				return false;
 
 		return true;
@@ -170,6 +171,8 @@ public:
 
 	void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic)
 	{
+		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
+
 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = 0;
 			sub.device->tex_alloc(name, mem, interpolation, periodic);
@@ -233,7 +236,7 @@ public:
 		mem.device_pointer = tmp;
 	}
 
-	void draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int width, int height, bool transparent,
+	void draw_pixels(device_memory& rgba, int y, int w, int h, int dx, int dy, int width, int height, bool transparent,
 		const DeviceDrawParams &draw_params)
 	{
 		device_ptr tmp = rgba.device_pointer;
@@ -248,7 +251,7 @@ public:
 			/* adjust math for w/width */
 
 			rgba.device_pointer = sub.ptr_map[tmp];
-			sub.device->draw_pixels(rgba, sy, w, sh, sdy, width, sheight, transparent, draw_params);
+			sub.device->draw_pixels(rgba, sy, w, sh, dx, sdy, width, sheight, transparent, draw_params);
 			i++;
 		}
 
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index 4733482bf4e..1d6066c94dc 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -19,6 +19,7 @@
 #include "device_network.h"
 
 #include "util_foreach.h"
+#include "util_logging.h"
 
 #if defined(WITH_NETWORK)
 
@@ -164,6 +165,8 @@ public:
 
 	void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic)
 	{
+		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
+
 		thread_scoped_lock lock(rpc_lock);
 
 		mem.device_pointer = ++mem_counter;
@@ -194,7 +197,7 @@ public:
 		}
 	}
 
-	bool load_kernels(bool experimental)
+	bool load_kernels(const DeviceRequestedFeatures& requested_features)
 	{
 		if(error_func.have_error())
 			return false;
@@ -202,7 +205,10 @@ public:
 		thread_scoped_lock lock(rpc_lock);
 
 		RPCSend snd(socket, &error_func, "load_kernels");
-		snd.add(experimental);
+		snd.add(requested_features.experimental);
+		snd.add(requested_features.max_closure);
+		snd.add(requested_features.max_nodes_group);
+		snd.add(requested_features.nodes_features);
 		snd.write();
 
 		bool result;
@@ -269,7 +275,7 @@ public:
 				lock.unlock();
 
 				TileList::iterator it = tile_list_find(the_tiles, tile);
-				if (it != the_tiles.end()) {
+				if(it != the_tiles.end()) {
 					tile.buffers = it->buffers;
 					the_tiles.erase(it);
 				}
@@ -605,11 +611,14 @@ protected:
 			device->tex_free(mem);
 		}
 		else if(rcv.name == "load_kernels") {
-			bool experimental;
-			rcv.read(experimental);
+			DeviceRequestedFeatures requested_features;
+			rcv.read(requested_features.experimental);
+			rcv.read(requested_features.max_closure);
+			rcv.read(requested_features.max_nodes_group);
+			rcv.read(requested_features.nodes_features);
 
 			bool result;
-			result = device->load_kernels(experimental);
+			result = device->load_kernels(requested_features);
 			RPCSend snd(socket, &error_func, "load_kernels");
 			snd.add(result);
 			snd.write();
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index a5bf35a63c8..7ab2b412f0d 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -28,6 +28,7 @@
 #include "buffers.h"
 
 #include "util_foreach.h"
+#include "util_logging.h"
 #include "util_map.h"
 #include "util_math.h"
 #include "util_md5.h"
@@ -39,7 +40,39 @@ CCL_NAMESPACE_BEGIN
 
 #define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
 
-static cl_device_type opencl_device_type()
+/* Macro declarations used with split kernel */
+
+/* Macro to enable/disable work-stealing */
+#define __WORK_STEALING__
+
+#define SPLIT_KERNEL_LOCAL_SIZE_X 64
+#define SPLIT_KERNEL_LOCAL_SIZE_Y 1
+
+/* This value may be tuned according to the scene we are rendering.
+ *
+ * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected
+ * ray-bounces will improve performance.
+ */
+#define PATH_ITER_INC_FACTOR 8
+
+/* When allocate global memory in chunks. We may not be able to
+ * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks;
+ * Since some bytes may be needed for aligning chunks of memory;
+ * This is the amount of memory that we dedicate for that purpose.
+ */
+#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
+
+struct OpenCLPlatformDevice {
+	OpenCLPlatformDevice(cl_platform_id platform_id,
+	                     cl_device_id device_id)
+	  : platform_id(platform_id), device_id(device_id) {}
+	cl_platform_id platform_id;
+	cl_device_id device_id;
+};
+
+namespace {
+
+cl_device_type opencl_device_type()
 {
 	char *device = getenv("CYCLES_OPENCL_TEST");
 
@@ -59,12 +92,12 @@ static cl_device_type opencl_device_type()
 	return CL_DEVICE_TYPE_ALL;
 }
 
-static bool opencl_kernel_use_debug()
+bool opencl_kernel_use_debug()
 {
 	return (getenv("CYCLES_OPENCL_DEBUG") != NULL);
 }
 
-static bool opencl_kernel_use_advanced_shading(const string& platform)
+bool opencl_kernel_use_advanced_shading(const string& platform)
 {
 	/* keep this in sync with kernel_types.h! */
 	if(platform == "NVIDIA CUDA")
@@ -72,44 +105,182 @@ static bool opencl_kernel_use_advanced_shading(const string& platform)
 	else if(platform == "Apple")
 		return false;
 	else if(platform == "AMD Accelerated Parallel Processing")
-		return false;
+		return true;
 	else if(platform == "Intel(R) OpenCL")
 		return true;
 
 	return false;
 }
 
-static string opencl_kernel_build_options(const string& platform, const string *debug_src = NULL)
+bool opencl_kernel_use_split(const string& platform_name,
+                             const cl_device_type device_type)
 {
-	string build_options = " -cl-fast-relaxed-math ";
-
-	if(platform == "NVIDIA CUDA")
-		build_options += "-D__KERNEL_OPENCL_NVIDIA__ -cl-nv-maxrregcount=32 -cl-nv-verbose ";
-
-	else if(platform == "Apple")
-		build_options += "-D__KERNEL_OPENCL_APPLE__ ";
+	if(getenv("CYCLES_OPENCL_SPLIT_KERNEL_TEST") != NULL) {
+		return true;
+	}
+	/* TODO(sergey): Replace string lookups with more enum-like API,
+	 * similar to device/vendor checks blender's gpu.
+	 */
+	if(platform_name == "AMD Accelerated Parallel Processing" &&
+	   device_type == CL_DEVICE_TYPE_GPU)
+	{
+		return true;
+	}
+	return false;
+}
 
-	else if(platform == "AMD Accelerated Parallel Processing")
-		build_options += "-D__KERNEL_OPENCL_AMD__ ";
+bool opencl_device_supported(const string& platform_name,
+                             const cl_device_id device_id)
+{
+	cl_device_type device_type;
+	clGetDeviceInfo(device_id,
+	                CL_DEVICE_TYPE,
+	                sizeof(cl_device_type),
+	                &device_type,
+	                NULL);
+	if(platform_name == "AMD Accelerated Parallel Processing" &&
+	   device_type == CL_DEVICE_TYPE_GPU)
+	{
+		return true;;
+	}
+	return false;
+}
 
-	else if(platform == "Intel(R) OpenCL") {
-		build_options += "-D__KERNEL_OPENCL_INTEL_CPU__";
+bool opencl_platform_version_check(cl_platform_id platform,
+                                   string *error = NULL)
+{
+	const int req_major = 1, req_minor = 1;
+	int major, minor;
+	char version[256];
+	clGetPlatformInfo(platform,
+	                  CL_PLATFORM_VERSION,
+	                  sizeof(version),
+	                  &version,
+	                  NULL);
+	if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
+		if(error != NULL) {
+			*error = string_printf("OpenCL: failed to parse platform version string (%s).", version);
+		}
+		return false;
+	}
+	if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
+		if(error != NULL) {
+			*error = string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor);
+		}
+		return false;
+	}
+	if(error != NULL) {
+		*error = "";
+	}
+	return true;
+}
 
-		/* options for gdb source level kernel debugging. this segfaults on linux currently */
-		if(opencl_kernel_use_debug() && debug_src)
-			build_options += "-g -s \"" + *debug_src + "\"";
+bool opencl_device_version_check(cl_device_id device,
+                                 string *error = NULL)
+{
+	const int req_major = 1, req_minor = 1;
+	int major, minor;
+	char version[256];
+	clGetDeviceInfo(device,
+	                CL_DEVICE_OPENCL_C_VERSION,
+	                sizeof(version),
+	                &version,
+	                NULL);
+	if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
+		if(error != NULL) {
+			*error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
+		}
+		return false;
+	}
+	if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
+		if(error != NULL) {
+			*error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor);
+		}
+		return false;
+	}
+	if(error != NULL) {
+		*error = "";
 	}
+	return true;
+}
 
-	if(opencl_kernel_use_debug())
-		build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
+void opencl_get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices)
+{
+	const bool force_all_platforms =
+	        (getenv("CYCLES_OPENCL_TEST") != NULL) ||
+	        (getenv("CYCLES_OPENCL_SPLIT_KERNEL_TEST")) != NULL;
+	const cl_device_type device_type = opencl_device_type();
 
-#ifdef WITH_CYCLES_DEBUG
-	build_options += "-D__KERNEL_DEBUG__ ";
-#endif
+	vector<cl_device_id> device_ids;
+	cl_uint num_devices = 0;
+	vector<cl_platform_id> platform_ids;
+	cl_uint num_platforms = 0;
+
+	/* Number of the devices added to the device info list. */
+	cl_uint num_added_devices = 0;
 
-	return build_options;
+	/* Get devices. */
+	if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS ||
+	   num_platforms == 0)
+	{
+		return;
+	}
+	platform_ids.resize(num_platforms);
+	if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS) {
+		return;
+	}
+	/* Devices are numbered consecutively across platforms. */
+	int num_base = 0;
+	for(int platform = 0;
+	    platform < num_platforms;
+	    platform++, num_base += num_added_devices)
+	{
+		cl_platform_id platform_id = platform_ids[platform];
+		num_devices = num_added_devices = 0;
+		if(clGetDeviceIDs(platform_id,
+		                  device_type,
+		                  0,
+		                  NULL,
+		                  &num_devices) != CL_SUCCESS || num_devices == 0)
+		{
+			continue;
+		}
+		device_ids.resize(num_devices);
+		if(clGetDeviceIDs(platform_id,
+		                  device_type,
+		                  num_devices,
+		                  &device_ids[0],
+		                  NULL) != CL_SUCCESS)
+		{
+			continue;
+		}
+		if(!opencl_platform_version_check(platform_ids[platform])) {
+			continue;
+		}
+		char pname[256];
+		clGetPlatformInfo(platform_id,
+		                  CL_PLATFORM_NAME,
+		                  sizeof(pname),
+		                  &pname,
+		                  NULL);
+		string platform_name = pname;
+		for(int num = 0; num < num_devices; num++) {
+			cl_device_id device_id = device_ids[num];
+			if(!opencl_device_version_check(device_id)) {
+				continue;
+			}
+			if(force_all_platforms ||
+			   opencl_device_supported(platform_name, device_id))
+			{
+				usable_devices->push_back(OpenCLPlatformDevice(platform_id,
+				                                               device_id));
+			}
+		}
+	}
 }
 
+}  /* namespace */
+
 /* thread safe cache for contexts and programs */
 class OpenCLCache
 {
@@ -117,14 +288,18 @@ class OpenCLCache
 	{
 		thread_mutex *mutex;
 		cl_context context;
-		cl_program program;
+		/* cl_program for shader, bake, film_convert kernels (used in OpenCLDeviceBase) */
+		cl_program ocl_dev_base_program;
+		/* cl_program for megakernel (used in OpenCLDeviceMegaKernel) */
+		cl_program ocl_dev_megakernel_program;
 
-		Slot() : mutex(NULL), context(NULL), program(NULL) {}
+		Slot() : mutex(NULL), context(NULL), ocl_dev_base_program(NULL), ocl_dev_megakernel_program(NULL) {}
 
 		Slot(const Slot &rhs)
 			: mutex(rhs.mutex)
 			, context(rhs.context)
-			, program(rhs.program)
+			, ocl_dev_base_program(rhs.ocl_dev_base_program)
+			, ocl_dev_megakernel_program(rhs.ocl_dev_megakernel_program)
 		{
 			/* copy can only happen in map insert, assert that */
 			assert(mutex == NULL);
@@ -235,6 +410,12 @@ class OpenCLCache
 	}
 
 public:
+
+	enum ProgramName {
+		OCL_DEV_BASE_PROGRAM,
+		OCL_DEV_MEGAKERNEL_PROGRAM,
+	};
+
 	/* see get_something comment */
 	static cl_context get_context(cl_platform_id platform, cl_device_id device,
 		thread_scoped_lock &slot_locker)
@@ -253,10 +434,21 @@ public:
 	}
 
 	/* see get_something comment */
-	static cl_program get_program(cl_platform_id platform, cl_device_id device,
+	static cl_program get_program(cl_platform_id platform, cl_device_id device, ProgramName program_name,
 		thread_scoped_lock &slot_locker)
 	{
-		cl_program program = get_something<cl_program>(platform, device, &Slot::program, slot_locker);
+		cl_program program = NULL;
+
+		if(program_name == OCL_DEV_BASE_PROGRAM) {
+			/* Get program related to OpenCLDeviceBase */
+			program = get_something<cl_program>(platform, device, &Slot::ocl_dev_base_program, slot_locker);
+		}
+		else if(program_name == OCL_DEV_MEGAKERNEL_PROGRAM) {
+			/* Get program related to megakernel */
+			program = get_something<cl_program>(platform, device, &Slot::ocl_dev_megakernel_program, slot_locker);
+		} else {
+			assert(!"Invalid program name");
+		}
 
 		if(!program)
 			return NULL;
@@ -283,10 +475,18 @@ public:
 	}
 
 	/* see store_something comment */
-	static void store_program(cl_platform_id platform, cl_device_id device, cl_program program,
+	static void store_program(cl_platform_id platform, cl_device_id device, cl_program program, ProgramName program_name,
 		thread_scoped_lock &slot_locker)
 	{
-		store_something<cl_program>(platform, device, program, &Slot::program, slot_locker);
+		if(program_name == OCL_DEV_BASE_PROGRAM) {
+			store_something<cl_program>(platform, device, program, &Slot::ocl_dev_base_program, slot_locker);
+		}
+		else if(program_name == OCL_DEV_MEGAKERNEL_PROGRAM) {
+			store_something<cl_program>(platform, device, program, &Slot::ocl_dev_megakernel_program, slot_locker);
+		} else {
+			assert(!"Invalid program name\n");
+			return;
+		}
 
 		/* increment reference count in OpenCL.
 		 * The caller is going to release the object when done with it. */
@@ -303,8 +503,10 @@ public:
 		thread_scoped_lock cache_lock(self.cache_lock);
 
 		foreach(CacheMap::value_type &item, self.cache) {
-			if(item.second.program != NULL)
-				clReleaseProgram(item.second.program);
+			if(item.second.ocl_dev_base_program != NULL)
+				clReleaseProgram(item.second.ocl_dev_base_program);
+			if(item.second.ocl_dev_megakernel_program != NULL)
+				clReleaseProgram(item.second.ocl_dev_megakernel_program);
 			if(item.second.context != NULL)
 				clReleaseContext(item.second.context);
 		}
@@ -313,7 +515,7 @@ public:
 	}
 };
 
-class OpenCLDevice : public Device
+class OpenCLDeviceBase : public Device
 {
 public:
 	DedicatedTaskPool task_pool;
@@ -322,7 +524,6 @@ public:
 	cl_platform_id cpPlatform;
 	cl_device_id cdDevice;
 	cl_program cpProgram;
-	cl_kernel ckPathTraceKernel;
 	cl_kernel ckFilmConvertByteKernel;
 	cl_kernel ckFilmConvertHalfFloatKernel;
 	cl_kernel ckShaderKernel;
@@ -384,7 +585,7 @@ public:
 		}
 	}
 
-	OpenCLDevice(DeviceInfo& info, Stats &stats, bool background_)
+	OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
 	: Device(info, stats, background_)
 	{
 		cpPlatform = NULL;
@@ -392,7 +593,6 @@ public:
 		cxContext = NULL;
 		cqCommandQueue = NULL;
 		cpProgram = NULL;
-		ckPathTraceKernel = NULL;
 		ckFilmConvertByteKernel = NULL;
 		ckFilmConvertHalfFloatKernel = NULL;
 		ckShaderKernel = NULL;
@@ -400,71 +600,19 @@ public:
 		null_mem = 0;
 		device_initialized = false;
 
-		/* setup platform */
-		cl_uint num_platforms;
-
-		ciErr = clGetPlatformIDs(0, NULL, &num_platforms);
-		if(opencl_error(ciErr))
-			return;
-
-		if(num_platforms == 0) {
-			opencl_error("OpenCL: no platforms found.");
-			return;
-		}
-
-		vector<cl_platform_id> platforms(num_platforms, NULL);
-
-		ciErr = clGetPlatformIDs(num_platforms, &platforms[0], NULL);
-		if(opencl_error(ciErr)) {
-			fprintf(stderr, "clGetPlatformIDs failed \n");
-			return;
-		}
-
-		int num_base = 0;
-		int total_devices = 0;
-
-		for (int platform = 0; platform < num_platforms; platform++) {
-			cl_uint num_devices;
-
-			if(opencl_error(clGetDeviceIDs(platforms[platform], opencl_device_type(), 0, NULL, &num_devices)))
-				return;
-
-			total_devices += num_devices;
-
-			if(info.num - num_base >= num_devices) {
-				/* num doesn't refer to a device in this platform */
-				num_base += num_devices;
-				continue;
-			}
-
-			/* device is in this platform */
-			cpPlatform = platforms[platform];
-
-			/* get devices */
-			vector<cl_device_id> device_ids(num_devices, NULL);
-
-			if(opencl_error(clGetDeviceIDs(cpPlatform, opencl_device_type(), num_devices, &device_ids[0], NULL))) {
-				fprintf(stderr, "clGetDeviceIDs failed \n");
-				return;
-			}
-
-			cdDevice = device_ids[info.num - num_base];
-
-			char name[256];
-			clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(name), &name, NULL);
-			platform_name = name;
-
-			break;
-		}
-
-		if(total_devices == 0) {
+		vector<OpenCLPlatformDevice> usable_devices;
+		opencl_get_usable_devices(&usable_devices);
+		if(usable_devices.size() == 0) {
 			opencl_error("OpenCL: no devices found.");
 			return;
 		}
-		else if(!cdDevice) {
-			opencl_error("OpenCL: specified device not found.");
-			return;
-		}
+		assert(info.num < usable_devices.size());
+		OpenCLPlatformDevice& platform_device = usable_devices[info.num];
+		cpPlatform = platform_device.platform_id;
+		cdDevice = platform_device.device_id;
+		char name[256];
+		clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(name), &name, NULL);
+		platform_name = name;
 
 		{
 			/* try to use cached context */
@@ -500,12 +648,12 @@ public:
 		if(opencl_error(ciErr))
 			return;
 
-		fprintf(stderr,"Device init succes\n");
+		fprintf(stderr, "Device init success\n");
 		device_initialized = true;
 	}
 
 	static void CL_CALLBACK context_notify_callback(const char *err_info,
-		const void *private_info, size_t cb, void *user_data)
+		const void * /*private_info*/, size_t /*cb*/, void *user_data)
 	{
 		char name[256];
 		clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
@@ -515,38 +663,23 @@ public:
 
 	bool opencl_version_check()
 	{
-		char version[256];
-
-		int major, minor, req_major = 1, req_minor = 1;
-
-		clGetPlatformInfo(cpPlatform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL);
-
-		if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
-			opencl_error(string_printf("OpenCL: failed to parse platform version string (%s).", version));
-			return false;
-		}
-
-		if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
-			opencl_error(string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor));
-			return false;
-		}
-
-		clGetDeviceInfo(cdDevice, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL);
-
-		if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
-			opencl_error(string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version));
+		string error;
+		if(!opencl_platform_version_check(cpPlatform, &error)) {
+			opencl_error(error);
 			return false;
 		}
-
-		if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
-			opencl_error(string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor));
+		if(!opencl_device_version_check(cdDevice, &error)) {
+			opencl_error(error);
 			return false;
 		}
-
 		return true;
 	}
 
-	bool load_binary(const string& kernel_path, const string& clbin, const string *debug_src = NULL)
+	bool load_binary(const string& /*kernel_path*/,
+	                 const string& clbin,
+	                 string custom_kernel_build_options,
+	                 cl_program *program,
+	                 const string *debug_src = NULL)
 	{
 		/* read binary into memory */
 		vector<uint8_t> binary;
@@ -561,7 +694,7 @@ public:
 		size_t size = binary.size();
 		const uint8_t *bytes = &binary[0];
 
-		cpProgram = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
+		*program = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
 			&size, &bytes, &status, &ciErr);
 
 		if(opencl_error(status) || opencl_error(ciErr)) {
@@ -569,16 +702,16 @@ public:
 			return false;
 		}
 
-		if(!build_kernel(kernel_path, debug_src))
+		if(!build_kernel(program, custom_kernel_build_options, debug_src))
 			return false;
 
 		return true;
 	}
 
-	bool save_binary(const string& clbin)
+	bool save_binary(cl_program *program, const string& clbin)
 	{
 		size_t size = 0;
-		clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
+		clGetProgramInfo(*program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
 
 		if(!size)
 			return false;
@@ -586,7 +719,7 @@ public:
 		vector<uint8_t> binary(size);
 		uint8_t *bytes = &binary[0];
 
-		clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
+		clGetProgramInfo(*program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
 
 		if(!path_write_binary(clbin, binary)) {
 			opencl_error(string_printf("OpenCL failed to write cached binary %s.", clbin.c_str()));
@@ -596,24 +729,30 @@ public:
 		return true;
 	}
 
-	bool build_kernel(const string& kernel_path, const string *debug_src = NULL)
+	bool build_kernel(cl_program *kernel_program,
+	                  string custom_kernel_build_options,
+	                  const string *debug_src = NULL)
 	{
-		string build_options = opencl_kernel_build_options(platform_name, debug_src);
-	
-		ciErr = clBuildProgram(cpProgram, 0, NULL, build_options.c_str(), NULL, NULL);
+		string build_options;
+		build_options = kernel_build_options(debug_src) + custom_kernel_build_options;
+
+		ciErr = clBuildProgram(*kernel_program, 0, NULL, build_options.c_str(), NULL, NULL);
 
 		/* show warnings even if build is successful */
 		size_t ret_val_size = 0;
 
-		clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+		clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
 
 		if(ret_val_size > 1) {
-			vector<char> build_log(ret_val_size+1);
-			clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
+			vector<char> build_log(ret_val_size + 1);
+			clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
 
 			build_log[ret_val_size] = '\0';
-			fprintf(stderr, "OpenCL kernel build output:\n");
-			fprintf(stderr, "%s\n", &build_log[0]);
+			/* Skip meaningless empty output from the NVidia compiler. */
+			if(!(ret_val_size == 2 && build_log[0] == '\n')) {
+				fprintf(stderr, "OpenCL kernel build output:\n");
+				fprintf(stderr, "%s\n", &build_log[0]);
+			}
 		}
 
 		if(ciErr != CL_SUCCESS) {
@@ -624,12 +763,15 @@ public:
 		return true;
 	}
 
-	bool compile_kernel(const string& kernel_path, const string& kernel_md5, const string *debug_src = NULL)
+	bool compile_kernel(const string& kernel_path,
+	                    string source,
+	                    string custom_kernel_build_options,
+	                    cl_program *kernel_program,
+	                    const string *debug_src = NULL)
 	{
 		/* we compile kernels consisting of many files. unfortunately opencl
 		 * kernel caches do not seem to recognize changes in included files.
 		 * so we force recompile on changes by adding the md5 hash of all files */
-		string source = "#include \"kernel.cl\" // " + kernel_md5 + "\n";
 		source = path_source_replace_includes(source, kernel_path);
 
 		if(debug_src)
@@ -638,15 +780,19 @@ public:
 		size_t source_len = source.size();
 		const char *source_str = source.c_str();
 
-		cpProgram = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
+		*kernel_program = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
 
 		if(opencl_error(ciErr))
 			return false;
 
 		double starttime = time_dt();
 		printf("Compiling OpenCL kernel ...\n");
+		/* TODO(sergey): Report which kernel is being compiled
+		 * as well (megakernel or which of split kernels etc..).
+		 */
+		printf("Build flags: %s\n", custom_kernel_build_options.c_str());
 
-		if(!build_kernel(kernel_path, debug_src))
+		if(!build_kernel(kernel_program, custom_kernel_build_options, debug_src))
 			return false;
 
 		printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
@@ -654,7 +800,7 @@ public:
 		return true;
 	}
 
-	string device_md5_hash()
+	string device_md5_hash(string kernel_custom_build_options = "")
 	{
 		MD5Hash md5;
 		char version[256], driver[256], name[256], vendor[256];
@@ -669,13 +815,14 @@ public:
 		md5.append((uint8_t*)name, strlen(name));
 		md5.append((uint8_t*)driver, strlen(driver));
 
-		string options = opencl_kernel_build_options(platform_name);
+		string options = kernel_build_options();
+		options += kernel_custom_build_options;
 		md5.append((uint8_t*)options.c_str(), options.size());
 
 		return md5.get_hex();
 	}
 
-	bool load_kernels(bool experimental)
+	bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/)
 	{
 		/* verify if device was initialized */
 		if(!device_initialized) {
@@ -685,7 +832,7 @@ public:
 
 		/* try to use cached kernel */
 		thread_scoped_lock cache_locker;
-		cpProgram = OpenCLCache::get_program(cpPlatform, cdDevice, cache_locker);
+		cpProgram = OpenCLCache::get_program(cpPlatform, cdDevice, OpenCLCache::OCL_DEV_BASE_PROGRAM, cache_locker);
 
 		if(!cpProgram) {
 			/* verify we have right opencl version */
@@ -711,28 +858,27 @@ public:
 			}
 
 			/* if exists already, try use it */
-			if(path_exists(clbin) && load_binary(kernel_path, clbin, debug_src)) {
+			if(path_exists(clbin) && load_binary(kernel_path, clbin, "", &cpProgram)) {
 				/* kernel loaded from binary */
 			}
 			else {
+
+				string init_kernel_source = "#include \"kernels/opencl/kernel.cl\" // " + kernel_md5 + "\n";
+
 				/* if does not exist or loading binary failed, compile kernel */
-				if(!compile_kernel(kernel_path, kernel_md5, debug_src))
+				if(!compile_kernel(kernel_path, init_kernel_source, "", &cpProgram, debug_src))
 					return false;
 
 				/* save binary for reuse */
-				if(!save_binary(clbin))
+				if(!save_binary(&cpProgram, clbin))
 					return false;
 			}
 
 			/* cache the program */
-			OpenCLCache::store_program(cpPlatform, cdDevice, cpProgram, cache_locker);
+			OpenCLCache::store_program(cpPlatform, cdDevice, cpProgram, OpenCLCache::OCL_DEV_BASE_PROGRAM, cache_locker);
 		}
 
 		/* find kernels */
-		ckPathTraceKernel = clCreateKernel(cpProgram, "kernel_ocl_path_trace", &ciErr);
-		if(opencl_error(ciErr))
-			return false;
-
 		ckFilmConvertByteKernel = clCreateKernel(cpProgram, "kernel_ocl_convert_to_byte", &ciErr);
 		if(opencl_error(ciErr))
 			return false;
@@ -752,7 +898,7 @@ public:
 		return true;
 	}
 
-	~OpenCLDevice()
+	~OpenCLDeviceBase()
 	{
 		task_pool.stop();
 
@@ -765,12 +911,14 @@ public:
 			delete mt->second;
 		}
 
-		if(ckPathTraceKernel)
-			clReleaseKernel(ckPathTraceKernel);  
 		if(ckFilmConvertByteKernel)
 			clReleaseKernel(ckFilmConvertByteKernel);  
 		if(ckFilmConvertHalfFloatKernel)
 			clReleaseKernel(ckFilmConvertHalfFloatKernel);  
+		if(ckShaderKernel)
+			clReleaseKernel(ckShaderKernel);
+		if(ckBakeKernel)
+			clReleaseKernel(ckBakeKernel);
 		if(cpProgram)
 			clReleaseProgram(cpProgram);
 		if(cqCommandQueue)
@@ -854,8 +1002,12 @@ public:
 		mem_copy_to(*i->second);
 	}
 
-	void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic)
+	void tex_alloc(const char *name,
+	               device_memory& mem,
+	               InterpolationType /*interpolation*/,
+	               bool /*periodic*/)
 	{
+		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
 		mem_alloc(mem, MEM_READ_ONLY);
 		mem_copy_to(mem);
 		assert(mem_map.find(name) == mem_map.end());
@@ -908,42 +1060,6 @@ public:
 		opencl_assert(clFlush(cqCommandQueue));
 	}
 
-	void path_trace(RenderTile& rtile, int sample)
-	{
-		/* cast arguments to cl types */
-		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
-		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
-		cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
-		cl_int d_x = rtile.x;
-		cl_int d_y = rtile.y;
-		cl_int d_w = rtile.w;
-		cl_int d_h = rtile.h;
-		cl_int d_sample = sample;
-		cl_int d_offset = rtile.offset;
-		cl_int d_stride = rtile.stride;
-
-		/* sample arguments */
-		cl_uint narg = 0;
-
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_data), (void*)&d_data));
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_buffer), (void*)&d_buffer));
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_rng_state), (void*)&d_rng_state));
-
-#define KERNEL_TEX(type, ttype, name) \
-	set_kernel_arg_mem(ckPathTraceKernel, &narg, #name);
-#include "kernel_textures.h"
-
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_sample), (void*)&d_sample));
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_x), (void*)&d_x));
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_y), (void*)&d_y));
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_w), (void*)&d_w));
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_h), (void*)&d_h));
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_offset), (void*)&d_offset));
-		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_stride), (void*)&d_stride));
-
-		enqueue_kernel(ckPathTraceKernel, d_w, d_h);
-	}
-
 	void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
 	{
 		cl_mem ptr;
@@ -974,29 +1090,30 @@ public:
 		cl_int d_offset = task.offset;
 		cl_int d_stride = task.stride;
 
-		/* sample arguments */
-		cl_uint narg = 0;
-
 
 		cl_kernel ckFilmConvertKernel = (rgba_byte)? ckFilmConvertByteKernel: ckFilmConvertHalfFloatKernel;
 
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_data), (void*)&d_data));
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_rgba), (void*)&d_rgba));
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_buffer), (void*)&d_buffer));
+		cl_uint start_arg_index =
+			kernel_set_args(ckFilmConvertKernel,
+			                0,
+			                d_data,
+			                d_rgba,
+			                d_buffer);
 
 #define KERNEL_TEX(type, ttype, name) \
-	set_kernel_arg_mem(ckFilmConvertKernel, &narg, #name);
+	set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
 #include "kernel_textures.h"
-
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_sample_scale), (void*)&d_sample_scale));
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_x), (void*)&d_x));
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_y), (void*)&d_y));
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_w), (void*)&d_w));
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_h), (void*)&d_h));
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_offset), (void*)&d_offset));
-		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_stride), (void*)&d_stride));
-
-
+#undef KERNEL_TEX
+
+		start_arg_index += kernel_set_args(ckFilmConvertKernel,
+		                                   start_arg_index,
+		                                   d_sample_scale,
+		                                   d_x,
+		                                   d_y,
+		                                   d_w,
+		                                   d_h,
+		                                   d_offset,
+		                                   d_stride);
 
 		enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
 	}
@@ -1012,9 +1129,6 @@ public:
 		cl_int d_shader_w = task.shader_w;
 		cl_int d_offset = task.offset;
 
-		/* sample arguments */
-		cl_uint narg = 0;
-
 		cl_kernel kernel;
 
 		if(task.shader_eval_type >= SHADER_EVAL_BAKE)
@@ -1029,19 +1143,25 @@ public:
 
 			cl_int d_sample = sample;
 
-			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data));
-			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input));
-			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), (void*)&d_output));
+			cl_uint start_arg_index =
+				kernel_set_args(kernel,
+				                0,
+				                d_data,
+				                d_input,
+				                d_output);
 
 #define KERNEL_TEX(type, ttype, name) \
-		set_kernel_arg_mem(kernel, &narg, #name);
+		set_kernel_arg_mem(kernel, &start_arg_index, #name);
 #include "kernel_textures.h"
+#undef KERNEL_TEX
 
-			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
-			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
-			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
-			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_offset), (void*)&d_offset));
-			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_sample), (void*)&d_sample));
+			start_arg_index += kernel_set_args(kernel,
+			                                   start_arg_index,
+			                                   d_shader_eval_type,
+			                                   d_shader_x,
+			                                   d_shader_w,
+			                                   d_offset,
+			                                   d_sample);
 
 			enqueue_kernel(kernel, task.shader_w, 1);
 
@@ -1049,6 +1169,380 @@ public:
 		}
 	}
 
+	class OpenCLDeviceTask : public DeviceTask {
+	public:
+		OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
+		: DeviceTask(task)
+		{
+			run = function_bind(&OpenCLDeviceBase::thread_run,
+			                    device,
+			                    this);
+		}
+	};
+
+	int get_split_task_count(DeviceTask& /*task*/)
+	{
+		return 1;
+	}
+
+	void task_add(DeviceTask& task)
+	{
+		task_pool.push(new OpenCLDeviceTask(this, task));
+	}
+
+	void task_wait()
+	{
+		task_pool.wait();
+	}
+
+	void task_cancel()
+	{
+		task_pool.cancel();
+	}
+
+	virtual void thread_run(DeviceTask * /*task*/) = 0;
+
+protected:
+
+	string kernel_build_options(const string *debug_src = NULL)
+	{
+		string build_options = " -cl-fast-relaxed-math ";
+
+		if(platform_name == "NVIDIA CUDA") {
+			build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
+			                 "-cl-nv-maxrregcount=32 "
+			                 "-cl-nv-verbose ";
+
+			uint compute_capability_major, compute_capability_minor;
+			clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
+			                sizeof(cl_uint), &compute_capability_major, NULL);
+			clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
+			                sizeof(cl_uint), &compute_capability_minor, NULL);
+
+			build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
+			                               compute_capability_major * 100 +
+			                               compute_capability_minor * 10);
+		}
+
+		else if(platform_name == "Apple")
+			build_options += "-D__KERNEL_OPENCL_APPLE__ ";
+
+		else if(platform_name == "AMD Accelerated Parallel Processing")
+			build_options += "-D__KERNEL_OPENCL_AMD__ ";
+
+		else if(platform_name == "Intel(R) OpenCL") {
+			build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
+
+			/* Options for gdb source level kernel debugging.
+			 * this segfaults on linux currently.
+			 */
+			if(opencl_kernel_use_debug() && debug_src)
+				build_options += "-g -s \"" + *debug_src + "\" ";
+		}
+
+		if(opencl_kernel_use_debug())
+			build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
+
+#ifdef WITH_CYCLES_DEBUG
+		build_options += "-D__KERNEL_DEBUG__ ";
+#endif
+
+		return build_options;
+	}
+
+	class ArgumentWrapper {
+	public:
+		ArgumentWrapper() : size(0), pointer(NULL) {}
+		template <typename T>
+		ArgumentWrapper(T& argument) : size(sizeof(argument)),
+		                               pointer(&argument) { }
+		size_t size;
+		void *pointer;
+	};
+
+	/* TODO(sergey): In the future we can use variadic templates, once
+	 * C++0x is allowed. Should allow to clean this up a bit.
+	 */
+	int kernel_set_args(cl_kernel kernel,
+	                    int start_argument_index,
+	                    const ArgumentWrapper& arg1 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg2 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg3 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg4 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg5 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg6 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg7 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg8 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg9 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg10 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg11 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg12 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg13 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg14 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg15 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg16 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg17 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg18 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg19 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg20 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg21 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg22 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg23 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg24 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg25 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg26 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg27 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg28 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg29 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg30 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg31 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg32 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg33 = ArgumentWrapper())
+	{
+		int current_arg_index = 0;
+#define FAKE_VARARG_HANDLE_ARG(arg) \
+		do { \
+			if(arg.pointer != NULL) { \
+				opencl_assert(clSetKernelArg( \
+					kernel, \
+					start_argument_index + current_arg_index, \
+					arg.size, arg.pointer)); \
+				++current_arg_index; \
+			} \
+			else { \
+				return current_arg_index; \
+			} \
+		} while(false)
+		FAKE_VARARG_HANDLE_ARG(arg1);
+		FAKE_VARARG_HANDLE_ARG(arg2);
+		FAKE_VARARG_HANDLE_ARG(arg3);
+		FAKE_VARARG_HANDLE_ARG(arg4);
+		FAKE_VARARG_HANDLE_ARG(arg5);
+		FAKE_VARARG_HANDLE_ARG(arg6);
+		FAKE_VARARG_HANDLE_ARG(arg7);
+		FAKE_VARARG_HANDLE_ARG(arg8);
+		FAKE_VARARG_HANDLE_ARG(arg9);
+		FAKE_VARARG_HANDLE_ARG(arg10);
+		FAKE_VARARG_HANDLE_ARG(arg11);
+		FAKE_VARARG_HANDLE_ARG(arg12);
+		FAKE_VARARG_HANDLE_ARG(arg13);
+		FAKE_VARARG_HANDLE_ARG(arg14);
+		FAKE_VARARG_HANDLE_ARG(arg15);
+		FAKE_VARARG_HANDLE_ARG(arg16);
+		FAKE_VARARG_HANDLE_ARG(arg17);
+		FAKE_VARARG_HANDLE_ARG(arg18);
+		FAKE_VARARG_HANDLE_ARG(arg19);
+		FAKE_VARARG_HANDLE_ARG(arg20);
+		FAKE_VARARG_HANDLE_ARG(arg21);
+		FAKE_VARARG_HANDLE_ARG(arg22);
+		FAKE_VARARG_HANDLE_ARG(arg23);
+		FAKE_VARARG_HANDLE_ARG(arg24);
+		FAKE_VARARG_HANDLE_ARG(arg25);
+		FAKE_VARARG_HANDLE_ARG(arg26);
+		FAKE_VARARG_HANDLE_ARG(arg27);
+		FAKE_VARARG_HANDLE_ARG(arg28);
+		FAKE_VARARG_HANDLE_ARG(arg29);
+		FAKE_VARARG_HANDLE_ARG(arg30);
+		FAKE_VARARG_HANDLE_ARG(arg31);
+		FAKE_VARARG_HANDLE_ARG(arg32);
+		FAKE_VARARG_HANDLE_ARG(arg33);
+#undef FAKE_VARARG_HANDLE_ARG
+		return current_arg_index;
+	}
+
+	inline void release_kernel_safe(cl_kernel kernel)
+	{
+		if(kernel) {
+			clReleaseKernel(kernel);
+		}
+	}
+
+	inline void release_mem_object_safe(cl_mem mem)
+	{
+		if(mem != NULL) {
+			clReleaseMemObject(mem);
+		}
+	}
+
+	inline void release_program_safe(cl_program program)
+	{
+		if(program) {
+			clReleaseProgram(program);
+		}
+	}
+
+	string build_options_from_requested_features(
+	        const DeviceRequestedFeatures& requested_features)
+	{
+		string build_options = "";
+		if(requested_features.experimental) {
+			build_options += " -D__KERNEL_EXPERIMENTAL__";
+		}
+		build_options += " -D__NODES_MAX_GROUP__=" +
+			string_printf("%d", requested_features.max_nodes_group);
+		build_options += " -D__NODES_FEATURES__=" +
+			string_printf("%d", requested_features.nodes_features);
+		build_options += string_printf(" -D__MAX_CLOSURE__=%d",
+		                               requested_features.max_closure);
+		if(!requested_features.use_hair) {
+			build_options += " -D__NO_HAIR__";
+		}
+		if(!requested_features.use_object_motion) {
+			build_options += " -D__NO_OBJECT_MOTION__";
+		}
+		if(!requested_features.use_camera_motion) {
+			build_options += " -D__NO_CAMERA_MOTION__";
+		}
+		return build_options;
+	}
+};
+
+class OpenCLDeviceMegaKernel : public OpenCLDeviceBase
+{
+public:
+	cl_kernel ckPathTraceKernel;
+	cl_program path_trace_program;
+
+	OpenCLDeviceMegaKernel(DeviceInfo& info, Stats &stats, bool background_)
+	: OpenCLDeviceBase(info, stats, background_)
+	{
+		ckPathTraceKernel = NULL;
+		path_trace_program = NULL;
+	}
+
+	bool load_kernels(const DeviceRequestedFeatures& requested_features)
+	{
+		/* Get Shader, bake and film convert kernels.
+		 * It'll also do verification of OpenCL actually initialized.
+		 */
+		if(!OpenCLDeviceBase::load_kernels(requested_features)) {
+			return false;
+		}
+
+		/* Try to use cached kernel. */
+		thread_scoped_lock cache_locker;
+		path_trace_program = OpenCLCache::get_program(cpPlatform,
+		                                              cdDevice,
+		                                              OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
+		                                              cache_locker);
+
+		if(!path_trace_program) {
+			/* Verify we have right opencl version. */
+			if(!opencl_version_check())
+				return false;
+
+			/* Calculate md5 hash to detect changes. */
+			string kernel_path = path_get("kernel");
+			string kernel_md5 = path_files_md5_hash(kernel_path);
+			string custom_kernel_build_options = "-D__COMPILE_ONLY_MEGAKERNEL__ ";
+			string device_md5 = device_md5_hash(custom_kernel_build_options);
+
+			/* Path to cached binary. */
+			string clbin = string_printf("cycles_kernel_%s_%s.clbin",
+			                             device_md5.c_str(),
+			                             kernel_md5.c_str());
+			clbin = path_user_get(path_join("cache", clbin));
+
+			/* Path to preprocessed source for debugging. */
+			string clsrc, *debug_src = NULL;
+			if(opencl_kernel_use_debug()) {
+				clsrc = string_printf("cycles_kernel_%s_%s.cl",
+				                      device_md5.c_str(),
+				                      kernel_md5.c_str());
+				clsrc = path_user_get(path_join("cache", clsrc));
+				debug_src = &clsrc;
+			}
+
+			/* If exists already, try use it. */
+			if(path_exists(clbin) && load_binary(kernel_path,
+			                                     clbin,
+			                                     custom_kernel_build_options,
+			                                     &path_trace_program,
+			                                     debug_src)) {
+				/* Kernel loaded from binary, nothing to do. */
+			}
+			else {
+				string init_kernel_source = "#include \"kernels/opencl/kernel.cl\" // " +
+				                            kernel_md5 + "\n";
+				/* If does not exist or loading binary failed, compile kernel. */
+				if(!compile_kernel(kernel_path,
+				                   init_kernel_source,
+				                   custom_kernel_build_options,
+				                   &path_trace_program,
+				                   debug_src))
+				{
+					return false;
+				}
+				/* Save binary for reuse. */
+				if(!save_binary(&path_trace_program, clbin)) {
+					return false;
+				}
+			}
+			/* Cache the program. */
+			OpenCLCache::store_program(cpPlatform,
+			                           cdDevice,
+			                           path_trace_program,
+			                           OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
+			                           cache_locker);
+		}
+
+		/* Find kernels. */
+		ckPathTraceKernel = clCreateKernel(path_trace_program,
+		                                   "kernel_ocl_path_trace",
+		                                   &ciErr);
+		if(opencl_error(ciErr))
+			return false;
+		return true;
+	}
+
+	~OpenCLDeviceMegaKernel()
+	{
+		task_pool.stop();
+		release_kernel_safe(ckPathTraceKernel);
+		release_program_safe(path_trace_program);
+	}
+
+	void path_trace(RenderTile& rtile, int sample)
+	{
+		/* Cast arguments to cl types. */
+		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
+		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
+		cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
+		cl_int d_x = rtile.x;
+		cl_int d_y = rtile.y;
+		cl_int d_w = rtile.w;
+		cl_int d_h = rtile.h;
+		cl_int d_offset = rtile.offset;
+		cl_int d_stride = rtile.stride;
+
+		/* Sample arguments. */
+		cl_int d_sample = sample;
+
+		cl_uint start_arg_index =
+			kernel_set_args(ckPathTraceKernel,
+			                0,
+			                d_data,
+			                d_buffer,
+			                d_rng_state);
+
+#define KERNEL_TEX(type, ttype, name) \
+		set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+
+		start_arg_index += kernel_set_args(ckPathTraceKernel,
+		                                   start_arg_index,
+		                                   d_sample,
+		                                   d_x,
+		                                   d_y,
+		                                   d_w,
+		                                   d_h,
+		                                   d_offset,
+		                                   d_stride);
+
+		enqueue_kernel(ckPathTraceKernel, d_w, d_h);
+	}
+
 	void thread_run(DeviceTask *task)
 	{
 		if(task->type == DeviceTask::FILM_CONVERT) {
@@ -1059,8 +1553,7 @@ public:
 		}
 		else if(task->type == DeviceTask::PATH_TRACE) {
 			RenderTile tile;
-			
-			/* keep rendering tiles until done */
+			/* Keep rendering tiles until done. */
 			while(task->acquire_tile(this, tile)) {
 				int start_sample = tile.start_sample;
 				int end_sample = tile.start_sample + tile.num_samples;
@@ -1078,127 +1571,1947 @@ public:
 					task->update_progress(&tile);
 				}
 
+				/* Complete kernel execution before release tile */
+				/* This helps in multi-device render;
+				 * The device that reaches the critical-section function
+				 * release_tile waits (stalling other devices from entering
+				 * release_tile) for all kernels to complete. If device1 (a
+				 * slow-render device) reaches release_tile first then it would
+				 * stall device2 (a fast-render device) from proceeding to render
+				 * next tile.
+				 */
+				clFinish(cqCommandQueue);
+
 				task->release_tile(tile);
 			}
 		}
 	}
+};
 
-	class OpenCLDeviceTask : public DeviceTask {
-	public:
-		OpenCLDeviceTask(OpenCLDevice *device, DeviceTask& task)
-		: DeviceTask(task)
+/* TODO(sergey): This is to keep tile split on OpenCL level working
+ * for now, since without this view-port render does not work as it
+ * should.
+ *
+ * Ideally it'll be done on the higher level, but we need to get ready
+ * for merge rather soon, so let's keep split logic private here in
+ * the file.
+ */
+class SplitRenderTile : public RenderTile {
+public:
+	SplitRenderTile()
+		: RenderTile(),
+		  buffer_offset_x(0),
+		  buffer_offset_y(0),
+		  rng_state_offset_x(0),
+		  rng_state_offset_y(0),
+		  buffer_rng_state_stride(0) {}
+
+	explicit SplitRenderTile(RenderTile& tile)
+		: RenderTile(),
+		  buffer_offset_x(0),
+		  buffer_offset_y(0),
+		  rng_state_offset_x(0),
+		  rng_state_offset_y(0),
+		  buffer_rng_state_stride(0)
+	{
+		x = tile.x;
+		y = tile.y;
+		w = tile.w;
+		h = tile.h;
+		start_sample = tile.start_sample;
+		num_samples = tile.num_samples;
+		sample = tile.sample;
+		resolution = tile.resolution;
+		offset = tile.offset;
+		stride = tile.stride;
+		buffer = tile.buffer;
+		rng_state = tile.rng_state;
+		buffers = tile.buffers;
+	}
+
+	/* Split kernel is device global memory constrained;
+	 * hence split kernel cant render big tile size's in
+	 * one go. If the user sets a big tile size (big tile size
+	 * is a term relative to the available device global memory),
+	 * we split the tile further and then call path_trace on
+	 * each of those split tiles. The following variables declared,
+	 * assist in achieving that purpose
+	 */
+	int buffer_offset_x;
+	int buffer_offset_y;
+	int rng_state_offset_x;
+	int rng_state_offset_y;
+	int buffer_rng_state_stride;
+};
+
+/* OpenCLDeviceSplitKernel's declaration/definition. */
+class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
+{
+public:
+	/* Kernel declaration. */
+	cl_kernel ckPathTraceKernel_data_init;
+	cl_kernel ckPathTraceKernel_scene_intersect;
+	cl_kernel ckPathTraceKernel_lamp_emission;
+	cl_kernel ckPathTraceKernel_queue_enqueue;
+	cl_kernel ckPathTraceKernel_background_buffer_update;
+	cl_kernel ckPathTraceKernel_shader_eval;
+	cl_kernel ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao;
+	cl_kernel ckPathTraceKernel_direct_lighting;
+	cl_kernel ckPathTraceKernel_shadow_blocked;
+	cl_kernel ckPathTraceKernel_next_iteration_setup;
+	cl_kernel ckPathTraceKernel_sum_all_radiance;
+
+	/* cl_program declaration. */
+	cl_program data_init_program;
+	cl_program scene_intersect_program;
+	cl_program lamp_emission_program;
+	cl_program queue_enqueue_program;
+	cl_program background_buffer_update_program;
+	cl_program shader_eval_program;
+	cl_program holdout_emission_blurring_pathtermination_ao_program;
+	cl_program direct_lighting_program;
+	cl_program shadow_blocked_program;
+	cl_program next_iteration_setup_program;
+	cl_program sum_all_radiance_program;
+
+	/* Global memory variables [porting]; These memory is used for
+	 * co-operation between different kernels; Data written by one
+	 * kernel will be available to another kernel via this global
+	 * memory.
+	 */
+	cl_mem rng_coop;
+	cl_mem throughput_coop;
+	cl_mem L_transparent_coop;
+	cl_mem PathRadiance_coop;
+	cl_mem Ray_coop;
+	cl_mem PathState_coop;
+	cl_mem Intersection_coop;
+	cl_mem kgbuffer;  /* KernelGlobals buffer. */
+
+	/* Global buffers for ShaderData. */
+	cl_mem sd;             /* ShaderData used in the main path-iteration loop. */
+	cl_mem sd_DL_shadow;   /* ShaderData used in Direct Lighting and
+	                        * shadow_blocked kernel.
+	                        */
+
+	/* Global buffers of each member of ShaderData. */
+	cl_mem P_sd;
+	cl_mem P_sd_DL_shadow;
+	cl_mem N_sd;
+	cl_mem N_sd_DL_shadow;
+	cl_mem Ng_sd;
+	cl_mem Ng_sd_DL_shadow;
+	cl_mem I_sd;
+	cl_mem I_sd_DL_shadow;
+	cl_mem shader_sd;
+	cl_mem shader_sd_DL_shadow;
+	cl_mem flag_sd;
+	cl_mem flag_sd_DL_shadow;
+	cl_mem prim_sd;
+	cl_mem prim_sd_DL_shadow;
+	cl_mem type_sd;
+	cl_mem type_sd_DL_shadow;
+	cl_mem u_sd;
+	cl_mem u_sd_DL_shadow;
+	cl_mem v_sd;
+	cl_mem v_sd_DL_shadow;
+	cl_mem object_sd;
+	cl_mem object_sd_DL_shadow;
+	cl_mem time_sd;
+	cl_mem time_sd_DL_shadow;
+	cl_mem ray_length_sd;
+	cl_mem ray_length_sd_DL_shadow;
+	cl_mem ray_depth_sd;
+	cl_mem ray_depth_sd_DL_shadow;
+	cl_mem transparent_depth_sd;
+	cl_mem transparent_depth_sd_DL_shadow;
+
+	/* Ray differentials. */
+	cl_mem dP_sd, dI_sd;
+	cl_mem dP_sd_DL_shadow, dI_sd_DL_shadow;
+	cl_mem du_sd, dv_sd;
+	cl_mem du_sd_DL_shadow, dv_sd_DL_shadow;
+
+	/* Dp/Du */
+	cl_mem dPdu_sd, dPdv_sd;
+	cl_mem dPdu_sd_DL_shadow, dPdv_sd_DL_shadow;
+
+	/* Object motion. */
+	cl_mem ob_tfm_sd, ob_itfm_sd;
+	cl_mem ob_tfm_sd_DL_shadow, ob_itfm_sd_DL_shadow;
+
+	cl_mem closure_sd;
+	cl_mem closure_sd_DL_shadow;
+	cl_mem num_closure_sd;
+	cl_mem num_closure_sd_DL_shadow;
+	cl_mem randb_closure_sd;
+	cl_mem randb_closure_sd_DL_shadow;
+	cl_mem ray_P_sd;
+	cl_mem ray_P_sd_DL_shadow;
+	cl_mem ray_dP_sd;
+	cl_mem ray_dP_sd_DL_shadow;
+
+	/* Global memory required for shadow blocked and accum_radiance. */
+	cl_mem BSDFEval_coop;
+	cl_mem ISLamp_coop;
+	cl_mem LightRay_coop;
+	cl_mem AOAlpha_coop;
+	cl_mem AOBSDF_coop;
+	cl_mem AOLightRay_coop;
+	cl_mem Intersection_coop_AO;
+	cl_mem Intersection_coop_DL;
+
+#ifdef WITH_CYCLES_DEBUG
+	/* DebugData memory */
+	cl_mem debugdata_coop;
+#endif
+
+	/* Global state array that tracks ray state. */
+	cl_mem ray_state;
+
+	/* Per sample buffers. */
+	cl_mem per_sample_output_buffers;
+
+	/* Denotes which sample each ray is being processed for. */
+	cl_mem work_array;
+
+	/* Queue */
+	cl_mem Queue_data;  /* Array of size queuesize * num_queues * sizeof(int). */
+	cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
+	                     * Tracks the size of each queue.
+	                     */
+
+	/* Flag to make sceneintersect and lampemission kernel use queues. */
+	cl_mem use_queues_flag;
+
+	/* Amount of memory in output buffer associated with one pixel/thread. */
+	size_t per_thread_output_buffer_size;
+
+	/* Total allocatable available device memory. */
+	size_t total_allocatable_memory;
+
+	/* host version of ray_state; Used in checking host path-iteration
+	 * termination.
+	 */
+	char *hostRayStateArray;
+
+	/* Number of path-iterations to be done in one shot. */
+	unsigned int PathIteration_times;
+
+#ifdef __WORK_STEALING__
+	/* Work pool with respect to each work group. */
+	cl_mem work_pool_wgs;
+
+	/* Denotes the maximum work groups possible w.r.t. current tile size. */
+	unsigned int max_work_groups;
+#endif
+
+	/* clos_max value for which the kernels have been loaded currently. */
+	int current_max_closure;
+
+	/* Marked True in constructor and marked false at the end of path_trace(). */
+	bool first_tile;
+
+	OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
+	: OpenCLDeviceBase(info, stats, background_)
+	{
+		background = background_;
+
+		/* Initialize kernels. */
+		ckPathTraceKernel_data_init = NULL;
+		ckPathTraceKernel_scene_intersect = NULL;
+		ckPathTraceKernel_lamp_emission = NULL;
+		ckPathTraceKernel_background_buffer_update = NULL;
+		ckPathTraceKernel_shader_eval = NULL;
+		ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao = NULL;
+		ckPathTraceKernel_direct_lighting = NULL;
+		ckPathTraceKernel_shadow_blocked = NULL;
+		ckPathTraceKernel_next_iteration_setup = NULL;
+		ckPathTraceKernel_sum_all_radiance = NULL;
+		ckPathTraceKernel_queue_enqueue = NULL;
+
+		/* Initialize program. */
+		data_init_program = NULL;
+		scene_intersect_program = NULL;
+		lamp_emission_program = NULL;
+		queue_enqueue_program = NULL;
+		background_buffer_update_program = NULL;
+		shader_eval_program = NULL;
+		holdout_emission_blurring_pathtermination_ao_program = NULL;
+		direct_lighting_program = NULL;
+		shadow_blocked_program = NULL;
+		next_iteration_setup_program = NULL;
+		sum_all_radiance_program = NULL;
+
+		/* Initialize cl_mem variables. */
+		kgbuffer = NULL;
+		sd = NULL;
+		sd_DL_shadow = NULL;
+
+		P_sd = NULL;
+		P_sd_DL_shadow = NULL;
+		N_sd = NULL;
+		N_sd_DL_shadow = NULL;
+		Ng_sd = NULL;
+		Ng_sd_DL_shadow = NULL;
+		I_sd = NULL;
+		I_sd_DL_shadow = NULL;
+		shader_sd = NULL;
+		shader_sd_DL_shadow = NULL;
+		flag_sd = NULL;
+		flag_sd_DL_shadow = NULL;
+		prim_sd = NULL;
+		prim_sd_DL_shadow = NULL;
+		type_sd = NULL;
+		type_sd_DL_shadow = NULL;
+		u_sd = NULL;
+		u_sd_DL_shadow = NULL;
+		v_sd = NULL;
+		v_sd_DL_shadow = NULL;
+		object_sd = NULL;
+		object_sd_DL_shadow = NULL;
+		time_sd = NULL;
+		time_sd_DL_shadow = NULL;
+		ray_length_sd = NULL;
+		ray_length_sd_DL_shadow = NULL;
+		ray_depth_sd = NULL;
+		ray_depth_sd_DL_shadow = NULL;
+		transparent_depth_sd = NULL;
+		transparent_depth_sd_DL_shadow = NULL;
+
+		/* Ray differentials. */
+		dP_sd = NULL;
+		dI_sd = NULL;
+		dP_sd_DL_shadow = NULL;
+		dI_sd_DL_shadow = NULL;
+		du_sd = NULL;
+		dv_sd = NULL;
+		du_sd_DL_shadow = NULL;
+		dv_sd_DL_shadow = NULL;
+
+		/* Dp/Du */
+		dPdu_sd = NULL;
+		dPdv_sd = NULL;
+		dPdu_sd_DL_shadow = NULL;
+		dPdv_sd_DL_shadow = NULL;
+
+		/* Object motion. */
+		ob_tfm_sd = NULL;
+		ob_itfm_sd = NULL;
+		ob_tfm_sd_DL_shadow = NULL;
+		ob_itfm_sd_DL_shadow = NULL;
+
+		closure_sd = NULL;
+		closure_sd_DL_shadow = NULL;
+		num_closure_sd = NULL;
+		num_closure_sd_DL_shadow = NULL;
+		randb_closure_sd = NULL;
+		randb_closure_sd_DL_shadow = NULL;
+		ray_P_sd = NULL;
+		ray_P_sd_DL_shadow = NULL;
+		ray_dP_sd = NULL;
+		ray_dP_sd_DL_shadow = NULL;
+
+		rng_coop = NULL;
+		throughput_coop = NULL;
+		L_transparent_coop = NULL;
+		PathRadiance_coop = NULL;
+		Ray_coop = NULL;
+		PathState_coop = NULL;
+		Intersection_coop = NULL;
+		ray_state = NULL;
+
+		AOAlpha_coop = NULL;
+		AOBSDF_coop = NULL;
+		AOLightRay_coop = NULL;
+		BSDFEval_coop = NULL;
+		ISLamp_coop = NULL;
+		LightRay_coop = NULL;
+		Intersection_coop_AO = NULL;
+		Intersection_coop_DL = NULL;
+
+#ifdef WITH_CYCLES_DEBUG
+		debugdata_coop = NULL;
+#endif
+
+		work_array = NULL;
+
+		/* Queue. */
+		Queue_data = NULL;
+		Queue_index = NULL;
+		use_queues_flag = NULL;
+
+		per_sample_output_buffers = NULL;
+
+		per_thread_output_buffer_size = 0;
+		hostRayStateArray = NULL;
+		PathIteration_times = PATH_ITER_INC_FACTOR;
+#ifdef __WORK_STEALING__
+		work_pool_wgs = NULL;
+		max_work_groups = 0;
+#endif
+		current_max_closure = -1;
+		first_tile = true;
+
+		/* Get device's maximum memory that can be allocated. */
+		ciErr = clGetDeviceInfo(cdDevice,
+		                        CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+		                        sizeof(size_t),
+		                        &total_allocatable_memory,
+		                        NULL);
+		assert(ciErr == CL_SUCCESS);
+		if(platform_name == "AMD Accelerated Parallel Processing") {
+			/* This value is tweak-able; AMD platform does not seem to
+			 * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE
+			 * is considered for further computation.
+			 */
+			total_allocatable_memory /= 2;
+		}
+	}
+
+	/* TODO(sergey): Seems really close to load_kernel(),
+	 * could it be de-duplicated?
+	 */
+	bool load_split_kernel(string kernel_path,
+	                       string kernel_init_source,
+	                       string clbin,
+	                       string custom_kernel_build_options,
+	                       cl_program *program,
+	                       const string *debug_src = NULL)
+	{
+		if(!opencl_version_check())
+			return false;
+
+		clbin = path_user_get(path_join("cache", clbin));
+
+		/* If exists already, try use it. */
+		if(path_exists(clbin) && load_binary(kernel_path,
+		                                     clbin,
+		                                     custom_kernel_build_options,
+		                                     program,
+		                                     debug_src)) {
+			/* Kernel loaded from binary. */
+		}
+		else {
+			/* If does not exist or loading binary failed, compile kernel. */
+			if(!compile_kernel(kernel_path,
+			                   kernel_init_source,
+			                   custom_kernel_build_options,
+			                   program,
+			                   debug_src))
+			{
+				return false;
+			}
+			/* Save binary for reuse. */
+			if(!save_binary(program, clbin)) {
+				return false;
+			}
+		}
+		return true;
+	}
+
+	/* Split kernel utility functions. */
+	size_t get_tex_size(const char *tex_name)
+	{
+		cl_mem ptr;
+		size_t ret_size = 0;
+		MemMap::iterator i = mem_map.find(tex_name);
+		if(i != mem_map.end()) {
+			ptr = CL_MEM_PTR(i->second);
+			ciErr = clGetMemObjectInfo(ptr,
+			                           CL_MEM_SIZE,
+			                           sizeof(ret_size),
+			                           &ret_size,
+			                           NULL);
+			assert(ciErr == CL_SUCCESS);
+		}
+		return ret_size;
+	}
+
+	size_t get_shader_closure_size(int max_closure)
+	{
+		return (sizeof(ShaderClosure) * max_closure);
+	}
+
+	size_t get_shader_data_size(size_t shader_closure_size)
+	{
+		/* ShaderData size without accounting for ShaderClosure array. */
+		size_t shader_data_size =
+			sizeof(ShaderData) - (sizeof(ShaderClosure) * MAX_CLOSURE);
+		return (shader_data_size + shader_closure_size);
+	}
+
+	/* Returns size of KernelGlobals structure associated with OpenCL. */
+	size_t get_KernelGlobals_size()
+	{
+		/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
+		 * fetch its size.
+		 */
+		typedef struct KernelGlobals {
+			ccl_constant KernelData *data;
+#define KERNEL_TEX(type, ttype, name) \
+	ccl_global type *name;
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+		} KernelGlobals;
+
+		return sizeof(KernelGlobals);
+	}
+
+	/* Returns size of Structure of arrays implementation of. */
+	size_t get_shaderdata_soa_size()
+	{
+		size_t shader_soa_size = 0;
+
+#define SD_VAR(type, what) shader_soa_size += sizeof(void *);
+#define SD_CLOSURE_VAR(type, what, max_closure) shader_soa_size += sizeof(void *);
+		#include "kernel_shaderdata_vars.h"
+#undef SD_VAR
+#undef SD_CLOSURE_VAR
+
+		return shader_soa_size;
+	}
+
+	bool load_kernels(const DeviceRequestedFeatures& requested_features)
+	{
+		/* Get Shader, bake and film_convert kernels.
+		 * It'll also do verification of OpenCL actually initialized.
+		 */
+		if(!OpenCLDeviceBase::load_kernels(requested_features)) {
+			return false;
+		}
+
+		string kernel_path = path_get("kernel");
+		string kernel_md5 = path_files_md5_hash(kernel_path);
+		string device_md5;
+		string kernel_init_source;
+		string clbin;
+		string clsrc, *debug_src = NULL;
+
+		string build_options = "-D__SPLIT_KERNEL__";
+#ifdef __WORK_STEALING__
+		build_options += " -D__WORK_STEALING__";
+#endif
+		build_options += build_options_from_requested_features(requested_features);
+
+		/* Set compute device build option. */
+		cl_device_type device_type;
+		ciErr = clGetDeviceInfo(cdDevice,
+		                        CL_DEVICE_TYPE,
+		                        sizeof(cl_device_type),
+		                        &device_type,
+		                        NULL);
+		assert(ciErr == CL_SUCCESS);
+		if(device_type == CL_DEVICE_TYPE_GPU) {
+			build_options += " -D__COMPUTE_DEVICE_GPU__";
+		}
+
+#define GLUE(a, b) a ## b
+#define LOAD_KERNEL(name) \
+	do { \
+		kernel_init_source = "#include \"kernels/opencl/kernel_" #name ".cl\" // " + \
+		                     kernel_md5 + "\n"; \
+		device_md5 = device_md5_hash(build_options); \
+		clbin = string_printf("cycles_kernel_%s_%s_" #name ".clbin", \
+		                      device_md5.c_str(), kernel_md5.c_str()); \
+		if(opencl_kernel_use_debug()) { \
+			clsrc = string_printf("cycles_kernel_%s_%s_" #name ".cl", \
+			                      device_md5.c_str(), kernel_md5.c_str()); \
+			clsrc = path_user_get(path_join("cache", clsrc)); \
+			debug_src = &clsrc; \
+		} \
+		if(!load_split_kernel(kernel_path, kernel_init_source, clbin, \
+		                      build_options, \
+		                      &GLUE(name, _program), \
+		                      debug_src)) \
+		{ \
+			fprintf(stderr, "Faled to compile %s\n", #name); \
+			return false; \
+		} \
+	} while(false)
+
+		LOAD_KERNEL(data_init);
+		LOAD_KERNEL(scene_intersect);
+		LOAD_KERNEL(lamp_emission);
+		LOAD_KERNEL(queue_enqueue);
+		LOAD_KERNEL(background_buffer_update);
+		LOAD_KERNEL(shader_eval);
+		LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
+		LOAD_KERNEL(direct_lighting);
+		LOAD_KERNEL(shadow_blocked);
+		LOAD_KERNEL(next_iteration_setup);
+		LOAD_KERNEL(sum_all_radiance);
+
+#undef LOAD_KERNEL
+
+#define FIND_KERNEL(name) \
+	do { \
+		GLUE(ckPathTraceKernel_, name) = \
+			clCreateKernel(GLUE(name, _program), \
+			               "kernel_ocl_path_trace_"  #name, &ciErr); \
+		if(opencl_error(ciErr)) { \
+			fprintf(stderr,"Missing kernel kernel_ocl_path_trace_%s\n", #name); \
+			return false; \
+		} \
+	} while(false)
+
+		FIND_KERNEL(data_init);
+		FIND_KERNEL(scene_intersect);
+		FIND_KERNEL(lamp_emission);
+		FIND_KERNEL(queue_enqueue);
+		FIND_KERNEL(background_buffer_update);
+		FIND_KERNEL(shader_eval);
+		FIND_KERNEL(holdout_emission_blurring_pathtermination_ao);
+		FIND_KERNEL(direct_lighting);
+		FIND_KERNEL(shadow_blocked);
+		FIND_KERNEL(next_iteration_setup);
+		FIND_KERNEL(sum_all_radiance);
+#undef FIND_KERNEL
+#undef GLUE
+
+		current_max_closure = requested_features.max_closure;
+
+		return true;
+	}
+
+	~OpenCLDeviceSplitKernel()
+	{
+		task_pool.stop();
+
+		/* Release kernels */
+		release_kernel_safe(ckPathTraceKernel_data_init);
+		release_kernel_safe(ckPathTraceKernel_scene_intersect);
+		release_kernel_safe(ckPathTraceKernel_lamp_emission);
+		release_kernel_safe(ckPathTraceKernel_queue_enqueue);
+		release_kernel_safe(ckPathTraceKernel_background_buffer_update);
+		release_kernel_safe(ckPathTraceKernel_shader_eval);
+		release_kernel_safe(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao);
+		release_kernel_safe(ckPathTraceKernel_direct_lighting);
+		release_kernel_safe(ckPathTraceKernel_shadow_blocked);
+		release_kernel_safe(ckPathTraceKernel_next_iteration_setup);
+		release_kernel_safe(ckPathTraceKernel_sum_all_radiance);
+
+		/* Release global memory */
+		release_mem_object_safe(P_sd);
+		release_mem_object_safe(P_sd_DL_shadow);
+		release_mem_object_safe(N_sd);
+		release_mem_object_safe(N_sd_DL_shadow);
+		release_mem_object_safe(Ng_sd);
+		release_mem_object_safe(Ng_sd_DL_shadow);
+		release_mem_object_safe(I_sd);
+		release_mem_object_safe(I_sd_DL_shadow);
+		release_mem_object_safe(shader_sd);
+		release_mem_object_safe(shader_sd_DL_shadow);
+		release_mem_object_safe(flag_sd);
+		release_mem_object_safe(flag_sd_DL_shadow);
+		release_mem_object_safe(prim_sd);
+		release_mem_object_safe(prim_sd_DL_shadow);
+		release_mem_object_safe(type_sd);
+		release_mem_object_safe(type_sd_DL_shadow);
+		release_mem_object_safe(u_sd);
+		release_mem_object_safe(u_sd_DL_shadow);
+		release_mem_object_safe(v_sd);
+		release_mem_object_safe(v_sd_DL_shadow);
+		release_mem_object_safe(object_sd);
+		release_mem_object_safe(object_sd_DL_shadow);
+		release_mem_object_safe(time_sd);
+		release_mem_object_safe(time_sd_DL_shadow);
+		release_mem_object_safe(ray_length_sd);
+		release_mem_object_safe(ray_length_sd_DL_shadow);
+		release_mem_object_safe(ray_depth_sd);
+		release_mem_object_safe(ray_depth_sd_DL_shadow);
+		release_mem_object_safe(transparent_depth_sd);
+		release_mem_object_safe(transparent_depth_sd_DL_shadow);
+
+		/* Ray differentials. */
+		release_mem_object_safe(dP_sd);
+		release_mem_object_safe(dP_sd_DL_shadow);
+		release_mem_object_safe(dI_sd);
+		release_mem_object_safe(dI_sd_DL_shadow);
+		release_mem_object_safe(du_sd);
+		release_mem_object_safe(du_sd_DL_shadow);
+		release_mem_object_safe(dv_sd);
+		release_mem_object_safe(dv_sd_DL_shadow);
+
+		/* Dp/Du */
+		release_mem_object_safe(dPdu_sd);
+		release_mem_object_safe(dPdu_sd_DL_shadow);
+		release_mem_object_safe(dPdv_sd);
+		release_mem_object_safe(dPdv_sd_DL_shadow);
+
+		/* Object motion. */
+		release_mem_object_safe(ob_tfm_sd);
+		release_mem_object_safe(ob_itfm_sd);
+
+		release_mem_object_safe(ob_tfm_sd_DL_shadow);
+		release_mem_object_safe(ob_itfm_sd_DL_shadow);
+
+		release_mem_object_safe(closure_sd);
+		release_mem_object_safe(closure_sd_DL_shadow);
+		release_mem_object_safe(num_closure_sd);
+		release_mem_object_safe(num_closure_sd_DL_shadow);
+		release_mem_object_safe(randb_closure_sd);
+		release_mem_object_safe(randb_closure_sd_DL_shadow);
+		release_mem_object_safe(ray_P_sd);
+		release_mem_object_safe(ray_P_sd_DL_shadow);
+		release_mem_object_safe(ray_dP_sd);
+		release_mem_object_safe(ray_dP_sd_DL_shadow);
+		release_mem_object_safe(rng_coop);
+		release_mem_object_safe(throughput_coop);
+		release_mem_object_safe(L_transparent_coop);
+		release_mem_object_safe(PathRadiance_coop);
+		release_mem_object_safe(Ray_coop);
+		release_mem_object_safe(PathState_coop);
+		release_mem_object_safe(Intersection_coop);
+		release_mem_object_safe(kgbuffer);
+		release_mem_object_safe(sd);
+		release_mem_object_safe(sd_DL_shadow);
+		release_mem_object_safe(ray_state);
+		release_mem_object_safe(AOAlpha_coop);
+		release_mem_object_safe(AOBSDF_coop);
+		release_mem_object_safe(AOLightRay_coop);
+		release_mem_object_safe(BSDFEval_coop);
+		release_mem_object_safe(ISLamp_coop);
+		release_mem_object_safe(LightRay_coop);
+		release_mem_object_safe(Intersection_coop_AO);
+		release_mem_object_safe(Intersection_coop_DL);
+#ifdef WITH_CYCLES_DEBUG
+		release_mem_object_safe(debugdata_coop);
+#endif
+		release_mem_object_safe(use_queues_flag);
+		release_mem_object_safe(Queue_data);
+		release_mem_object_safe(Queue_index);
+		release_mem_object_safe(work_array);
+#ifdef __WORK_STEALING__
+		release_mem_object_safe(work_pool_wgs);
+#endif
+		release_mem_object_safe(per_sample_output_buffers);
+
+		/* Release programs */
+		release_program_safe(data_init_program);
+		release_program_safe(scene_intersect_program);
+		release_program_safe(lamp_emission_program);
+		release_program_safe(queue_enqueue_program);
+		release_program_safe(background_buffer_update_program);
+		release_program_safe(shader_eval_program);
+		release_program_safe(holdout_emission_blurring_pathtermination_ao_program);
+		release_program_safe(direct_lighting_program);
+		release_program_safe(shadow_blocked_program);
+		release_program_safe(next_iteration_setup_program);
+		release_program_safe(sum_all_radiance_program);
+
+		if(hostRayStateArray != NULL) {
+			free(hostRayStateArray);
+		}
+	}
+
+	void path_trace(SplitRenderTile& rtile, int2 max_render_feasible_tile_size)
+	{
+		/* cast arguments to cl types */
+		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
+		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
+		cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
+		cl_int d_x = rtile.x;
+		cl_int d_y = rtile.y;
+		cl_int d_w = rtile.w;
+		cl_int d_h = rtile.h;
+		cl_int d_offset = rtile.offset;
+		cl_int d_stride = rtile.stride;
+
+		/* Make sure that set render feasible tile size is a multiple of local
+		 * work size dimensions.
+		 */
+		assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
+		assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);
+
+		size_t global_size[2];
+		size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
+		                        SPLIT_KERNEL_LOCAL_SIZE_Y};
+
+		/* Set the range of samples to be processed for every ray in
+		 * path-regeneration logic.
+		 */
+		cl_int start_sample = rtile.start_sample;
+		cl_int end_sample = rtile.start_sample + rtile.num_samples;
+		cl_int num_samples = rtile.num_samples;
+
+#ifdef __WORK_STEALING__
+		global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
+		global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
+		unsigned int num_parallel_samples = 1;
+#else
+		global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
+		unsigned int num_threads = max_render_feasible_tile_size.x *
+		                           max_render_feasible_tile_size.y;
+		unsigned int num_tile_columns_possible = num_threads / global_size[1];
+		/* Estimate number of parallel samples that can be
+		 * processed in parallel.
+		 */
+		unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
+		                                        rtile.num_samples);
+		/* Wavefront size in AMD is 64.
+		 * TODO(sergey): What about other platforms?
+		 */
+		if(num_parallel_samples >= 64) {
+			/* TODO(sergey): Could use generic round-up here. */
+			num_parallel_samples = (num_parallel_samples / 64) * 64;
+		}
+		assert(num_parallel_samples != 0);
+
+		global_size[0] = d_w * num_parallel_samples;
+#endif  /* __WORK_STEALING__ */
+
+		assert(global_size[0] * global_size[1] <=
+		       max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
+
+		/* Allocate all required global memory once. */
+		if(first_tile) {
+			size_t num_global_elements = max_render_feasible_tile_size.x *
+			                             max_render_feasible_tile_size.y;
+			/* TODO(sergey): This will actually over-allocate if
+			 * particular kernel does not support multiclosure.
+			 */
+			size_t ShaderClosure_size = get_shader_closure_size(current_max_closure);
+
+#ifdef __WORK_STEALING__
+			/* Calculate max groups */
+			size_t max_global_size[2];
+			size_t tile_x = max_render_feasible_tile_size.x;
+			size_t tile_y = max_render_feasible_tile_size.y;
+			max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
+			max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
+			max_work_groups = (max_global_size[0] * max_global_size[1]) /
+			                  (local_size[0] * local_size[1]);
+			/* Allocate work_pool_wgs memory. */
+			work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
+#endif  /* __WORK_STEALING__ */
+
+			/* Allocate queue_index memory only once. */
+			Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
+			use_queues_flag = mem_alloc(sizeof(char));
+			kgbuffer = mem_alloc(get_KernelGlobals_size());
+
+			/* Create global buffers for ShaderData. */
+			sd = mem_alloc(get_shaderdata_soa_size());
+			sd_DL_shadow = mem_alloc(get_shaderdata_soa_size());
+			P_sd = mem_alloc(num_global_elements * sizeof(float3));
+			P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+			N_sd = mem_alloc(num_global_elements * sizeof(float3));
+			N_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+			Ng_sd = mem_alloc(num_global_elements * sizeof(float3));
+			Ng_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+			I_sd = mem_alloc(num_global_elements * sizeof(float3));
+			I_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+			shader_sd = mem_alloc(num_global_elements * sizeof(int));
+			shader_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+			flag_sd = mem_alloc(num_global_elements * sizeof(int));
+			flag_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+			prim_sd = mem_alloc(num_global_elements * sizeof(int));
+			prim_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+			type_sd = mem_alloc(num_global_elements * sizeof(int));
+			type_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+			u_sd = mem_alloc(num_global_elements * sizeof(float));
+			u_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+			v_sd = mem_alloc(num_global_elements * sizeof(float));
+			v_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+			object_sd = mem_alloc(num_global_elements * sizeof(int));
+			object_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+			time_sd = mem_alloc(num_global_elements * sizeof(float));
+			time_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+			ray_length_sd = mem_alloc(num_global_elements * sizeof(float));
+			ray_length_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+			ray_depth_sd = mem_alloc(num_global_elements * sizeof(int));
+			ray_depth_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+			transparent_depth_sd = mem_alloc(num_global_elements * sizeof(int));
+			transparent_depth_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+
+			/* Ray differentials. */
+			dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
+			dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
+			dI_sd = mem_alloc(num_global_elements * sizeof(differential3));
+			dI_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
+			du_sd = mem_alloc(num_global_elements * sizeof(differential));
+			du_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
+			dv_sd = mem_alloc(num_global_elements * sizeof(differential));
+			dv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
+
+			/* Dp/Du */
+			dPdu_sd = mem_alloc(num_global_elements * sizeof(float3));
+			dPdu_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+			dPdv_sd = mem_alloc(num_global_elements * sizeof(float3));
+			dPdv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+
+			/* Object motion. */
+			ob_tfm_sd = mem_alloc(num_global_elements * sizeof(Transform));
+			ob_tfm_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(Transform));
+			ob_itfm_sd = mem_alloc(num_global_elements * sizeof(Transform));
+			ob_itfm_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(Transform));
+
+			closure_sd = mem_alloc(num_global_elements * ShaderClosure_size);
+			closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * ShaderClosure_size);
+			num_closure_sd = mem_alloc(num_global_elements * sizeof(int));
+			num_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+			randb_closure_sd = mem_alloc(num_global_elements * sizeof(float));
+			randb_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+			ray_P_sd = mem_alloc(num_global_elements * sizeof(float3));
+			ray_P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+			ray_dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
+			ray_dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
+
+			/* Creation of global memory buffers which are shared among
+			 * the kernels.
+			 */
+			rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
+			throughput_coop = mem_alloc(num_global_elements * sizeof(float3));
+			L_transparent_coop = mem_alloc(num_global_elements * sizeof(float));
+			PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
+			Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
+			PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
+			Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
+			AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3));
+			AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3));
+			AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
+			BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
+			ISLamp_coop = mem_alloc(num_global_elements * sizeof(int));
+			LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
+			Intersection_coop_AO = mem_alloc(num_global_elements * sizeof(Intersection));
+			Intersection_coop_DL = mem_alloc(num_global_elements * sizeof(Intersection));
+
+#ifdef WITH_CYCLES_DEBUG
+			debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
+#endif
+
+			ray_state = mem_alloc(num_global_elements * sizeof(char));
+
+			hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
+			assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
+
+			Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int)));
+			work_array = mem_alloc(num_global_elements * sizeof(unsigned int));
+			per_sample_output_buffers = mem_alloc(num_global_elements *
+			                                      per_thread_output_buffer_size);
+		}
+
+		cl_int dQueue_size = global_size[0] * global_size[1];
+		cl_int total_num_rays = global_size[0] * global_size[1];
+
+		cl_uint start_arg_index =
+			kernel_set_args(ckPathTraceKernel_data_init,
+			                0,
+			                kgbuffer,
+			                sd,
+			                sd_DL_shadow,
+			                P_sd,
+			                P_sd_DL_shadow,
+			                N_sd,
+			                N_sd_DL_shadow,
+			                Ng_sd,
+			                Ng_sd_DL_shadow,
+			                I_sd,
+			                I_sd_DL_shadow,
+			                shader_sd,
+			                shader_sd_DL_shadow,
+			                flag_sd,
+			                flag_sd_DL_shadow,
+			                prim_sd,
+			                prim_sd_DL_shadow,
+			                type_sd,
+			                type_sd_DL_shadow,
+			                u_sd,
+			                u_sd_DL_shadow,
+			                v_sd,
+			                v_sd_DL_shadow,
+			                object_sd,
+			                object_sd_DL_shadow,
+			                time_sd,
+			                time_sd_DL_shadow,
+			                ray_length_sd,
+			                ray_length_sd_DL_shadow,
+			                ray_depth_sd,
+			                ray_depth_sd_DL_shadow,
+			                transparent_depth_sd,
+			                transparent_depth_sd_DL_shadow);
+
+		/* Ray differentials. */
+		start_arg_index +=
+			kernel_set_args(ckPathTraceKernel_data_init,
+			                start_arg_index,
+			                dP_sd,
+			                dP_sd_DL_shadow,
+			                dI_sd,
+			                dI_sd_DL_shadow,
+			                du_sd,
+			                du_sd_DL_shadow,
+			                dv_sd,
+			                dv_sd_DL_shadow);
+
+		/* Dp/Du */
+		start_arg_index +=
+			kernel_set_args(ckPathTraceKernel_data_init,
+			                start_arg_index,
+			                dPdu_sd,
+			                dPdu_sd_DL_shadow,
+			                dPdv_sd,
+			                dPdv_sd_DL_shadow);
+
+		/* Object motion. */
+		start_arg_index +=
+			kernel_set_args(ckPathTraceKernel_data_init,
+			                start_arg_index,
+			                ob_tfm_sd,
+			                ob_tfm_sd_DL_shadow,
+			                ob_itfm_sd,
+			                ob_itfm_sd_DL_shadow);
+
+		start_arg_index +=
+			kernel_set_args(ckPathTraceKernel_data_init,
+			                start_arg_index,
+			                closure_sd,
+			                closure_sd_DL_shadow,
+			                num_closure_sd,
+			                num_closure_sd_DL_shadow,
+			                randb_closure_sd,
+			                randb_closure_sd_DL_shadow,
+			                ray_P_sd,
+			                ray_P_sd_DL_shadow,
+			                ray_dP_sd,
+			                ray_dP_sd_DL_shadow,
+			                d_data,
+			                per_sample_output_buffers,
+			                d_rng_state,
+			                rng_coop,
+			                throughput_coop,
+			                L_transparent_coop,
+			                PathRadiance_coop,
+			                Ray_coop,
+			                PathState_coop,
+			                ray_state);
+
+/* TODO(segrey): Avoid map lookup here. */
+#define KERNEL_TEX(type, ttype, name) \
+	set_kernel_arg_mem(ckPathTraceKernel_data_init, &start_arg_index, #name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+
+		start_arg_index +=
+			kernel_set_args(ckPathTraceKernel_data_init,
+			                start_arg_index,
+			                start_sample,
+			                d_x,
+			                d_y,
+			                d_w,
+			                d_h,
+			                d_offset,
+			                d_stride,
+			                rtile.rng_state_offset_x,
+			                rtile.rng_state_offset_y,
+			                rtile.buffer_rng_state_stride,
+			                Queue_data,
+			                Queue_index,
+			                dQueue_size,
+			                use_queues_flag,
+			                work_array,
+#ifdef __WORK_STEALING__
+			                work_pool_wgs,
+			                num_samples,
+#endif
+#ifdef WITH_CYCLES_DEBUG
+			                debugdata_coop,
+#endif
+			                num_parallel_samples);
+
+		kernel_set_args(ckPathTraceKernel_scene_intersect,
+		                0,
+		                kgbuffer,
+		                d_data,
+		                rng_coop,
+		                Ray_coop,
+		                PathState_coop,
+		                Intersection_coop,
+		                ray_state,
+		                d_w,
+		                d_h,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size,
+		                use_queues_flag,
+#ifdef WITH_CYCLES_DEBUG
+		                debugdata_coop,
+#endif
+		                num_parallel_samples);
+
+		kernel_set_args(ckPathTraceKernel_lamp_emission,
+		                0,
+		                kgbuffer,
+		                d_data,
+		                sd,
+		                throughput_coop,
+		                PathRadiance_coop,
+		                Ray_coop,
+		                PathState_coop,
+		                Intersection_coop,
+		                ray_state,
+		                d_w,
+		                d_h,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size,
+		                use_queues_flag,
+		                num_parallel_samples);
+
+		kernel_set_args(ckPathTraceKernel_queue_enqueue,
+		                0,
+		                Queue_data,
+		                Queue_index,
+		                ray_state,
+		                dQueue_size);
+
+		kernel_set_args(ckPathTraceKernel_background_buffer_update,
+		                 0,
+		                 kgbuffer,
+		                 d_data,
+		                 sd,
+		                 per_sample_output_buffers,
+		                 d_rng_state,
+		                 rng_coop,
+		                 throughput_coop,
+		                 PathRadiance_coop,
+		                 Ray_coop,
+		                 PathState_coop,
+		                 L_transparent_coop,
+		                 ray_state,
+		                 d_w,
+		                 d_h,
+		                 d_x,
+		                 d_y,
+		                 d_stride,
+		                 rtile.rng_state_offset_x,
+		                 rtile.rng_state_offset_y,
+		                 rtile.buffer_rng_state_stride,
+		                 work_array,
+		                 Queue_data,
+		                 Queue_index,
+		                 dQueue_size,
+		                 end_sample,
+		                 start_sample,
+#ifdef __WORK_STEALING__
+		                 work_pool_wgs,
+		                 num_samples,
+#endif
+#ifdef WITH_CYCLES_DEBUG
+		                 debugdata_coop,
+#endif
+		                 num_parallel_samples);
+
+		kernel_set_args(ckPathTraceKernel_shader_eval,
+		                0,
+		                kgbuffer,
+		                d_data,
+		                sd,
+		                rng_coop,
+		                Ray_coop,
+		                PathState_coop,
+		                Intersection_coop,
+		                ray_state,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size);
+
+		kernel_set_args(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao,
+		                0,
+		                kgbuffer,
+		                d_data,
+		                sd,
+		                per_sample_output_buffers,
+		                rng_coop,
+		                throughput_coop,
+		                L_transparent_coop,
+		                PathRadiance_coop,
+		                PathState_coop,
+		                Intersection_coop,
+		                AOAlpha_coop,
+		                AOBSDF_coop,
+		                AOLightRay_coop,
+		                d_w,
+		                d_h,
+		                d_x,
+		                d_y,
+		                d_stride,
+		                ray_state,
+		                work_array,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size,
+#ifdef __WORK_STEALING__
+		                start_sample,
+#endif
+		                num_parallel_samples);
+
+		kernel_set_args(ckPathTraceKernel_direct_lighting,
+		                0,
+		                kgbuffer,
+		                d_data,
+		                sd,
+		                sd_DL_shadow,
+		                rng_coop,
+		                PathState_coop,
+		                ISLamp_coop,
+		                LightRay_coop,
+		                BSDFEval_coop,
+		                ray_state,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size);
+
+		kernel_set_args(ckPathTraceKernel_shadow_blocked,
+		                0,
+		                kgbuffer,
+		                d_data,
+		                sd_DL_shadow,
+		                PathState_coop,
+		                LightRay_coop,
+		                AOLightRay_coop,
+		                Intersection_coop_AO,
+		                Intersection_coop_DL,
+		                ray_state,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size,
+		                total_num_rays);
+
+		kernel_set_args(ckPathTraceKernel_next_iteration_setup,
+		                0,
+		                kgbuffer,
+		                d_data,
+		                sd,
+		                rng_coop,
+		                throughput_coop,
+		                PathRadiance_coop,
+		                Ray_coop,
+		                PathState_coop,
+		                LightRay_coop,
+		                ISLamp_coop,
+		                BSDFEval_coop,
+		                AOLightRay_coop,
+		                AOBSDF_coop,
+		                AOAlpha_coop,
+		                ray_state,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size,
+		                use_queues_flag);
+
+		kernel_set_args(ckPathTraceKernel_sum_all_radiance,
+		                0,
+		                d_data,
+		                d_buffer,
+		                per_sample_output_buffers,
+		                num_parallel_samples,
+		                d_w,
+		                d_h,
+		                d_stride,
+		                rtile.buffer_offset_x,
+		                rtile.buffer_offset_y,
+		                rtile.buffer_rng_state_stride,
+		                start_sample);
+
+		/* Macro for Enqueuing split kernels. */
+#define GLUE(a, b) a ## b
+#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \
+		opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, \
+		                                     GLUE(ckPathTraceKernel_, \
+		                                          kernelName), \
+		                                     2, \
+		                                     NULL, \
+		                                     globalSize, \
+		                                     localSize, \
+		                                     0, \
+		                                     NULL, \
+		                                     NULL))
+
+		/* Enqueue ckPathTraceKernel_data_init kernel. */
+		ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size);
+		bool activeRaysAvailable = true;
+
+		/* Record number of time host intervention has been made */
+		unsigned int numHostIntervention = 0;
+		unsigned int numNextPathIterTimes = PathIteration_times;
+		while(activeRaysAvailable) {
+			/* Twice the global work size of other kernels for
+			 * ckPathTraceKernel_shadow_blocked_direct_lighting. */
+			size_t global_size_shadow_blocked[2];
+			global_size_shadow_blocked[0] = global_size[0] * 2;
+			global_size_shadow_blocked[1] = global_size[1];
+
+			/* Do path-iteration in host [Enqueue Path-iteration kernels. */
+			for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) {
+				ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size);
+				ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
+			}
+
+			/* Read ray-state into Host memory to decide if we should exit
+			 * path-iteration in host.
+			 */
+			ciErr = clEnqueueReadBuffer(cqCommandQueue,
+			                            ray_state,
+			                            CL_TRUE,
+			                            0,
+			                            global_size[0] * global_size[1] * sizeof(char),
+			                            hostRayStateArray,
+			                            0,
+			                            NULL,
+			                            NULL);
+			assert(ciErr == CL_SUCCESS);
+
+			activeRaysAvailable = false;
+
+			for(int rayStateIter = 0;
+			    rayStateIter < global_size[0] * global_size[1];
+			    ++rayStateIter)
+			{
+				if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) {
+					/* Not all rays are RAY_INACTIVE. */
+					activeRaysAvailable = true;
+					break;
+				}
+			}
+
+			if(activeRaysAvailable) {
+				numHostIntervention++;
+				PathIteration_times = PATH_ITER_INC_FACTOR;
+				/* Host intervention done before all rays become RAY_INACTIVE;
+				 * Set do more initial iterations for the next tile.
+				 */
+				numNextPathIterTimes += PATH_ITER_INC_FACTOR;
+			}
+		}
+
+		/* Execute SumALLRadiance kernel to accumulate radiance calculated in
+		 * per_sample_output_buffers into RenderTile's output buffer.
+		 */
+		size_t sum_all_radiance_local_size[2] = {16, 16};
+		size_t sum_all_radiance_global_size[2];
+		sum_all_radiance_global_size[0] =
+			(((d_w - 1) / sum_all_radiance_local_size[0]) + 1) *
+			sum_all_radiance_local_size[0];
+		sum_all_radiance_global_size[1] =
+			(((d_h - 1) / sum_all_radiance_local_size[1]) + 1) *
+			sum_all_radiance_local_size[1];
+		ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
+		                     sum_all_radiance_global_size,
+		                     sum_all_radiance_local_size);
+
+#undef ENQUEUE_SPLIT_KERNEL
+#undef GLUE
+
+		if(numHostIntervention == 0) {
+			/* This means that we are executing kernel more than required
+			 * Must avoid this for the next sample/tile.
+			 */
+			PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ?
+			PATH_ITER_INC_FACTOR : numNextPathIterTimes - PATH_ITER_INC_FACTOR;
+		}
+		else {
+			/* Number of path-iterations done for this tile is set as
+			 * Initial path-iteration times for the next tile
+			 */
+			PathIteration_times = numNextPathIterTimes;
+		}
+
+		first_tile = false;
+	}
+
+	/* Calculates the amount of memory that has to be always
+	 * allocated in order for the split kernel to function.
+	 * This memory is tile/scene-property invariant (meaning,
+	 * the value returned by this function does not depend
+	 * on the user set tile size or scene properties.
+	 */
+	size_t get_invariable_mem_allocated()
+	{
+		size_t total_invariable_mem_allocated = 0;
+		size_t KernelGlobals_size = 0;
+		size_t ShaderData_SOA_size = 0;
+
+		KernelGlobals_size = get_KernelGlobals_size();
+		ShaderData_SOA_size = get_shaderdata_soa_size();
+
+		total_invariable_mem_allocated += KernelGlobals_size; /* KernelGlobals size */
+		total_invariable_mem_allocated += NUM_QUEUES * sizeof(unsigned int); /* Queue index size */
+		total_invariable_mem_allocated += sizeof(char); /* use_queues_flag size */
+		total_invariable_mem_allocated += ShaderData_SOA_size; /* sd size */
+		total_invariable_mem_allocated += ShaderData_SOA_size; /* sd_DL_shadow size */
+
+		return total_invariable_mem_allocated;
+	}
+
+	/* Calculate the memory that has-to-be/has-been allocated for
+	 * the split kernel to function.
+	 */
+	size_t get_tile_specific_mem_allocated(const int2 tile_size)
+	{
+		size_t tile_specific_mem_allocated = 0;
+
+		/* Get required tile info */
+		unsigned int user_set_tile_w = tile_size.x;
+		unsigned int user_set_tile_h = tile_size.y;
+
+#ifdef __WORK_STEALING__
+		/* Calculate memory to be allocated for work_pools in
+		 * case of work_stealing.
+		 */
+		size_t max_global_size[2];
+		size_t max_num_work_pools = 0;
+		max_global_size[0] =
+			(((user_set_tile_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_X;
+		max_global_size[1] =
+			(((user_set_tile_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_Y;
+		max_num_work_pools =
+			(max_global_size[0] * max_global_size[1]) /
+			(SPLIT_KERNEL_LOCAL_SIZE_X * SPLIT_KERNEL_LOCAL_SIZE_Y);
+		tile_specific_mem_allocated += max_num_work_pools * sizeof(unsigned int);
+#endif
+
+		tile_specific_mem_allocated +=
+			user_set_tile_w * user_set_tile_h * per_thread_output_buffer_size;
+		tile_specific_mem_allocated +=
+			user_set_tile_w * user_set_tile_h * sizeof(RNG);
+
+		return tile_specific_mem_allocated;
+	}
+
+	/* Calculates the texture memories and KernelData (d_data) memory
+	 * that has been allocated.
+	 */
+	size_t get_scene_specific_mem_allocated(cl_mem d_data)
+	{
+		size_t scene_specific_mem_allocated = 0;
+		/* Calculate texture memories. */
+#define KERNEL_TEX(type, ttype, name) \
+	scene_specific_mem_allocated += get_tex_size(#name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+		size_t d_data_size;
+		ciErr = clGetMemObjectInfo(d_data,
+		                           CL_MEM_SIZE,
+		                           sizeof(d_data_size),
+		                           &d_data_size,
+		                           NULL);
+		assert(ciErr == CL_SUCCESS && "Can't get d_data mem object info");
+		scene_specific_mem_allocated += d_data_size;
+		return scene_specific_mem_allocated;
+	}
+
+	/* Calculate the memory required for one thread in split kernel. */
+	size_t get_per_thread_memory()
+	{
+		size_t shader_closure_size = 0;
+		size_t shaderdata_volume = 0;
+		shader_closure_size = get_shader_closure_size(current_max_closure);
+		/* TODO(sergey): This will actually over-allocate if
+		 * particular kernel does not support multiclosure.
+		 */
+		shaderdata_volume = get_shader_data_size(shader_closure_size);
+		size_t retval = sizeof(RNG)
+			+ sizeof(float3)          /* Throughput size */
+			+ sizeof(float)           /* L transparent size */
+			+ sizeof(char)            /* Ray state size */
+			+ sizeof(unsigned int)    /* Work element size */
+			+ sizeof(int)             /* ISLamp_size */
+			+ sizeof(PathRadiance) + sizeof(Ray) + sizeof(PathState)
+			+ sizeof(Intersection)    /* Overall isect */
+			+ sizeof(Intersection)    /* Instersection_coop_AO */
+			+ sizeof(Intersection)    /* Intersection coop DL */
+			+ shaderdata_volume       /* Overall ShaderData */
+			+ (shaderdata_volume * 2) /* ShaderData : DL and shadow */
+			+ sizeof(Ray) + sizeof(BsdfEval)
+			+ sizeof(float3)          /* AOAlpha size */
+			+ sizeof(float3)          /* AOBSDF size */
+			+ sizeof(Ray)
+			+ (sizeof(int) * NUM_QUEUES)
+			+ per_thread_output_buffer_size;
+		return retval;
+	}
+
+	/* Considers the total memory available in the device and
+	 * and returns the maximum global work size possible.
+	 */
+	size_t get_feasible_global_work_size(int2 tile_size, cl_mem d_data)
+	{
+		/* Calculate invariably allocated memory. */
+		size_t invariable_mem_allocated = get_invariable_mem_allocated();
+		/* Calculate tile specific allocated memory. */
+		size_t tile_specific_mem_allocated =
+			get_tile_specific_mem_allocated(tile_size);
+		/* Calculate scene specific allocated memory. */
+		size_t scene_specific_mem_allocated =
+			get_scene_specific_mem_allocated(d_data);
+		/* Calculate total memory available for the threads in global work size. */
+		size_t available_memory = total_allocatable_memory
+			- invariable_mem_allocated
+			- tile_specific_mem_allocated
+			- scene_specific_mem_allocated
+			- DATA_ALLOCATION_MEM_FACTOR;
+		size_t per_thread_memory_required = get_per_thread_memory();
+		return (available_memory / per_thread_memory_required);
+	}
+
+	/* Checks if the device has enough memory to render the whole tile;
+	 * If not, we should split single tile into multiple tiles of small size
+	 * and process them all.
+	 */
+	bool need_to_split_tile(unsigned int d_w,
+	                        unsigned int d_h,
+	                        int2 max_render_feasible_tile_size)
+	{
+		size_t global_size_estimate[2];
+		/* TODO(sergey): Such round-ups are in quite few places, need to replace
+		 * them with an utility macro.
+		 */
+		global_size_estimate[0] =
+			(((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_X;
+		global_size_estimate[1] =
+			(((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_Y;
+		if((global_size_estimate[0] * global_size_estimate[1]) >
+		   (max_render_feasible_tile_size.x * max_render_feasible_tile_size.y))
 		{
-			run = function_bind(&OpenCLDevice::thread_run, device, this);
+			return true;
 		}
-	};
+		else {
+			return false;
+		}
+	}
 
-	int get_split_task_count(DeviceTask& task)
+	/* Considers the scene properties, global memory available in the device
+	 * and returns a rectanglular tile dimension (approx the maximum)
+	 * that should render on split kernel.
+	 */
+	int2 get_max_render_feasible_tile_size(size_t feasible_global_work_size)
 	{
-		return 1;
+		int2 max_render_feasible_tile_size;
+		int square_root_val = (int)sqrt(feasible_global_work_size);
+		max_render_feasible_tile_size.x = square_root_val;
+		max_render_feasible_tile_size.y = square_root_val;
+		/* Ciel round-off max_render_feasible_tile_size. */
+		int2 ceil_render_feasible_tile_size;
+		ceil_render_feasible_tile_size.x =
+			(((max_render_feasible_tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_X;
+		ceil_render_feasible_tile_size.y =
+			(((max_render_feasible_tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_Y;
+		if(ceil_render_feasible_tile_size.x * ceil_render_feasible_tile_size.y <=
+		   feasible_global_work_size)
+		{
+			return ceil_render_feasible_tile_size;
+		}
+		/* Floor round-off max_render_feasible_tile_size. */
+		int2 floor_render_feasible_tile_size;
+		floor_render_feasible_tile_size.x =
+			(max_render_feasible_tile_size.x / SPLIT_KERNEL_LOCAL_SIZE_X) *
+			SPLIT_KERNEL_LOCAL_SIZE_X;
+		floor_render_feasible_tile_size.y =
+			(max_render_feasible_tile_size.y / SPLIT_KERNEL_LOCAL_SIZE_Y) *
+			SPLIT_KERNEL_LOCAL_SIZE_Y;
+		return floor_render_feasible_tile_size;
 	}
 
-	void task_add(DeviceTask& task)
+	/* Try splitting the current tile into multiple smaller
+	 * almost-square-tiles.
+	 */
+	int2 get_split_tile_size(RenderTile rtile,
+	                         int2 max_render_feasible_tile_size)
 	{
-		task_pool.push(new OpenCLDeviceTask(this, task));
+		int2 split_tile_size;
+		int num_global_threads = max_render_feasible_tile_size.x *
+		                         max_render_feasible_tile_size.y;
+		int d_w = rtile.w;
+		int d_h = rtile.h;
+		/* Ceil round off d_w and d_h */
+		d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_X;
+		d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_Y;
+		while(d_w * d_h > num_global_threads) {
+			/* Halve the longer dimension. */
+			if(d_w >= d_h) {
+				d_w = d_w / 2;
+				d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+					SPLIT_KERNEL_LOCAL_SIZE_X;
+			}
+			else {
+				d_h = d_h / 2;
+				d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+					SPLIT_KERNEL_LOCAL_SIZE_Y;
+			}
+		}
+		split_tile_size.x = d_w;
+		split_tile_size.y = d_h;
+		return split_tile_size;
 	}
 
-	void task_wait()
+	/* Splits existing tile into multiple tiles of tile size split_tile_size. */
+	vector<SplitRenderTile> split_tiles(RenderTile rtile, int2 split_tile_size)
 	{
-		task_pool.wait();
+		vector<SplitRenderTile> to_path_trace_rtile;
+		int d_w = rtile.w;
+		int d_h = rtile.h;
+		int num_tiles_x = (((d_w - 1) / split_tile_size.x) + 1);
+		int num_tiles_y = (((d_h - 1) / split_tile_size.y) + 1);
+		/* Buffer and rng_state offset calc. */
+		size_t offset_index = rtile.offset + (rtile.x + rtile.y * rtile.stride);
+		size_t offset_x = offset_index % rtile.stride;
+		size_t offset_y = offset_index / rtile.stride;
+		/* Resize to_path_trace_rtile. */
+		to_path_trace_rtile.resize(num_tiles_x * num_tiles_y);
+		for(int tile_iter_y = 0; tile_iter_y < num_tiles_y; tile_iter_y++) {
+			for(int tile_iter_x = 0; tile_iter_x < num_tiles_x; tile_iter_x++) {
+				int rtile_index = tile_iter_y * num_tiles_x + tile_iter_x;
+				to_path_trace_rtile[rtile_index].rng_state_offset_x = offset_x + tile_iter_x * split_tile_size.x;
+				to_path_trace_rtile[rtile_index].rng_state_offset_y = offset_y + tile_iter_y * split_tile_size.y;
+				to_path_trace_rtile[rtile_index].buffer_offset_x = offset_x + tile_iter_x * split_tile_size.x;
+				to_path_trace_rtile[rtile_index].buffer_offset_y = offset_y + tile_iter_y * split_tile_size.y;
+				to_path_trace_rtile[rtile_index].start_sample = rtile.start_sample;
+				to_path_trace_rtile[rtile_index].num_samples = rtile.num_samples;
+				to_path_trace_rtile[rtile_index].sample = rtile.sample;
+				to_path_trace_rtile[rtile_index].resolution = rtile.resolution;
+				to_path_trace_rtile[rtile_index].offset = rtile.offset;
+				to_path_trace_rtile[rtile_index].buffers = rtile.buffers;
+				to_path_trace_rtile[rtile_index].buffer = rtile.buffer;
+				to_path_trace_rtile[rtile_index].rng_state = rtile.rng_state;
+				to_path_trace_rtile[rtile_index].x = rtile.x + (tile_iter_x * split_tile_size.x);
+				to_path_trace_rtile[rtile_index].y = rtile.y + (tile_iter_y * split_tile_size.y);
+				to_path_trace_rtile[rtile_index].buffer_rng_state_stride = rtile.stride;
+				/* Fill width and height of the new render tile. */
+				to_path_trace_rtile[rtile_index].w = (tile_iter_x == (num_tiles_x - 1)) ?
+					(d_w - (tile_iter_x * split_tile_size.x)) /* Border tile */
+					: split_tile_size.x;
+				to_path_trace_rtile[rtile_index].h = (tile_iter_y == (num_tiles_y - 1)) ?
+					(d_h - (tile_iter_y * split_tile_size.y)) /* Border tile */
+					: split_tile_size.y;
+				to_path_trace_rtile[rtile_index].stride = to_path_trace_rtile[rtile_index].w;
+			}
+		}
+		return to_path_trace_rtile;
 	}
 
-	void task_cancel()
+	void thread_run(DeviceTask *task)
 	{
-		task_pool.cancel();
+		if(task->type == DeviceTask::FILM_CONVERT) {
+			film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
+		}
+		else if(task->type == DeviceTask::SHADER) {
+			shader(*task);
+		}
+		else if(task->type == DeviceTask::PATH_TRACE) {
+			RenderTile tile;
+			bool initialize_data_and_check_render_feasibility = false;
+			bool need_to_split_tiles_further = false;
+			int2 max_render_feasible_tile_size;
+			size_t feasible_global_work_size;
+			const int2 tile_size = task->requested_tile_size;
+			/* Keep rendering tiles until done. */
+			while(task->acquire_tile(this, tile)) {
+				if(!initialize_data_and_check_render_feasibility) {
+					/* Initialize data. */
+					/* Calculate per_thread_output_buffer_size. */
+					size_t output_buffer_size = 0;
+					ciErr = clGetMemObjectInfo((cl_mem)tile.buffer,
+					                           CL_MEM_SIZE,
+					                           sizeof(output_buffer_size),
+					                           &output_buffer_size,
+					                           NULL);
+					assert(ciErr == CL_SUCCESS && "Can't get tile.buffer mem object info");
+					/* This value is different when running on AMD and NV. */
+					if(background) {
+						/* In offline render the number of buffer elements
+						 * associated with tile.buffer is the current tile size.
+						 */
+						per_thread_output_buffer_size =
+							output_buffer_size / (tile.w * tile.h);
+					}
+					else {
+						/* interactive rendering, unlike offline render, the number of buffer elements
+						 * associated with tile.buffer is the entire viewport size.
+						 */
+						per_thread_output_buffer_size =
+							output_buffer_size / (tile.buffers->params.width *
+							                      tile.buffers->params.height);
+					}
+					/* Check render feasibility. */
+					feasible_global_work_size = get_feasible_global_work_size(
+						tile_size,
+						CL_MEM_PTR(const_mem_map["__data"]->device_pointer));
+					max_render_feasible_tile_size =
+						get_max_render_feasible_tile_size(
+							feasible_global_work_size);
+					need_to_split_tiles_further =
+						need_to_split_tile(tile_size.x,
+						                   tile_size.y,
+						                   max_render_feasible_tile_size);
+					initialize_data_and_check_render_feasibility = true;
+				}
+				if(need_to_split_tiles_further) {
+					int2 split_tile_size =
+						get_split_tile_size(tile,
+						                    max_render_feasible_tile_size);
+					vector<SplitRenderTile> to_path_trace_render_tiles =
+						split_tiles(tile, split_tile_size);
+					/* Print message to console */
+					if(background && (to_path_trace_render_tiles.size() > 1)) {
+						fprintf(stderr, "Message : Tiles need to be split "
+						        "further inside path trace (due to insufficient "
+						        "device-global-memory for split kernel to "
+						        "function) \n"
+						        "The current tile of dimensions %dx%d is split "
+						        "into tiles of dimension %dx%d for render \n",
+						        tile.w, tile.h,
+						        split_tile_size.x,
+						        split_tile_size.y);
+					}
+					/* Process all split tiles. */
+					for(int tile_iter = 0;
+					    tile_iter < to_path_trace_render_tiles.size();
+					    ++tile_iter)
+					{
+						path_trace(to_path_trace_render_tiles[tile_iter],
+						           max_render_feasible_tile_size);
+					}
+				}
+				else {
+					/* No splitting required; process the entire tile at once. */
+					/* Render feasible tile size is user-set-tile-size itself. */
+					max_render_feasible_tile_size.x =
+						(((tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+						SPLIT_KERNEL_LOCAL_SIZE_X;
+					max_render_feasible_tile_size.y =
+						(((tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+						SPLIT_KERNEL_LOCAL_SIZE_Y;
+					/* buffer_rng_state_stride is stride itself. */
+					SplitRenderTile split_tile(tile);
+					split_tile.buffer_rng_state_stride = tile.stride;
+					path_trace(split_tile, max_render_feasible_tile_size);
+				}
+				tile.sample = tile.start_sample + tile.num_samples;
+
+				/* Complete kernel execution before release tile. */
+				/* This helps in multi-device render;
+				 * The device that reaches the critical-section function
+				 * release_tile waits (stalling other devices from entering
+				 * release_tile) for all kernels to complete. If device1 (a
+				 * slow-render device) reaches release_tile first then it would
+				 * stall device2 (a fast-render device) from proceeding to render
+				 * next tile.
+				 */
+				clFinish(cqCommandQueue);
+
+				task->release_tile(tile);
+			}
+		}
+	}
+
+protected:
+	cl_mem mem_alloc(size_t bufsize, cl_mem_flags mem_flag = CL_MEM_READ_WRITE)
+	{
+		cl_mem ptr;
+		ptr = clCreateBuffer(cxContext, mem_flag, bufsize, NULL, &ciErr);
+		if(opencl_error(ciErr)) {
+			assert(0);
+		}
+		return ptr;
 	}
 };
 
 Device *device_opencl_create(DeviceInfo& info, Stats &stats, bool background)
 {
-	return new OpenCLDevice(info, stats, background);
+	vector<OpenCLPlatformDevice> usable_devices;
+	opencl_get_usable_devices(&usable_devices);
+	assert(info.num < usable_devices.size());
+	OpenCLPlatformDevice& platform_device = usable_devices[info.num];
+	char name[256];
+	if(clGetPlatformInfo(platform_device.platform_id,
+	                     CL_PLATFORM_NAME,
+	                     sizeof(name),
+	                     &name,
+	                     NULL) != CL_SUCCESS)
+	{
+		VLOG(1) << "Failed to retrieve platform name, using mega kernel.";
+		return new OpenCLDeviceMegaKernel(info, stats, background);
+	}
+	string platform_name = name;
+	cl_device_type device_type;
+	if(clGetDeviceInfo(platform_device.device_id,
+	                   CL_DEVICE_TYPE,
+	                   sizeof(cl_device_type),
+	                   &device_type,
+	                   NULL) != CL_SUCCESS)
+	{
+		VLOG(1) << "Failed to retrieve device type, using mega kernel,";
+		return new OpenCLDeviceMegaKernel(info, stats, background);
+	}
+	if(opencl_kernel_use_split(platform_name, device_type)) {
+		VLOG(1) << "Using split kernel.";
+		return new OpenCLDeviceSplitKernel(info, stats, background);
+	} else {
+		VLOG(1) << "Using mega kernel.";
+		return new OpenCLDeviceMegaKernel(info, stats, background);
+	}
 }
 
-bool device_opencl_init(void) {
+bool device_opencl_init(void)
+{
 	static bool initialized = false;
 	static bool result = false;
 
-	if (initialized)
+	if(initialized)
 		return result;
 
 	initialized = true;
 
-	// OpenCL disabled for now, only works with this environment variable set
-	if(!getenv("CYCLES_OPENCL_TEST")) {
-		result = false;
-	}
-	else {
-		result = clewInit() == CLEW_SUCCESS;
-	}
+	result = clewInit() == CLEW_SUCCESS;
 
 	return result;
 }
 
 void device_opencl_info(vector<DeviceInfo>& devices)
 {
-	vector<cl_device_id> device_ids;
-	cl_uint num_devices = 0;
-	vector<cl_platform_id> platform_ids;
-	cl_uint num_platforms = 0;
-
-	/* get devices */
-	if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS || num_platforms == 0)
-		return;
-	
-	platform_ids.resize(num_platforms);
-
-	if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS)
-		return;
-
-	/* devices are numbered consecutively across platforms */
-	int num_base = 0;
-
-	for (int platform = 0; platform < num_platforms; platform++, num_base += num_devices) {
-		num_devices = 0;
-		if(clGetDeviceIDs(platform_ids[platform], opencl_device_type(), 0, NULL, &num_devices) != CL_SUCCESS || num_devices == 0)
+	vector<OpenCLPlatformDevice> usable_devices;
+	opencl_get_usable_devices(&usable_devices);
+	/* Devices are numbered consecutively across platforms. */
+	int num_devices = 0;
+	foreach(OpenCLPlatformDevice& platform_device, usable_devices) {
+		cl_platform_id platform_id = platform_device.platform_id;
+		cl_device_id device_id = platform_device.device_id;
+		/* We always increment the device number, so there;s 1:1 mapping from
+		 * info.num to indexinside usable_devices vector.
+		 */
+		++num_devices;
+		char platform_name[256];
+		if(clGetPlatformInfo(platform_id,
+		                     CL_PLATFORM_NAME,
+		                     sizeof(platform_name),
+		                     &platform_name,
+		                     NULL) != CL_SUCCESS)
+		{
 			continue;
-
-		device_ids.resize(num_devices);
-
-		if(clGetDeviceIDs(platform_ids[platform], opencl_device_type(), num_devices, &device_ids[0], NULL) != CL_SUCCESS)
+		}
+		char device_name[1024] = "\0";
+		if(clGetDeviceInfo(device_id,
+		                   CL_DEVICE_NAME,
+		                   sizeof(device_name),
+		                   &device_name,
+		                   NULL) != CL_SUCCESS)
+		{
+			continue;
+		}
+		cl_device_type device_type;
+		if(clGetDeviceInfo(device_id,
+		                   CL_DEVICE_TYPE,
+		                   sizeof(cl_device_type),
+		                   &device_type,
+		                   NULL) != CL_SUCCESS)
+		{
 			continue;
+		}
+		DeviceInfo info;
+		info.type = DEVICE_OPENCL;
+		info.description = string_remove_trademark(string(device_name));
+		info.num = num_devices - 1;
+		info.id = string_printf("OPENCL_%d", info.num);
+		/* We don't know if it's used for display, but assume it is. */
+		info.display_device = true;
+		info.advanced_shading = opencl_kernel_use_advanced_shading(platform_name);
+		info.pack_images = true;
+		info.use_split_kernel = opencl_kernel_use_split(platform_name,
+		                                                device_type);
+		devices.push_back(info);
+	}
+}
 
-		char pname[256];
-		clGetPlatformInfo(platform_ids[platform], CL_PLATFORM_NAME, sizeof(pname), &pname, NULL);
-		string platform_name = pname;
+string device_opencl_capabilities(void)
+{
+	string result = "";
+	string error_msg = "";  /* Only used by opencl_assert(), but in the future
+	                         * it could also be nicely reported to the console.
+	                         */
+	cl_uint num_platforms = 0;
+	opencl_assert(clGetPlatformIDs(0, NULL, &num_platforms));
+	if(num_platforms == 0) {
+		return "No OpenCL platforms found\n";
+	}
+	result += string_printf("Number of platforms: %u\n", num_platforms);
 
-		/* add devices */
-		for(int num = 0; num < num_devices; num++) {
-			cl_device_id device_id = device_ids[num];
-			char name[1024] = "\0";
+	vector<cl_platform_id> platform_ids;
+	platform_ids.resize(num_platforms);
+	opencl_assert(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL));
+
+#define APPEND_STRING_INFO(func, id, name, what) \
+	do { \
+		char data[1024] = "\0"; \
+		opencl_assert(func(id, what, sizeof(data), &data, NULL)); \
+		result += string_printf("%s: %s\n", name, data); \
+	} while(false)
+#define APPEND_PLATFORM_STRING_INFO(id, name, what) \
+	APPEND_STRING_INFO(clGetPlatformInfo, id, "\tPlatform " name, what)
+#define APPEND_DEVICE_STRING_INFO(id, name, what) \
+	APPEND_STRING_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what)
 
-			if(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(name), &name, NULL) != CL_SUCCESS)
-				continue;
+	vector<cl_device_id> device_ids;
+	for (cl_uint platform = 0; platform < num_platforms; ++platform) {
+		cl_platform_id platform_id = platform_ids[platform];
+
+		result += string_printf("Platform #%u\n", platform);
 
-			DeviceInfo info;
+		APPEND_PLATFORM_STRING_INFO(platform_id, "Name", CL_PLATFORM_NAME);
+		APPEND_PLATFORM_STRING_INFO(platform_id, "Vendor", CL_PLATFORM_VENDOR);
+		APPEND_PLATFORM_STRING_INFO(platform_id, "Version", CL_PLATFORM_VERSION);
+		APPEND_PLATFORM_STRING_INFO(platform_id, "Profile", CL_PLATFORM_PROFILE);
+		APPEND_PLATFORM_STRING_INFO(platform_id, "Extensions", CL_PLATFORM_EXTENSIONS);
 
-			info.type = DEVICE_OPENCL;
-			info.description = string(name);
-			info.num = num_base + num;
-			info.id = string_printf("OPENCL_%d", info.num);
-			/* we don't know if it's used for display, but assume it is */
-			info.display_device = true;
-			info.advanced_shading = opencl_kernel_use_advanced_shading(platform_name);
-			info.pack_images = true;
+		cl_uint num_devices = 0;
+		opencl_assert(clGetDeviceIDs(platform_ids[platform],
+		                             CL_DEVICE_TYPE_ALL,
+		                             0,
+		                             NULL,
+		                             &num_devices));
+		result += string_printf("\tNumber of devices: %u\n", num_devices);
 
-			devices.push_back(info);
+		device_ids.resize(num_devices);
+		opencl_assert(clGetDeviceIDs(platform_ids[platform],
+		                             CL_DEVICE_TYPE_ALL,
+		                             num_devices,
+		                             &device_ids[0],
+		                             NULL));
+		for (cl_uint device = 0; device < num_devices; ++device) {
+			cl_device_id device_id = device_ids[device];
+
+			result += string_printf("\t\tDevice: #%u\n", device);
+
+			APPEND_DEVICE_STRING_INFO(device_id, "Name", CL_DEVICE_NAME);
+			APPEND_DEVICE_STRING_INFO(device_id, "Vendor", CL_DEVICE_VENDOR);
+			APPEND_DEVICE_STRING_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION);
+			APPEND_DEVICE_STRING_INFO(device_id, "Profile", CL_DEVICE_PROFILE);
+			APPEND_DEVICE_STRING_INFO(device_id, "Version", CL_DEVICE_VERSION);
+			APPEND_DEVICE_STRING_INFO(device_id, "Extensions", CL_DEVICE_EXTENSIONS);
 		}
 	}
-}
 
-string device_opencl_capabilities(void)
-{
-	/* TODO(sergey): Not implemented yet. */
-	return "";
+#undef APPEND_STRING_INFO
+#undef APPEND_PLATFORM_STRING_INFO
+#undef APPEND_DEVICE_STRING_INFO
+
+	return result;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index 2fe2f334176..d527540f300 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -111,7 +111,7 @@ void DeviceTask::update_progress(RenderTile *rtile)
 	if(update_tile_sample) {
 		double current_time = time_dt();
 
-		if (current_time - last_update_time >= 1.0) {
+		if(current_time - last_update_time >= 1.0) {
 			update_tile_sample(*rtile);
 
 			last_update_time = current_time;
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 84945bcf9a5..834ea60988a 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -57,14 +57,15 @@ public:
 
 	void update_progress(RenderTile *rtile);
 
-	boost::function<bool(Device *device, RenderTile&)> acquire_tile;
-	boost::function<void(void)> update_progress_sample;
-	boost::function<void(RenderTile&)> update_tile_sample;
-	boost::function<void(RenderTile&)> release_tile;
-	boost::function<bool(void)> get_cancel;
+	function<bool(Device *device, RenderTile&)> acquire_tile;
+	function<void(void)> update_progress_sample;
+	function<void(RenderTile&)> update_tile_sample;
+	function<void(RenderTile&)> release_tile;
+	function<bool(void)> get_cancel;
 
 	bool need_finish_queue;
 	bool integrator_branched;
+	int2 requested_tile_size;
 protected:
 	double last_update_time;
 };
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index a25eb3f5b50..18d1360542c 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -1,3 +1,4 @@
+remove_extra_strict_flags()
 
 set(INC
 	.
@@ -11,9 +12,20 @@ set(INC_SYS
 )
 
 set(SRC
-	kernel.cpp
-	kernel.cl
-	kernel.cu
+	kernels/cpu/kernel.cpp
+	kernels/opencl/kernel.cl
+	kernels/opencl/kernel_data_init.cl
+	kernels/opencl/kernel_queue_enqueue.cl
+	kernels/opencl/kernel_scene_intersect.cl
+	kernels/opencl/kernel_lamp_emission.cl
+	kernels/opencl/kernel_background_buffer_update.cl
+	kernels/opencl/kernel_shader_eval.cl
+	kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+	kernels/opencl/kernel_direct_lighting.cl
+	kernels/opencl/kernel_shadow_blocked.cl
+	kernels/opencl/kernel_next_iteration_setup.cl
+	kernels/opencl/kernel_sum_all_radiance.cl
+	kernels/cuda/kernel.cu
 )
 
 set(SRC_HEADERS
@@ -35,17 +47,22 @@ set(SRC_HEADERS
 	kernel_montecarlo.h
 	kernel_passes.h
 	kernel_path.h
+	kernel_path_branched.h
+	kernel_path_common.h
 	kernel_path_state.h
 	kernel_path_surface.h
 	kernel_path_volume.h
 	kernel_projection.h
+	kernel_queues.h
 	kernel_random.h
 	kernel_shader.h
+	kernel_shaderdata_vars.h
 	kernel_shadow.h
 	kernel_subsurface.h
 	kernel_textures.h
 	kernel_types.h
 	kernel_volume.h
+	kernel_work_stealing.h
 )
 
 set(SRC_CLOSURE_HEADERS
@@ -67,6 +84,7 @@ set(SRC_CLOSURE_HEADERS
 	closure/emissive.h
 	closure/volume.h
 )
+
 set(SRC_SVM_HEADERS
 	svm/svm.h
 	svm/svm_attribute.h
@@ -118,6 +136,7 @@ set(SRC_GEOM_HEADERS
 	geom/geom_bvh_subsurface.h
 	geom/geom_bvh_traversal.h
 	geom/geom_bvh_volume.h
+	geom/geom_bvh_volume_all.h
 	geom/geom_curve.h
 	geom/geom_motion_curve.h
 	geom/geom_motion_triangle.h
@@ -128,12 +147,14 @@ set(SRC_GEOM_HEADERS
 	geom/geom_qbvh_subsurface.h
 	geom/geom_qbvh_traversal.h
 	geom/geom_qbvh_volume.h
+	geom/geom_qbvh_volume_all.h
 	geom/geom_triangle.h
 	geom/geom_triangle_intersect.h
 	geom/geom_volume.h
 )
 
 set(SRC_UTIL_HEADERS
+	../util/util_atomic.h
 	../util/util_color.h
 	../util/util_half.h
 	../util/util_math.h
@@ -141,6 +162,21 @@ set(SRC_UTIL_HEADERS
 	../util/util_transform.h
 	../util/util_types.h
 )
+
+set(SRC_SPLIT_HEADERS
+	split/kernel_background_buffer_update.h
+	split/kernel_data_init.h
+	split/kernel_direct_lighting.h
+	split/kernel_holdout_emission_blurring_pathtermination_ao.h
+	split/kernel_lamp_emission.h
+	split/kernel_next_iteration_setup.h
+	split/kernel_scene_intersect.h
+	split/kernel_shader_eval.h
+	split/kernel_shadow_blocked.h
+	split/kernel_split_common.h
+	split/kernel_sum_all_radiance.h
+)
+
 # CUDA module
 
 if(WITH_CYCLES_CUDA_BINARIES)
@@ -166,12 +202,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	endif()
 
 	# build for each arch
-	set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
+	set(cuda_sources kernels/cuda/kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
 	set(cuda_cubins)
 
 	macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
 		if(${experimental})
-			set(cuda_extra_flags "-D__KERNEL_CUDA_EXPERIMENTAL__")
+			set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__")
 			set(cuda_cubin kernel_experimental_${arch}.cubin)
 		else()
 			set(cuda_extra_flags "")
@@ -192,7 +228,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
 			COMMAND ${CUDA_NVCC_EXECUTABLE}
 					-arch=${arch}
 					-m${CUDA_BITS}
-					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu
+					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu
 					-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
 					--ptxas-options="-v"
 					${cuda_arch_flags}
@@ -240,28 +276,28 @@ include_directories(SYSTEM ${INC_SYS})
 
 if(CXX_HAS_SSE)
 	list(APPEND SRC
-		kernel_sse2.cpp
-		kernel_sse3.cpp
-		kernel_sse41.cpp
+		kernels/cpu/kernel_sse2.cpp
+		kernels/cpu/kernel_sse3.cpp
+		kernels/cpu/kernel_sse41.cpp
 	)
 
-	set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
-	set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
-	set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
 endif()
 
 if(CXX_HAS_AVX)
 	list(APPEND SRC
-		kernel_avx.cpp
+		kernels/cpu/kernel_avx.cpp
 	)
-	set_source_files_properties(kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
 endif()
 
 if(CXX_HAS_AVX2)
 	list(APPEND SRC
-		kernel_avx2.cpp
+		kernels/cpu/kernel_avx2.cpp
 	)
-	set_source_files_properties(kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
 endif()
 
 add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS})
@@ -280,11 +316,23 @@ endif()
 #add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED})
 #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
 
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cl" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cu" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_background_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/split)
 
diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript
index c0d969e24ae..e8d51013924 100644
--- a/intern/cycles/kernel/SConscript
+++ b/intern/cycles/kernel/SConscript
@@ -57,8 +57,9 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     build_dir = os.path.join(root_build_dir, 'intern/cycles/kernel')
 
     # source directories and files
+    kernel_file_rel = os.path.join("kernels", "cuda", "kernel.cu")
     source_dir = Dir('.').srcnode().path
-    kernel_file = os.path.join(source_dir, "kernel.cu")
+    kernel_file = os.path.join(source_dir, kernel_file_rel)
     util_dir = os.path.join(source_dir, "../util")
     svm_dir = os.path.join(source_dir, "../svm")
     geom_dir = os.path.join(source_dir, "../geom")
@@ -83,11 +84,11 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
         nvcc_flags += " -D__KERNEL_DEBUG__"
 
     # dependencies
-    dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h')
+    dependencies = [kernel_file_rel] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h')
     last_cubin_file = None
 
     configs = (("kernel_%s.cubin", ''),
-               ("kernel_experimental_%s.cubin", ' -D__KERNEL_CUDA_EXPERIMENTAL__'))
+               ("kernel_experimental_%s.cubin", ' -D__KERNEL_EXPERIMENTAL__'))
 
     # add command for each cuda architecture
     for arch in cuda_archs:
@@ -105,7 +106,7 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
             else:
                 command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, current_flags, kernel_file, cubin_file)
 
-            kernel.Command(cubin_file, 'kernel.cu', command)
+            kernel.Command(cubin_file, kernel_file_rel, command)
             kernel.Depends(cubin_file, dependencies)
 
             kernel_binaries.append(cubin_file)
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 2b9e2a4e44d..558aa0dc6a9 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -47,79 +47,79 @@ ccl_device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const Shader
 	switch(sc->type) {
 		case CLOSURE_BSDF_DIFFUSE_ID:
 		case CLOSURE_BSDF_BSSRDF_ID:
-			label = bsdf_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #ifdef __SVM__
 		case CLOSURE_BSDF_OREN_NAYAR_ID:
-			label = bsdf_oren_nayar_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		/*case CLOSURE_BSDF_PHONG_RAMP_ID:
-			label = bsdf_phong_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-			label = bsdf_diffuse_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;*/
 		case CLOSURE_BSDF_TRANSLUCENT_ID:
-			label = bsdf_translucent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_REFLECTION_ID:
-			label = bsdf_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_REFRACTION_ID:
-			label = bsdf_refraction_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_refraction_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_TRANSPARENT_ID:
-			label = bsdf_transparent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_transparent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_GGX_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-			label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_microfacet_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-			label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_microfacet_beckmann_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-			label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_ashikhmin_shirley_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-			label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_ashikhmin_velvet_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-			label = bsdf_diffuse_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_diffuse_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_GLOSSY_TOON_ID:
-			label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_glossy_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-			label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_hair_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-			label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_hair_transmission_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 #ifdef __VOLUME__
 		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-			label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+			label = volume_henyey_greenstein_sample(sc, ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 		default:
@@ -139,67 +139,67 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
 		return OSLShader::bsdf_eval(sd, sc, omega_in, *pdf);
 #endif
 
-	if(dot(sd->Ng, omega_in) >= 0.0f) {
+	if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) {
 		switch(sc->type) {
 			case CLOSURE_BSDF_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_ID:
-				eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #ifdef __SVM__
 			case CLOSURE_BSDF_OREN_NAYAR_ID:
-				eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			/*case CLOSURE_BSDF_PHONG_RAMP_ID:
-				eval = bsdf_phong_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-				eval = bsdf_diffuse_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;*/
 			case CLOSURE_BSDF_TRANSLUCENT_ID:
-				eval = bsdf_translucent_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFLECTION_ID:
-				eval = bsdf_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFRACTION_ID:
-				eval = bsdf_refraction_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_refraction_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSPARENT_ID:
-				eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_transparent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-				eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-				eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_beckmann_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-				eval = bsdf_diffuse_toon_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
-				eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_glossy_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-				eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-				eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_transmission_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 #ifdef __VOLUME__
 			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+				eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 			default:
@@ -211,57 +211,57 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
 		switch(sc->type) {
 			case CLOSURE_BSDF_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_ID:
-				eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #ifdef __SVM__
 			case CLOSURE_BSDF_OREN_NAYAR_ID:
-				eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_oren_nayar_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSLUCENT_ID:
-				eval = bsdf_translucent_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_translucent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFLECTION_ID:
-				eval = bsdf_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFRACTION_ID:
-				eval = bsdf_refraction_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_refraction_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSPARENT_ID:
-				eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_transparent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-				eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-				eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_beckmann_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-				eval = bsdf_diffuse_toon_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
-				eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_glossy_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-				eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-				eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_transmission_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 #ifdef __VOLUME__
 			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+				eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 			default:
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
index acc477246d2..8d7d533d6f8 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -69,6 +69,9 @@ ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, c
 
 	float out = 0.0f;
 
+	if(fmaxf(sc->data0, sc->data1) <= 1e-4f)
+		return make_float3(0.0f, 0.0f, 0.0f);
+
 	if(NdotI > 0.0f && NdotO > 0.0f) {
 		NdotI = fmaxf(NdotI, 1e-6f);
 		NdotO = fmaxf(NdotO, 1e-6f);
@@ -190,8 +193,15 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 		/* reflect I on H to get omega_in */
 		*omega_in = -I + (2.0f * HdotI) * H;
 
-		/* leave the rest to eval_reflect */
-		*eval = bsdf_ashikhmin_shirley_eval_reflect(sc, I, *omega_in, pdf);
+		if(fmaxf(sc->data0, sc->data1) <= 1e-4f) {
+			/* Some high number for MIS. */
+			*pdf = 1e6f;
+			*eval = make_float3(1e6f, 1e6f, 1e6f);
+		}
+		else {
+			/* leave the rest to eval_reflect */
+			*eval = bsdf_ashikhmin_shirley_eval_reflect(sc, I, *omega_in, pdf);
+		}
 
 #ifdef __RAY_DIFFERENTIALS__
 		/* just do the reflection thing for now */
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index 580f50d5dd6..f1a26650078 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -59,7 +59,7 @@ ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc, co
 		float cosHO = fabsf(dot(I, H));
 
 		if(!(fabsf(cosNH) < 1.0f-1e-5f && cosHO > 1e-5f))
-			return make_float3(0, 0, 0);
+			return make_float3(0.0f, 0.0f, 0.0f);
 
 		float cosNHdivHO = cosNH / cosHO;
 		cosNHdivHO = fmaxf(cosNHdivHO, 1e-5f);
@@ -80,7 +80,7 @@ ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc, co
 		return make_float3(out, out, out);
 	}
 
-	return make_float3(0, 0, 0);
+	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
 ccl_device float3 bsdf_ashikhmin_velvet_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
@@ -114,7 +114,7 @@ ccl_device int bsdf_ashikhmin_velvet_sample(const ShaderClosure *sc, float3 Ng,
 
 			float sinNH2 = 1 - cosNH * cosNH;
 			float sinNH4 = sinNH2 * sinNH2;
-			float cotangent2 =  (cosNH * cosNH) / sinNH2;
+			float cotangent2 = (cosNH * cosNH) / sinNH2;
 
 			float D = expf(-cotangent2 * m_invsigma2) * m_invsigma2 * M_1_PI_F / sinNH4;
 			float G = min(1.0f, min(fac1, fac2)); // TODO: derive G from D analytically
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index cdaf84f1750..e0287e7655a 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -52,6 +52,8 @@ ccl_device float3 bsdf_diffuse_ramp_get_color(const ShaderClosure *sc, const flo
 ccl_device int bsdf_diffuse_ramp_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_DIFFUSE_RAMP_ID;
+	sc->data0 = 0.0f;
+	sc->data1 = 0.0f;
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index 71086f2e764..6a50bbed3b3 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -35,79 +35,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Approximate erf and erfinv implementations.
- * Implementation comes straight from Wikipedia:
- *
- * http://en.wikipedia.org/wiki/Error_function
- *
- * Some constants are baked into the code.
- */
-
-ccl_device_inline float approx_erff_do(float x)
-{
-	/* Such a clamp doesn't give much distortion to the output value
-	 * and gives quite a few of the speedup.
-	 */
-	if(x > 3.0f) {
-		return 1.0f;
-	}
-	float t = 1.0f / (1.0f + 0.47047f*x);
-	return  (1.0f -
-	         t*(0.3480242f + t*(-0.0958798f + t*0.7478556f)) * expf(-x*x));
-}
-
-ccl_device_inline float approx_erff(float x)
-{
-	if(x >= 0.0f) {
-		return approx_erff_do(x);
-	}
-	else {
-		return -approx_erff_do(-x);
-	}
-}
-
-ccl_device_inline float approx_erfinvf_do(float x)
-{
-	if(x <= 0.7f) {
-		const float x2 = x * x;
-		const float a1 =  0.886226899f;
-		const float a2 = -1.645349621f;
-		const float a3 =  0.914624893f;
-		const float a4 = -0.140543331f;
-		const float b1 = -2.118377725f;
-		const float b2 =  1.442710462f;
-		const float b3 = -0.329097515f;
-		const float b4 =  0.012229801f;
-		return x * (((a4 * x2 + a3) * x2 + a2) * x2 + a1) /
-		          ((((b4 * x2 + b3) * x2 + b2) * x2 + b1) * x2 + 1.0f);
-	}
-	else {
-		const float c1 = -1.970840454f;
-		const float c2 = -1.624906493f;
-		const float c3 =  3.429567803f;
-		const float c4 =  1.641345311f;
-		const float d1 =  3.543889200f;
-		const float d2 =  1.637067800f;
-		const float z = sqrtf(-logf((1.0f - x) * 0.5f));
-		return (((c4 * z + c3) * z + c2) * z + c1) /
-		        ((d2 * z + d1) * z + 1.0f);
-	}
-}
-
-ccl_device_inline float approx_erfinvf(float x)
-{
-	if(x >= 0.0f) {
-		return approx_erfinvf_do(x);
-	}
-	else {
-		return -approx_erfinvf_do(-x);
-	}
-}
-
-/* Beckmann and GGX microfacet importance sampling from:
- * 
- * Importance Sampling Microfacet-Based BSDFs using the Distribution of Visible Normals.
- * E. Heitz and E. d'Eon, EGSR 2014 */
+/* Beckmann and GGX microfacet importance sampling. */
 
 ccl_device_inline void microfacet_beckmann_sample_slopes(
 	KernelGlobals *kg,
@@ -128,64 +56,71 @@ ccl_device_inline void microfacet_beckmann_sample_slopes(
 	/* precomputations */
 	const float tan_theta_i = sin_theta_i/cos_theta_i;
 	const float inv_a = tan_theta_i;
-	const float a = 1.0f/inv_a;
-	const float erf_a = approx_erff(a);
-	const float exp_a2 = expf(-a*a);
+	const float cot_theta_i = 1.0f/tan_theta_i;
+	const float erf_a = fast_erff(cot_theta_i);
+	const float exp_a2 = expf(-cot_theta_i*cot_theta_i);
 	const float SQRT_PI_INV = 0.56418958354f;
 	const float Lambda = 0.5f*(erf_a - 1.0f) + (0.5f*SQRT_PI_INV)*(exp_a2*inv_a);
 	const float G1 = 1.0f/(1.0f + Lambda); /* masking */
 
 	*G1i = G1;
 
-#if 0
-	const float C = 1.0f - G1 * erf_a;
-
-	/* sample slope X */
-	if(randu < C) {
-		/* rescale randu */
-		randu = randu / C;
-		const float w_1 = 0.5f * SQRT_PI_INV * sin_theta_i * exp_a2;
-		const float w_2 = cos_theta_i * (0.5f - 0.5f*erf_a);
-		const float p = w_1 / (w_1 + w_2);
+#if defined(__KERNEL_GPU__)
+	/* Based on paper from Wenzel Jakob
+	 * An Improved Visible Normal Sampling Routine for the Beckmann Distribution
+	 *
+	 * http://www.mitsuba-renderer.org/~wenzel/files/visnormal.pdf
+	 *
+	 * Reformulation from OpenShadingLanguage which avoids using inverse
+	 * trigonometric functions.
+	 */
 
-		if(randu < p) {
-			randu = randu / p;
-			*slope_x = -sqrtf(-logf(randu*exp_a2));
-		}
-		else {
-			randu = (randu - p) / (1.0f - p);
-			*slope_x = approx_erfinvf(randu - 1.0f - randu*erf_a);
-		}
+	/* Sample slope X.
+	 *
+	 * Compute a coarse approximation using the approximation:
+	 *   exp(-ierf(x)^2) ~= 1 - x * x
+	 *   solve y = 1 + b + K * (1 - b * b)
+	 */
+	float K = tan_theta_i * SQRT_PI_INV;
+	float y_approx = randu * (1.0f + erf_a + K * (1 - erf_a * erf_a));
+	float y_exact  = randu * (1.0f + erf_a + K * exp_a2);
+	float b = K > 0 ? (0.5f - sqrtf(K * (K - y_approx + 1.0f) + 0.25f)) / K : y_approx - 1.0f;
+
+	/* Perform newton step to refine toward the true root. */
+	float inv_erf = fast_ierff(b);
+	float value  = 1.0f + b + K * expf(-inv_erf * inv_erf) - y_exact;
+	/* Check if we are close enough already,
+	 * this also avoids NaNs as we get close to the root.
+	 */
+	if(fabsf(value) > 1e-6f) {
+		b -= value / (1.0f - inv_erf * tan_theta_i); /* newton step 1. */
+		inv_erf = fast_ierff(b);
+		value  = 1.0f + b + K * expf(-inv_erf * inv_erf) - y_exact;
+		b -= value / (1.0f - inv_erf * tan_theta_i); /* newton step 2. */
+		/* Compute the slope from the refined value. */
+		*slope_x = fast_ierff(b);
 	}
 	else {
-		/* rescale randu */
-		randu = (randu - C) / (1.0f - C);
-		*slope_x = approx_erfinvf((-1.0f + 2.0f*randu)*erf_a);
-
-		const float p = (-(*slope_x)*sin_theta_i + cos_theta_i) / (2.0f*cos_theta_i);
-
-		if(randv > p) {
-			*slope_x = -(*slope_x);
-			randv = (randv - p) / (1.0f - p);
-		}
-		else
-			randv = randv / p;
+		/* We are close enough already. */
+		*slope_x = inv_erf;
 	}
-
-	/* sample slope Y */
-	*slope_y = approx_erfinvf(2.0f*randv - 1.0f);
+	*slope_y = fast_ierff(2.0f*randv - 1.0f);
 #else
-	/* use precomputed table, because it better preserves stratification
-	 * of the random number pattern */
+	/* Use precomputed table on CPU, it gives better perfomance. */
 	int beckmann_table_offset = kernel_data.tables.beckmann_offset;
 
 	*slope_x = lookup_table_read_2D(kg, randu, cos_theta_i,
 		beckmann_table_offset, BECKMANN_TABLE_SIZE, BECKMANN_TABLE_SIZE);
-	*slope_y = approx_erfinvf(2.0f*randv - 1.0f);
+	*slope_y = fast_ierff(2.0f*randv - 1.0f);
 #endif
-
 }
 
+/* GGX microfacet importance sampling from:
+ *
+ * Importance Sampling Microfacet-Based BSDFs using the Distribution of Visible Normals.
+ * E. Heitz and E. d'Eon, EGSR 2014
+ */
+
 ccl_device_inline void microfacet_ggx_sample_slopes(
 	const float cos_theta_i, const float sin_theta_i,
 	float randu, float randv, float *slope_x, float *slope_y,
@@ -300,7 +235,7 @@ ccl_device_inline float3 microfacet_sample_stretched(
 
 ccl_device int bsdf_microfacet_ggx_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
+	sc->data0 = saturate(sc->data0); /* alpha_x */
 	sc->data1 = sc->data0; /* alpha_y */
 	
 	sc->type = CLOSURE_BSDF_MICROFACET_GGX_ID;
@@ -310,8 +245,8 @@ ccl_device int bsdf_microfacet_ggx_setup(ShaderClosure *sc)
 
 ccl_device int bsdf_microfacet_ggx_aniso_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
-	sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* alpha_y */
+	sc->data0 = saturate(sc->data0); /* alpha_x */
+	sc->data1 = saturate(sc->data1); /* alpha_y */
 	
 	sc->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID;
 
@@ -320,7 +255,7 @@ ccl_device int bsdf_microfacet_ggx_aniso_setup(ShaderClosure *sc)
 
 ccl_device int bsdf_microfacet_ggx_refraction_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
+	sc->data0 = saturate(sc->data0); /* alpha_x */
 	sc->data1 = sc->data0; /* alpha_y */
 
 	sc->type = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
@@ -342,7 +277,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
 	float3 N = sc->N;
 
 	if(m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f)
-		return make_float3(0, 0, 0);
+		return make_float3(0.0f, 0.0f, 0.0f);
 
 	float cosNO = dot(N, I);
 	float cosNI = dot(N, omega_in);
@@ -421,7 +356,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
 		return make_float3(out, out, out);
 	}
 
-	return make_float3(0, 0, 0);
+	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
 ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
@@ -433,13 +368,13 @@ ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, con
 	float3 N = sc->N;
 
 	if(!m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f)
-		return make_float3(0, 0, 0);
+		return make_float3(0.0f, 0.0f, 0.0f);
 
 	float cosNO = dot(N, I);
 	float cosNI = dot(N, omega_in);
 
 	if(cosNO <= 0 || cosNI >= 0)
-		return make_float3(0, 0, 0); /* vectors on same side -- not possible */
+		return make_float3(0.0f, 0.0f, 0.0f); /* vectors on same side -- not possible */
 
 	/* compute half-vector of the refraction (eq. 16) */
 	float3 ht = -(m_eta * omega_in + I);
@@ -653,7 +588,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 
 ccl_device int bsdf_microfacet_beckmann_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
+	sc->data0 = saturate(sc->data0); /* alpha_x */
 	sc->data1 = sc->data0; /* alpha_y */
 
 	sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ID;
@@ -662,8 +597,8 @@ ccl_device int bsdf_microfacet_beckmann_setup(ShaderClosure *sc)
 
 ccl_device int bsdf_microfacet_beckmann_aniso_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
-	sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* alpha_y */
+	sc->data0 = saturate(sc->data0); /* alpha_x */
+	sc->data1 = saturate(sc->data1); /* alpha_y */
 
 	sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID;
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
@@ -671,7 +606,7 @@ ccl_device int bsdf_microfacet_beckmann_aniso_setup(ShaderClosure *sc)
 
 ccl_device int bsdf_microfacet_beckmann_refraction_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
+	sc->data0 = saturate(sc->data0); /* alpha_x */
 	sc->data1 = sc->data0; /* alpha_y */
 
 	sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
@@ -692,7 +627,7 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_reflect(const ShaderClosure *sc,
 	float3 N = sc->N;
 
 	if(m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f)
-		return make_float3(0, 0, 0);
+		return make_float3(0.0f, 0.0f, 0.0f);
 
 	float cosNO = dot(N, I);
 	float cosNI = dot(N, omega_in);
@@ -774,7 +709,7 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_reflect(const ShaderClosure *sc,
 		return make_float3(out, out, out);
 	}
 
-	return make_float3(0, 0, 0);
+	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
 ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
@@ -786,13 +721,13 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc
 	float3 N = sc->N;
 
 	if(!m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f)
-		return make_float3(0, 0, 0);
+		return make_float3(0.0f, 0.0f, 0.0f);
 
 	float cosNO = dot(N, I);
 	float cosNI = dot(N, omega_in);
 
 	if(cosNO <= 0 || cosNI >= 0)
-		return make_float3(0, 0, 0);
+		return make_float3(0.0f, 0.0f, 0.0f);
 
 	/* compute half-vector of the refraction (eq. 16) */
 	float3 ht = -(m_eta * omega_in + I);
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index c476d4ca4e2..61b7cb11b02 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -37,7 +37,7 @@ ccl_device int bsdf_oren_nayar_setup(ShaderClosure *sc)
 
 	sc->type = CLOSURE_BSDF_OREN_NAYAR_ID;
 
-	sigma = clamp(sigma, 0.0f, 1.0f);
+	sigma = saturate(sigma);
 
 	float div = 1.0f / (M_PI_F + ((3.0f * M_PI_F - 4.0f) / 6.0f) * sigma);
 
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index f9f263719e9..1ab15eee954 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -51,9 +51,9 @@ ccl_device float3 bsdf_phong_ramp_get_color(const ShaderClosure *sc, const float
 
 ccl_device int bsdf_phong_ramp_setup(ShaderClosure *sc)
 {
-	sc->data0 = max(sc->data0, 0.0f);
-	
 	sc->type = CLOSURE_BSDF_PHONG_RAMP_ID;
+	sc->data0 = max(sc->data0, 0.0f);
+	sc->data1 = 0.0f;
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index df03942638f..e5b6ab93a64 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -40,8 +40,8 @@ CCL_NAMESPACE_BEGIN
 ccl_device int bsdf_diffuse_toon_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_DIFFUSE_TOON_ID;
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f);
-	sc->data1 = clamp(sc->data1, 0.0f, 1.0f);
+	sc->data0 = saturate(sc->data0);
+	sc->data1 = saturate(sc->data1);
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
@@ -120,8 +120,8 @@ ccl_device int bsdf_diffuse_toon_sample(const ShaderClosure *sc, float3 Ng, floa
 ccl_device int bsdf_glossy_toon_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_GLOSSY_TOON_ID;
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f);
-	sc->data1 = clamp(sc->data1, 0.0f, 1.0f);
+	sc->data0 = saturate(sc->data0);
+	sc->data1 = saturate(sc->data1);
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index b6de2da8c71..f817dcd5f2d 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -30,8 +30,8 @@ ccl_device int bssrdf_setup(ShaderClosure *sc, ClosureType type)
 		return flag;
 	}
 	else {
-		sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* texture blur */
-		sc->T.x = clamp(sc->T.x, 0.0f, 1.0f); /* sharpness */
+		sc->data1 = saturate(sc->data1); /* texture blur */
+		sc->T.x = saturate(sc->T.x); /* sharpness */
 		sc->type = type;
 
 		return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF;
@@ -157,7 +157,7 @@ ccl_device float bssrdf_cubic_quintic_root_find(float xi)
 	float x = 0.25f;
 	int i;
 
-	for (i = 0; i < max_iteration_count; i++) {
+	for(i = 0; i < max_iteration_count; i++) {
 		float x2 = x*x;
 		float x3 = x2*x;
 		float nx = (1.0f - x);
@@ -168,7 +168,7 @@ ccl_device float bssrdf_cubic_quintic_root_find(float xi)
 		if(fabsf(f) < tolerance || f_ == 0.0f)
 			break;
 
-		x = clamp(x - f/f_, 0.0f, 1.0f);
+		x = saturate(x - f/f_);
 	}
 
 	return x;
diff --git a/intern/cycles/kernel/closure/volume.h b/intern/cycles/kernel/closure/volume.h
index 439610546e5..4d71ba50ec3 100644
--- a/intern/cycles/kernel/closure/volume.h
+++ b/intern/cycles/kernel/closure/volume.h
@@ -107,18 +107,9 @@ ccl_device int volume_absorption_setup(ShaderClosure *sc)
 
 ccl_device float3 volume_phase_eval(const ShaderData *sd, const ShaderClosure *sc, float3 omega_in, float *pdf)
 {
-	float3 eval;
+	kernel_assert(sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID);
 
-	switch(sc->type) {
-		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-			eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
-			break;
-		default:
-			eval = make_float3(0.0f, 0.0f, 0.0f);
-			break;
-	}
-
-	return eval;
+	return volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
 }
 
 ccl_device int volume_phase_sample(const ShaderData *sd, const ShaderClosure *sc, float randu,
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index bf0d86e6206..5ab900d47aa 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -22,7 +22,9 @@
 #define BVH_STACK_SIZE 192
 #define BVH_QSTACK_SIZE 384
 #define BVH_NODE_SIZE 4
+#define BVH_NODE_LEAF_SIZE 1
 #define BVH_QNODE_SIZE 7
+#define BVH_QNODE_LEAF_SIZE 1
 #define TRI_NODE_SIZE 3
 
 /* silly workaround for float extended precision that happens when compiling
diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h
index 9ac16e86085..c7364e9edac 100644
--- a/intern/cycles/kernel/geom/geom_attribute.h
+++ b/intern/cycles/kernel/geom/geom_attribute.h
@@ -29,13 +29,13 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeElement *elem)
 {
-	if(sd->object == PRIM_NONE)
+	if(ccl_fetch(sd, object) == PRIM_NONE)
 		return (int)ATTR_STD_NOT_FOUND;
 
 	/* for SVM, find attribute by unique id */
-	uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
+	uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride;
 #ifdef __HAIR__
-	attr_offset = (sd->type & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
+	attr_offset = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
 #endif
 	uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 	
@@ -49,7 +49,7 @@ ccl_device_inline int find_attribute(KernelGlobals *kg, const ShaderData *sd, ui
 
 	*elem = (AttributeElement)attr_map.y;
 	
-	if(sd->prim == PRIM_NONE && (AttributeElement)attr_map.y != ATTR_ELEMENT_MESH)
+	if(ccl_fetch(sd, prim) == PRIM_NONE && (AttributeElement)attr_map.y != ATTR_ELEMENT_MESH)
 		return ATTR_STD_NOT_FOUND;
 
 	/* return result */
diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/geom/geom_bvh.h
index c0eefcd9c7f..3d0d406dd0b 100644
--- a/intern/cycles/kernel/geom/geom_bvh.h
+++ b/intern/cycles/kernel/geom/geom_bvh.h
@@ -115,7 +115,39 @@ CCL_NAMESPACE_BEGIN
 #include "geom_bvh_subsurface.h"
 #endif
 
-/* Record all BVH intersection for shadows */
+/* Volume BVH traversal */
+
+#if defined(__VOLUME__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume
+#define BVH_FUNCTION_FEATURES 0
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__INSTANCING__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__HAIR__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_hair
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_hair_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
+#include "geom_bvh_volume.h"
+#endif
+
+/* Record all intersections - Shadow BVH traversal */
 
 #if defined(__SHADOW_RECORD_ALL__)
 #define BVH_FUNCTION_NAME bvh_intersect_shadow_all
@@ -147,36 +179,36 @@ CCL_NAMESPACE_BEGIN
 #include "geom_bvh_shadow.h"
 #endif
 
-/* Camera inside Volume BVH intersection */
+/* Record all intersections - Volume BVH traversal  */
 
-#if defined(__VOLUME__)
-#define BVH_FUNCTION_NAME bvh_intersect_volume
+#if defined(__VOLUME_RECORD_ALL__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_all
 #define BVH_FUNCTION_FEATURES 0
-#include "geom_bvh_volume.h"
+#include "geom_bvh_volume_all.h"
 #endif
 
-#if defined(__VOLUME__) && defined(__INSTANCING__)
-#define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
+#if defined(__VOLUME_RECORD_ALL__) && defined(__INSTANCING__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing
 #define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#include "geom_bvh_volume.h"
+#include "geom_bvh_volume_all.h"
 #endif
 
-#if defined(__VOLUME__) && defined(__HAIR__)
-#define BVH_FUNCTION_NAME bvh_intersect_volume_hair
+#if defined(__VOLUME_RECORD_ALL__) && defined(__HAIR__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_all_hair
 #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
-#include "geom_bvh_volume.h"
+#include "geom_bvh_volume_all.h"
 #endif
 
-#if defined(__VOLUME__) && defined(__OBJECT_MOTION__)
-#define BVH_FUNCTION_NAME bvh_intersect_volume_motion
+#if defined(__VOLUME_RECORD_ALL__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
 #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
-#include "geom_bvh_volume.h"
+#include "geom_bvh_volume_all.h"
 #endif
 
-#if defined(__VOLUME__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
-#define BVH_FUNCTION_NAME bvh_intersect_volume_hair_motion
+#if defined(__VOLUME_RECORD_ALL__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_all_hair_motion
 #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
-#include "geom_bvh_volume.h"
+#include "geom_bvh_volume_all.h"
 #endif
 
 #undef BVH_FEATURE
@@ -330,6 +362,37 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
 }
 #endif
 
+#ifdef __VOLUME_RECORD_ALL__
+ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg,
+                                                     const Ray *ray,
+                                                     Intersection *isect,
+                                                     const uint max_hits)
+{
+#ifdef __OBJECT_MOTION__
+	if(kernel_data.bvh.have_motion) {
+#ifdef __HAIR__
+		if(kernel_data.bvh.have_curves)
+			return bvh_intersect_volume_all_hair_motion(kg, ray, isect, max_hits);
+#endif /* __HAIR__ */
+
+		return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits);
+	}
+#endif /* __OBJECT_MOTION__ */
+
+#ifdef __HAIR__
+	if(kernel_data.bvh.have_curves)
+		return bvh_intersect_volume_all_hair(kg, ray, isect, max_hits);
+#endif /* __HAIR__ */
+
+#ifdef __INSTANCING__
+	if(kernel_data.bvh.have_instancing)
+		return bvh_intersect_volume_all_instancing(kg, ray, isect, max_hits);
+#endif /* __INSTANCING__ */
+
+	return bvh_intersect_volume_all(kg, ray, isect, max_hits);
+}
+#endif
+
 
 /* Ray offset to avoid self intersection.
  *
@@ -384,5 +447,21 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
 #endif
 }
 
+#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__)
+/* ToDo: Move to another file? */
+ccl_device int intersections_compare(const void *a, const void *b)
+{
+	const Intersection *isect_a = (const Intersection*)a;
+	const Intersection *isect_b = (const Intersection*)b;
+
+	if(isect_a->t < isect_b->t)
+		return -1;
+	else if(isect_a->t > isect_b->t)
+		return 1;
+	else
+		return 0;
+}
+#endif
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/geom/geom_bvh_shadow.h
index 193f49074a3..e4cba99dc96 100644
--- a/intern/cycles/kernel/geom/geom_bvh_shadow.h
+++ b/intern/cycles/kernel/geom/geom_bvh_shadow.h
@@ -200,7 +200,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
 			/* if node is leaf, fetch triangle list */
 			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+3);
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE);
 				int primAddr = __float_as_int(leaf.x);
 
 #if BVH_FEATURE(BVH_INSTANCING)
@@ -226,7 +226,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
 						switch(p_type) {
 							case PRIMITIVE_TRIANGLE: {
-								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr);
+								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, PATH_RAY_SHADOW, object, primAddr);
 								break;
 							}
 #if BVH_FEATURE(BVH_MOTION)
@@ -264,7 +264,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 							if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
 #endif
 							{
-								shader =  kernel_tex_fetch(__tri_shader, prim);
+								shader = kernel_tex_fetch(__tri_shader, prim);
 							}
 #ifdef __HAIR__
 							else {
diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
index 290297ef5c5..a73139f9c88 100644
--- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h
+++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
@@ -187,7 +187,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
 			/* if node is leaf, fetch triangle list */
 			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+3);
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE);
 				int primAddr = __float_as_int(leaf.x);
 
 #if BVH_FEATURE(BVH_INSTANCING)
@@ -210,7 +210,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
 								if(tri_object != subsurface_object)
 									continue;
-								triangle_intersect_subsurface(kg, &isect_precalc, isect_array, P, dir, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
+								triangle_intersect_subsurface(kg, &isect_precalc, isect_array, P, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
 							}
 							break;
 						}
diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h
index 0298e687de2..73d79fd78ee 100644
--- a/intern/cycles/kernel/geom/geom_bvh_traversal.h
+++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h
@@ -76,6 +76,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
 #if defined(__KERNEL_DEBUG__)
 	isect->num_traversal_steps = 0;
+	isect->num_traversed_instances = 0;
 #endif
 
 #if defined(__KERNEL_SSE2__)
@@ -248,7 +249,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
 			/* if node is leaf, fetch triangle list */
 			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+3);
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE);
 				int primAddr = __float_as_int(leaf.x);
 
 #if BVH_FEATURE(BVH_INSTANCING)
@@ -269,7 +270,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								isect->num_traversal_steps++;
 #endif
 								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-								if(triangle_intersect(kg, &isect_precalc, isect, P, dir, visibility, object, primAddr)) {
+								if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) {
 									/* shadow ray early termination */
 #if defined(__KERNEL_SSE2__)
 									if(visibility == PATH_RAY_SHADOW_OPAQUE)
@@ -362,6 +363,10 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
 
 					nodeAddr = kernel_tex_fetch(__object_node, object);
+
+#if defined(__KERNEL_DEBUG__)
+					isect->num_traversed_instances++;
+#endif
 				}
 			}
 #endif  /* FEATURE(BVH_INSTANCING) */
diff --git a/intern/cycles/kernel/geom/geom_bvh_volume.h b/intern/cycles/kernel/geom/geom_bvh_volume.h
index 0862812a170..41c784869f2 100644
--- a/intern/cycles/kernel/geom/geom_bvh_volume.h
+++ b/intern/cycles/kernel/geom/geom_bvh_volume.h
@@ -188,7 +188,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
 			/* if node is leaf, fetch triangle list */
 			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+3);
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE);
 				int primAddr = __float_as_int(leaf.x);
 
 #if BVH_FEATURE(BVH_INSTANCING)
@@ -213,7 +213,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
 									continue;
 								}
-								triangle_intersect(kg, &isect_precalc, isect, P, dir, visibility, object, primAddr);
+								triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr);
 							}
 							break;
 						}
diff --git a/intern/cycles/kernel/geom/geom_bvh_volume_all.h b/intern/cycles/kernel/geom/geom_bvh_volume_all.h
new file mode 100644
index 00000000000..b6db36f4b17
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_bvh_volume_all.h
@@ -0,0 +1,454 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __QBVH__
+#include "geom_qbvh_volume_all.h"
+#endif
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                            const Ray *ray,
+                                            Intersection *isect_array,
+                                            const uint max_hits)
+{
+	/* todo:
+	 * - test if pushing distance on the stack helps (for non shadow rays)
+	 * - separate version for shadow rays
+	 * - likely and unlikely for if() statements
+	 * - test restrict attribute for pointers
+	 */
+
+	/* traversal stack in CUDA thread-local memory */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL;
+
+	/* traversal variables in registers */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* ray parameters in registers */
+	const float tmax = ray->t;
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = tmax;
+
+	const uint visibility = PATH_RAY_ALL_VISIBILITY;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+	int num_hits_in_instance = 0;
+#endif
+
+	uint num_hits = 0;
+	isect_array->t = tmax;
+
+#if defined(__KERNEL_SSE2__)
+	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+	
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+	ssef Psplat[3], idirsplat[3];
+	shuffle_swap_t shufflexyz[3];
+
+	Psplat[0] = ssef(P.x);
+	Psplat[1] = ssef(P.y);
+	Psplat[2] = ssef(P.z);
+
+	ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
+
+	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* traversal loop */
+	do {
+		do {
+			/* traverse internal nodes */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				bool traverseChild0, traverseChild1;
+				int nodeAddrChild1;
+
+#if !defined(__KERNEL_SSE2__)
+				/* Intersect two child bounding boxes, non-SSE version */
+				float t = isect_array->t;
+
+				/* fetch node data */
+				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
+				float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
+				float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
+
+				/* intersect ray against child nodes */
+				NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+				NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+				/* decide which nodes to traverse next */
+				traverseChild0 = (c0max >= c0min);
+				traverseChild1 = (c1max >= c1min);
+
+#else // __KERNEL_SSE2__
+				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+
+				/* fetch node data */
+				const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+				const float4 cnodes = ((float4*)bvh_nodes)[3];
+
+				/* intersect ray against child nodes */
+				const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+				const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+				const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+
+				/* calculate { c0min, c1min, -c0max, -c1max} */
+				ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+				const ssef tminmax = minmax ^ pn;
+
+				const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
+
+				/* decide which nodes to traverse next */
+				traverseChild0 = (movemask(lrhit) & 1);
+				traverseChild1 = (movemask(lrhit) & 2);
+#endif // __KERNEL_SSE2__
+
+				nodeAddr = __float_as_int(cnodes.x);
+				nodeAddrChild1 = __float_as_int(cnodes.y);
+
+				if(traverseChild0 && traverseChild1) {
+					/* both children were intersected, push the farther one */
+#if !defined(__KERNEL_SSE2__)
+					bool closestChild1 = (c1min < c0min);
+#else
+					bool closestChild1 = tminmax[1] < tminmax[0];
+#endif
+
+					if(closestChild1) {
+						int tmp = nodeAddr;
+						nodeAddr = nodeAddrChild1;
+						nodeAddrChild1 = tmp;
+					}
+
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
+					traversalStack[stackPtr] = nodeAddrChild1;
+				}
+				else {
+					/* one child was intersected */
+					if(traverseChild1) {
+						nodeAddr = nodeAddrChild1;
+					}
+					else if(!traverseChild0) {
+						/* neither child was intersected */
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+
+			/* if node is leaf, fetch triangle list */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE);
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					const int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+					bool hit;
+
+					/* pop */
+					nodeAddr = traversalStack[stackPtr];
+					--stackPtr;
+
+					/* primitive intersection */
+					switch(type & PRIMITIVE_ALL) {
+						case PRIMITIVE_TRIANGLE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from volume object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, primAddr);
+								if(hit) {
+									/* Move on to next entry in intersections array. */
+									isect_array++;
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									isect_array->t = isect_t;
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_MOTION)
+										float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir));
+#else
+										Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+										float t_fac = len(transform_direction(&tfm, 1.0f/idir));
+#endif
+										for(int i = 0; i < num_hits_in_instance; i++) {
+											(isect_array-i-1)->t *= t_fac;
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+								}
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from volume object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, primAddr);
+								if(hit) {
+									/* Move on to next entry in intersections array. */
+									isect_array++;
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									isect_array->t = isect_t;
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+#  if BVH_FEATURE(BVH_MOTION)
+										float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir));
+#  else
+										Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+										float t_fac = len(transform_direction(&tfm, 1.0f/idir));
+#endif
+										for(int i = 0; i < num_hits_in_instance; i++) {
+											(isect_array-i-1)->t *= t_fac;
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+								}
+							}
+							break;
+						}
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+						case PRIMITIVE_CURVE:
+						case PRIMITIVE_MOTION_CURVE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from volume object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
+									hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								else
+									hit = bvh_curve_intersect(kg, isect_array, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								if(hit) {
+									/* Move on to next entry in intersections array. */
+									isect_array++;
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									isect_array->t = isect_t;
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+#  if BVH_FEATURE(BVH_MOTION)
+										float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir));
+#  else
+										Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+										float t_fac = len(transform_direction(&tfm, 1.0f/idir));
+#endif
+										for(int i = 0; i < num_hits_in_instance; i++) {
+											(isect_array-i-1)->t *= t_fac;
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+								}
+							}
+							break;
+						}
+#endif
+						default: {
+							break;
+						}
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* instance push */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+					int object_flag = kernel_tex_fetch(__object_flag, object);
+
+					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
+#if BVH_FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
+#else
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+#endif
+
+						triangle_intersect_precalc(dir, &isect_precalc);
+						num_hits_in_instance = 0;
+						isect_array->t = isect_t;
+
+#if defined(__KERNEL_SSE2__)
+						Psplat[0] = ssef(P.x);
+						Psplat[1] = ssef(P.y);
+						Psplat[2] = ssef(P.z);
+
+						tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+
+						gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_STACK_SIZE);
+						traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* pop */
+						object = OBJECT_NONE;
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			if(num_hits_in_instance) {
+				float t_fac;
+#if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_tfm);
+#else
+				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+#endif
+				triangle_intersect_precalc(dir, &isect_precalc);
+				/* Scale isect->t to adjust for instancing. */
+				for(int i = 0; i < num_hits_in_instance; i++) {
+					(isect_array-i-1)->t *= t_fac;
+				}
+			}
+			else {
+				float ignore_t = FLT_MAX;
+#if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_tfm);
+#else
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+#endif
+				triangle_intersect_precalc(dir, &isect_precalc);
+			}
+
+			isect_t = tmax;
+			isect_array->t = isect_t;
+
+#if defined(__KERNEL_SSE2__)
+			Psplat[0] = ssef(P.x);
+			Psplat[1] = ssef(P.y);
+			Psplat[2] = ssef(P.z);
+
+			tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+
+			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr];
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_MOTION) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return num_hits;
+}
+
+ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
+                                         const Ray *ray,
+                                         Intersection *isect_array,
+                                         const uint max_hits)
+{
+#ifdef __QBVH__
+	if(kernel_data.bvh.use_qbvh) {
+		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+		                                    ray,
+		                                    isect_array,
+		                                    max_hits);
+	}
+	else
+#endif
+	{
+		kernel_assert(kernel_data.bvh.use_qbvh == false);
+		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+		                                   ray,
+		                                   isect_array,
+		                                   max_hits);
+	}
+}
+
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index ac6c6ec4929..9653ad8f1bb 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -32,22 +32,22 @@ ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 		if(dy) *dy = 0.0f;
 #endif
 
-		return kernel_tex_fetch(__attributes_float, offset + sd->prim);
+		return kernel_tex_fetch(__attributes_float, offset + ccl_fetch(sd, prim));
 	}
 	else if(elem == ATTR_ELEMENT_CURVE_KEY || elem == ATTR_ELEMENT_CURVE_KEY_MOTION) {
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 		int k1 = k0 + 1;
 
 		float f0 = kernel_tex_fetch(__attributes_float, offset + k0);
 		float f1 = kernel_tex_fetch(__attributes_float, offset + k1);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*(f1 - f0);
+		if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
 		if(dy) *dy = 0.0f;
 #endif
 
-		return (1.0f - sd->u)*f0 + sd->u*f1;
+		return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1;
 	}
 	else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -71,22 +71,22 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + ccl_fetch(sd, prim)));
 	}
 	else if(elem == ATTR_ELEMENT_CURVE_KEY || elem == ATTR_ELEMENT_CURVE_KEY_MOTION) {
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 		int k1 = k0 + 1;
 
 		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + k0));
 		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + k1));
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*(f1 - f0);
+		if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
-		return (1.0f - sd->u)*f0 + sd->u*f1;
+		return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1;
 	}
 	else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -104,22 +104,22 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
 {
 	float r = 0.0f;
 
-	if(sd->type & PRIMITIVE_ALL_CURVE) {
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 		int k1 = k0 + 1;
 
 		float4 P_curve[2];
 
-		if(sd->type & PRIMITIVE_CURVE) {
+		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
 			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 		}
 		else {
-			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
+			motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve);
 		}
 
-		r = (P_curve[1].w - P_curve[0].w) * sd->u + P_curve[0].w;
+		r = (P_curve[1].w - P_curve[0].w) * ccl_fetch(sd, u) + P_curve[0].w;
 	}
 
 	return r*2.0f;
@@ -130,8 +130,8 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
 
 ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd)
 {
-	float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-	int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+	float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+	int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 	int k1 = k0 + 1;
 
 	float4 P_curve[2];
@@ -139,7 +139,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd
 	P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 	P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 
-	return float4_to_float3(P_curve[1]) * sd->u + float4_to_float3(P_curve[0]) * (1.0f - sd->u);
+	return float4_to_float3(P_curve[1]) * ccl_fetch(sd, u) + float4_to_float3(P_curve[0]) * (1.0f - ccl_fetch(sd, u));
 }
 
 /* Curve tangent normal */
@@ -148,14 +148,14 @@ ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
 {	
 	float3 tgN = make_float3(0.0f,0.0f,0.0f);
 
-	if(sd->type & PRIMITIVE_ALL_CURVE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 
-		tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu)));
+		tgN = -(-ccl_fetch(sd, I) - ccl_fetch(sd, dPdu) * (dot(ccl_fetch(sd, dPdu),-ccl_fetch(sd, I)) / len_squared(ccl_fetch(sd, dPdu))));
 		tgN = normalize(tgN);
 
 		/* need to find suitable scaled gd for corrected normal */
 #if 0
-		tgN = normalize(tgN - gd * sd->dPdu);
+		tgN = normalize(tgN - gd * ccl_fetch(sd, dPdu));
 #endif
 	}
 
@@ -442,12 +442,12 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 		float r_ext = mw_extension + r_curr;
 		float coverage = 1.0f;
 
-		if (bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
+		if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
 			/* the bounding box does not overlap the square centered at O */
 			tree += level;
 			level = tree & -tree;
 		}
-		else if (level == 1) {
+		else if(level == 1) {
 
 			/* the maximum recursion depth is reached.
 			* check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0.
@@ -459,13 +459,13 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 			if(flags & CURVE_KN_RIBBONS) {
 				float3 tg = (p_en - p_st);
 				float w = tg.x * tg.x + tg.y * tg.y;
-				if (w == 0) {
+				if(w == 0) {
 					tree++;
 					level = tree & -tree;
 					continue;
 				}
 				w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
-				w = clamp((float)w, 0.0f, 1.0f);
+				w = saturate(w);
 
 				/* compute u on the curve segment */
 				u = i_st * (1 - w) + i_en * w;
@@ -474,17 +474,17 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 				float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0];
 
 				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-				if (dot(tg, dp_st)< 0)
+				if(dot(tg, dp_st)< 0)
 					dp_st *= -1;
-				if (dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
+				if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
 					tree++;
 					level = tree & -tree;
 					continue;
 				}
 				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-				if (dot(tg, dp_en) < 0)
+				if(dot(tg, dp_en) < 0)
 					dp_en *= -1;
-				if (dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
+				if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
 					tree++;
 					level = tree & -tree;
 					continue;
@@ -500,13 +500,13 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 					float d0 = d - r_curr;
 					float d1 = d + r_curr;
 					float inv_mw_extension = 1.0f/mw_extension;
-					if (d0 >= 0)
+					if(d0 >= 0)
 						coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f;
 					else // inside
 						coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f;
 				}
 				
-				if (p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
+				if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
 					tree++;
 					level = tree & -tree;
 					continue;
@@ -548,7 +548,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 				float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1)));
 				float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd;
 				float td = tb*tb - 4*cyla*tc;
-				if (td < 0.0f) {
+				if(td < 0.0f) {
 					tree++;
 					level = tree & -tree;
 					continue;
@@ -559,10 +559,10 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 				t = tcentre + correction;
 
 				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-				if (dot(tg, dp_st)< 0)
+				if(dot(tg, dp_st)< 0)
 					dp_st *= -1;
 				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-				if (dot(tg, dp_en) < 0)
+				if(dot(tg, dp_en) < 0)
 					dp_en *= -1;
 
 				if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) {
@@ -570,14 +570,14 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 					t = tcentre + correction;
 				}			
 
-				if (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
+				if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
 					tree++;
 					level = tree & -tree;
 					continue;
 				}
 
 				float w = (zcentre + (tg.z * correction)) * invl;
-				w = clamp((float)w, 0.0f, 1.0f);
+				w = saturate(w);
 				/* compute u on the curve segment */
 				u = i_st * (1 - w) + i_en * w;
 
@@ -777,7 +777,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
 	float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma;
 	float td = tb*tb - 4*a*tc;
 
-	if (td < 0.0f)
+	if(td < 0.0f)
 		return false;
 
 	float rootd = 0.0f;
@@ -818,7 +818,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
 
 		if(t > 0.0f && t < isect->t && z >= 0 && z <= l) {
 
-			if (flags & CURVE_KN_ENCLOSEFILTER) {
+			if(flags & CURVE_KN_ENCLOSEFILTER) {
 				float enc_ratio = 1.01f;
 				if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) {
 					float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
@@ -890,7 +890,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
+		Transform tfm = ccl_fetch(sd, ob_itfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
 #endif
@@ -903,7 +903,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 	int prim = kernel_tex_fetch(__prim_index, isect->prim);
 	float4 v00 = kernel_tex_fetch(__curves, prim);
 
-	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 	int k1 = k0 + 1;
 
 	float3 tg;
@@ -914,14 +914,14 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 
 		float4 P_curve[4];
 
-		if(sd->type & PRIMITIVE_CURVE) {
+		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
 			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
 			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
 			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
 			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
 		}
 		else {
-			motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
+			motion_cardinal_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), ka, k0, k1, kb, P_curve);
 		}
 
 		float3 p[4];
@@ -933,43 +933,43 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 		P = P + D*t;
 
 #ifdef __UV__
-		sd->u = isect->u;
-		sd->v = 0.0f;
+		ccl_fetch(sd, u) = isect->u;
+		ccl_fetch(sd, v) = 0.0f;
 #endif
 
 		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
 
 		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
-			sd->Ng = normalize(-(D - tg * (dot(tg, D))));
+			ccl_fetch(sd, Ng) = normalize(-(D - tg * (dot(tg, D))));
 		}
 		else {
 			/* direction from inside to surface of curve */
 			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);	
-			sd->Ng = normalize(P - p_curr);
+			ccl_fetch(sd, Ng) = normalize(P - p_curr);
 
 			/* adjustment for changing radius */
 			float gd = isect->v;
 
 			if(gd != 0.0f) {
-				sd->Ng = sd->Ng - gd * tg;
-				sd->Ng = normalize(sd->Ng);
+				ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg;
+				ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
 			}
 		}
 
 		/* todo: sometimes the normal is still so that this is detected as
 		 * backfacing even if cull backfaces is enabled */
 
-		sd->N = sd->Ng;
+		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
 	}
 	else {
 		float4 P_curve[2];
 
-		if(sd->type & PRIMITIVE_CURVE) {
+		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
 			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 		}
 		else {
-			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
+			motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve);
 		}
 
 		float l = 1.0f;
@@ -980,39 +980,39 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 		float3 dif = P - float4_to_float3(P_curve[0]);
 
 #ifdef __UV__
-		sd->u = dot(dif,tg)/l;
-		sd->v = 0.0f;
+		ccl_fetch(sd, u) = dot(dif,tg)/l;
+		ccl_fetch(sd, v) = 0.0f;
 #endif
 
-		if (flag & CURVE_KN_TRUETANGENTGNORMAL) {
-			sd->Ng = -(D - tg * dot(tg, D));
-			sd->Ng = normalize(sd->Ng);
+		if(flag & CURVE_KN_TRUETANGENTGNORMAL) {
+			ccl_fetch(sd, Ng) = -(D - tg * dot(tg, D));
+			ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
 		}
 		else {
 			float gd = isect->v;
 
 			/* direction from inside to surface of curve */
-			sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd);
+			ccl_fetch(sd, Ng) = (dif - tg * ccl_fetch(sd, u) * l) / (P_curve[0].w + ccl_fetch(sd, u) * l * gd);
 
 			/* adjustment for changing radius */
-			if (gd != 0.0f) {
-				sd->Ng = sd->Ng - gd * tg;
-				sd->Ng = normalize(sd->Ng);
+			if(gd != 0.0f) {
+				ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg;
+				ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
 			}
 		}
 
-		sd->N = sd->Ng;
+		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
 	}
 
 #ifdef __DPDU__
 	/* dPdu/dPdv */
-	sd->dPdu = tg;
-	sd->dPdv = cross(tg, sd->Ng);
+	ccl_fetch(sd, dPdu) = tg;
+	ccl_fetch(sd, dPdv) = cross(tg, ccl_fetch(sd, Ng));
 #endif
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
+		Transform tfm = ccl_fetch(sd, ob_tfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
 #endif
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h
index d3297e05c67..86f93f242a1 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -134,7 +134,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, ShaderData *s
 			return P;
 		}
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
+		Transform tfm = ccl_fetch(sd, ob_itfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
 #endif
@@ -161,7 +161,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, ShaderData *s
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
+		Transform tfm = ccl_fetch(sd, ob_tfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
 #endif
@@ -187,7 +187,7 @@ ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, Sh
 #ifdef __INTERSECTION_REFINE__
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
+		Transform tfm = ccl_fetch(sd, ob_itfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
 #endif
@@ -213,7 +213,7 @@ ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, Sh
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
+		Transform tfm = ccl_fetch(sd, ob_tfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
 #endif
@@ -236,25 +236,25 @@ ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, Sh
 ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool subsurface)
 {
 	/* get shader */
-	sd->shader =  kernel_tex_fetch(__tri_shader, sd->prim);
+	ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
 
 	/* get motion info */
 	int numsteps, numverts;
-	object_motion_info(kg, sd->object, &numsteps, &numverts, NULL);
+	object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL);
 
 	/* figure out which steps we need to fetch and their interpolation factor */
 	int maxstep = numsteps*2;
-	int step = min((int)(sd->time*maxstep), maxstep-1);
-	float t = sd->time*maxstep - step;
+	int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1);
+	float t = ccl_fetch(sd, time)*maxstep - step;
 
 	/* find attribute */
 	AttributeElement elem;
-	int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_POSITION, &elem);
+	int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_POSITION, &elem);
 	kernel_assert(offset != ATTR_STD_NOT_FOUND);
 
 	/* fetch vertex coordinates */
 	float3 verts[3], next_verts[3];
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)));
 
 	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
 	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts);
@@ -268,33 +268,33 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD
 #ifdef __SUBSURFACE__
 	if(!subsurface)
 #endif
-		sd->P = motion_triangle_refine(kg, sd, isect, ray, verts);
+		ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts);
 #ifdef __SUBSURFACE__
 	else
-		sd->P = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts);
+		ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts);
 #endif
 
 	/* compute face normal */
 	float3 Ng;
-	if(sd->flag & SD_NEGATIVE_SCALE_APPLIED)
+	if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED)
 		Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0]));
 	else
 		Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
 
-	sd->Ng = Ng;
-	sd->N = Ng;
+	ccl_fetch(sd, Ng) = Ng;
+	ccl_fetch(sd, N) = Ng;
 
 	/* compute derivatives of P w.r.t. uv */
 #ifdef __DPDU__
-	sd->dPdu = (verts[0] - verts[2]);
-	sd->dPdv = (verts[1] - verts[2]);
+	ccl_fetch(sd, dPdu) = (verts[0] - verts[2]);
+	ccl_fetch(sd, dPdv) = (verts[1] - verts[2]);
 #endif
 
 	/* compute smooth normal */
-	if(sd->shader & SHADER_SMOOTH_NORMAL) {
+	if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
 		/* find attribute */
 		AttributeElement elem;
-		int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_NORMAL, &elem);
+		int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_NORMAL, &elem);
 		kernel_assert(offset != ATTR_STD_NOT_FOUND);
 
 		/* fetch vertex coordinates */
@@ -308,10 +308,10 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD
 		normals[2] = (1.0f - t)*normals[2] + t*next_normals[2];
 
 		/* interpolate between vertices */
-		float u = sd->u;
-		float v = sd->v;
+		float u = ccl_fetch(sd, u);
+		float v = ccl_fetch(sd, v);
 		float w = 1.0f - u - v;
-		sd->N = (u*normals[0] + v*normals[1] + w*normals[2]);
+		ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]);
 	}
 }
 
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index 79a56683454..9d0a008fff1 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -123,9 +123,9 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg
 ccl_device_inline void object_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
-	*P = transform_point(&sd->ob_tfm, *P);
+	*P = transform_point_auto(&ccl_fetch(sd, ob_tfm), *P);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
 	*P = transform_point(&tfm, *P);
 #endif
 }
@@ -135,9 +135,9 @@ ccl_device_inline void object_position_transform(KernelGlobals *kg, const Shader
 ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
-	*P = transform_point(&sd->ob_itfm, *P);
+	*P = transform_point_auto(&ccl_fetch(sd, ob_itfm), *P);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
 	*P = transform_point(&tfm, *P);
 #endif
 }
@@ -147,9 +147,9 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, cons
 ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
-	*N = normalize(transform_direction_transposed(&sd->ob_tfm, *N));
+	*N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_tfm), *N));
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
 	*N = normalize(transform_direction_transposed(&tfm, *N));
 #endif
 }
@@ -159,9 +159,9 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const
 ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
-	*N = normalize(transform_direction_transposed(&sd->ob_itfm, *N));
+	*N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_itfm), *N));
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
 	*N = normalize(transform_direction_transposed(&tfm, *N));
 #endif
 }
@@ -171,9 +171,9 @@ ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderDa
 ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
-	*D = transform_direction(&sd->ob_tfm, *D);
+	*D = transform_direction_auto(&ccl_fetch(sd, ob_tfm), *D);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
 	*D = transform_direction(&tfm, *D);
 #endif
 }
@@ -183,9 +183,9 @@ ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData
 ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
-	*D = transform_direction(&sd->ob_itfm, *D);
+	*D = transform_direction_auto(&ccl_fetch(sd, ob_itfm), *D);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
 	*D = transform_direction(&tfm, *D);
 #endif
 }
@@ -194,13 +194,13 @@ ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const Sha
 
 ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd)
 {
-	if(sd->object == OBJECT_NONE)
+	if(ccl_fetch(sd, object) == OBJECT_NONE)
 		return make_float3(0.0f, 0.0f, 0.0f);
 
 #ifdef __OBJECT_MOTION__
-	return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w);
+	return make_float3(ccl_fetch(sd, ob_tfm).x.w, ccl_fetch(sd, ob_tfm).y.w, ccl_fetch(sd, ob_tfm).z.w);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
 	return make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
 #endif
 }
@@ -243,7 +243,7 @@ ccl_device_inline float object_random_number(KernelGlobals *kg, int object)
 ccl_device_inline int object_particle_id(KernelGlobals *kg, int object)
 {
 	if(object == OBJECT_NONE)
-		return 0.0f;
+		return 0;
 
 	int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES;
 	float4 f = kernel_tex_fetch(__objects, offset);
@@ -296,7 +296,7 @@ ccl_device_inline void object_motion_info(KernelGlobals *kg, int object, int *nu
 
 ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
 {
-	return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2 + 1);
+	return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2 + 1);
 }
 
 /* Particle data from which object was instanced */
@@ -377,7 +377,7 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir)
 
 /* Transform ray into object space to enter static object in BVH */
 
-ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
+ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
 {
 	Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
 
@@ -425,7 +425,7 @@ ccl_device_inline void qbvh_instance_push(KernelGlobals *kg,
 
 /* Transorm ray to exit static object in BVH */
 
-ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
+ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
 {
 	if(*t != FLT_MAX) {
 		Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
@@ -453,7 +453,7 @@ ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg, int object, co
 #ifdef __OBJECT_MOTION__
 /* Transform ray into object space to enter motion blurred object in BVH */
 
-ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm)
+ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t, Transform *tfm)
 {
 	Transform itfm;
 	*tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm);
@@ -497,7 +497,7 @@ ccl_device_inline void qbvh_instance_motion_push(KernelGlobals *kg, int object,
 
 /* Transorm ray to exit motion blurred object in BVH */
 
-ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm)
+ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t, Transform *tfm)
 {
 	if(*t != FLT_MAX)
 		*t *= len(transform_direction(tfm, 1.0f/(*idir)));
@@ -520,5 +520,38 @@ ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg, int obj
 
 #endif
 
+/* TODO(sergey): This is only for until we've got OpenCL 2.0
+ * on all devices we consider supported. It'll be replaced with
+ * generic address space.
+ */
+
+#ifdef __KERNEL_OPENCL__
+ccl_device_inline void object_dir_transform_addrspace(KernelGlobals *kg,
+                                                      const ShaderData *sd,
+                                                      ccl_addr_space float3 *D)
+{
+	float3 private_D = *D;
+	object_dir_transform(kg, sd, &private_D);
+	*D = private_D;
+}
+
+ccl_device_inline void object_normal_transform_addrspace(KernelGlobals *kg,
+                                                         const ShaderData *sd,
+                                                         ccl_addr_space float3 *N)
+{
+	float3 private_N = *N;
+	object_normal_transform(kg, sd, &private_N);
+	*N = private_N;
+}
+#endif
+
+#ifndef __KERNEL_OPENCL__
+#  define object_dir_transform_auto object_dir_transform
+#  define object_normal_transform_auto object_normal_transform
+#else
+#  define object_dir_transform_auto object_dir_transform_addrspace
+#  define object_normal_transform_auto object_normal_transform_addrspace
+#endif
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index b52ec7ef1b2..30f12d32355 100644
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -25,16 +25,16 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
 {
-	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
 		return triangle_attribute_float(kg, sd, elem, offset, dx, dy);
 	}
 #ifdef __HAIR__
-	else if(sd->type & PRIMITIVE_ALL_CURVE) {
+	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 		return curve_attribute_float(kg, sd, elem, offset, dx, dy);
 	}
 #endif
 #ifdef __VOLUME__
-	else if(sd->object != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
+	else if(ccl_fetch(sd, object) != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
 		return volume_attribute_float(kg, sd, elem, offset, dx, dy);
 	}
 #endif
@@ -47,16 +47,16 @@ ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *
 
 ccl_device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
 {
-	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
 		return triangle_attribute_float3(kg, sd, elem, offset, dx, dy);
 	}
 #ifdef __HAIR__
-	else if(sd->type & PRIMITIVE_ALL_CURVE) {
+	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 		return curve_attribute_float3(kg, sd, elem, offset, dx, dy);
 	}
 #endif
 #ifdef __VOLUME__
-	else if(sd->object != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
+	else if(ccl_fetch(sd, object) != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
 		return volume_attribute_float3(kg, sd, elem, offset, dx, dy);
 	}
 #endif
@@ -108,9 +108,9 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in
 ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __HAIR__
-	if(sd->type & PRIMITIVE_ALL_CURVE)
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)
 #ifdef __DPDU__
-		return normalize(sd->dPdu);
+		return normalize(ccl_fetch(sd, dPdu));
 #else
 		return make_float3(0.0f, 0.0f, 0.0f);
 #endif
@@ -124,12 +124,12 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 		float3 data = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL);
 		data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f);
 		object_normal_transform(kg, sd, &data);
-		return cross(sd->N, normalize(cross(data, sd->N)));
+		return cross(ccl_fetch(sd, N), normalize(cross(data, ccl_fetch(sd, N))));
 	}
 	else {
 		/* otherwise use surface derivatives */
 #ifdef __DPDU__
-		return normalize(sd->dPdu);
+		return normalize(ccl_fetch(sd, dPdu));
 #else
 		return make_float3(0.0f, 0.0f, 0.0f);
 #endif
@@ -144,16 +144,16 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 	float3 center;
 
 #ifdef __HAIR__
-	bool is_curve_primitive = sd->type & PRIMITIVE_ALL_CURVE;
+	bool is_curve_primitive = ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE;
 	if(is_curve_primitive) {
 		center = curve_motion_center_location(kg, sd);
 
-		if(!(sd->flag & SD_TRANSFORM_APPLIED))
+		if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED))
 			object_position_transform(kg, sd, &center);
 	}
 	else
 #endif
-		center = sd->P;
+		center = ccl_fetch(sd, P);
 
 	float3 motion_pre = center, motion_post = center;
 
@@ -164,16 +164,16 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 	if(offset != ATTR_STD_NOT_FOUND) {
 		/* get motion info */
 		int numverts, numkeys;
-		object_motion_info(kg, sd->object, NULL, &numverts, &numkeys);
+		object_motion_info(kg, ccl_fetch(sd, object), NULL, &numverts, &numkeys);
 
 		/* lookup attributes */
-		int offset_next = (sd->type & PRIMITIVE_ALL_TRIANGLE)? offset + numverts: offset + numkeys;
+		int offset_next = (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? offset + numverts: offset + numkeys;
 
 		motion_pre = primitive_attribute_float3(kg, sd, elem, offset, NULL, NULL);
 		motion_post = primitive_attribute_float3(kg, sd, elem, offset_next, NULL, NULL);
 
 #ifdef __HAIR__
-		if(is_curve_primitive && (sd->flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
+		if(is_curve_primitive && (ccl_fetch(sd, flag) & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
 			object_position_transform(kg, sd, &motion_pre);
 			object_position_transform(kg, sd, &motion_post);
 		}
@@ -184,17 +184,17 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 	 * transformation was set match the world/object space of motion_pre/post */
 	Transform tfm;
 	
-	tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_PRE);
+	tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_PRE);
 	motion_pre = transform_point(&tfm, motion_pre);
 
-	tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_POST);
+	tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_POST);
 	motion_post = transform_point(&tfm, motion_post);
 
 	float3 motion_center;
 
 	/* camera motion, for perspective/orthographic motion.pre/post will be a
 	 * world-to-raster matrix, for panorama it's world-to-camera */
-	if (kernel_data.cam.type != CAMERA_PANORAMA) {
+	if(kernel_data.cam.type != CAMERA_PANORAMA) {
 		tfm = kernel_data.cam.worldtoraster;
 		motion_center = transform_perspective(&tfm, center);
 
diff --git a/intern/cycles/kernel/geom/geom_qbvh_shadow.h b/intern/cycles/kernel/geom/geom_qbvh_shadow.h
index 4233ff15c86..f79b2ed9f34 100644
--- a/intern/cycles/kernel/geom/geom_qbvh_shadow.h
+++ b/intern/cycles/kernel/geom/geom_qbvh_shadow.h
@@ -155,11 +155,11 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					++stackPtr;
 					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
 					traversalStack[stackPtr].addr = c1;
-					traversalStack[stackPtr].dist = c1;
+					traversalStack[stackPtr].dist = d1;
 					++stackPtr;
 					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
 					traversalStack[stackPtr].addr = c0;
-					traversalStack[stackPtr].dist = c0;
+					traversalStack[stackPtr].dist = d0;
 
 					/* Three children are hit, push all onto stack and sort 3
 					 * stack items, continue with closest child.
@@ -206,7 +206,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 			/* If node is leaf, fetch triangle list. */
 			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_QNODE_SIZE+6);
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE);
 #ifdef __VISIBILITY_FLAG__
 				if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) {
 					/* Pop. */
@@ -241,7 +241,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 						switch(p_type) {
 							case PRIMITIVE_TRIANGLE: {
-								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr);
+								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, PATH_RAY_SHADOW, object, primAddr);
 								break;
 							}
 #if BVH_FEATURE(BVH_MOTION)
@@ -279,7 +279,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 							if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
 #endif
 							{
-								shader =  kernel_tex_fetch(__tri_shader, prim);
+								shader = kernel_tex_fetch(__tri_shader, prim);
 							}
 #ifdef __HAIR__
 							else {
diff --git a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h
index 62598115fa3..d85e1a4691e 100644
--- a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h
+++ b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h
@@ -202,7 +202,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 			/* If node is leaf, fetch triangle list. */
 			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_QNODE_SIZE+6);
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE);
 				int primAddr = __float_as_int(leaf.x);
 
 #if BVH_FEATURE(BVH_INSTANCING)
@@ -226,7 +226,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								if(tri_object != subsurface_object) {
 									continue;
 								}
-								triangle_intersect_subsurface(kg, &isect_precalc, isect_array, P, dir, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
+								triangle_intersect_subsurface(kg, &isect_precalc, isect_array, P, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
 							}
 							break;
 						}
diff --git a/intern/cycles/kernel/geom/geom_qbvh_traversal.h b/intern/cycles/kernel/geom/geom_qbvh_traversal.h
index 99d2fb20837..7e356ea062b 100644
--- a/intern/cycles/kernel/geom/geom_qbvh_traversal.h
+++ b/intern/cycles/kernel/geom/geom_qbvh_traversal.h
@@ -80,6 +80,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 #if defined(__KERNEL_DEBUG__)
 	isect->num_traversal_steps = 0;
+	isect->num_traversed_instances = 0;
 #endif
 
 	ssef tnear(0.0f), tfar(ray->t);
@@ -185,6 +186,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					if(traverseChild == 0) {
 						if(d1 < d0) {
 							nodeAddr = c1;
+							nodeDist = d1;
 							++stackPtr;
 							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
 							traversalStack[stackPtr].addr = c0;
@@ -193,6 +195,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						}
 						else {
 							nodeAddr = c0;
+							nodeDist = d0;
 							++stackPtr;
 							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
 							traversalStack[stackPtr].addr = c1;
@@ -260,7 +263,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 			/* If node is leaf, fetch triangle list. */
 			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_QNODE_SIZE+6);
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE);
 
 #ifdef __VISIBILITY_FLAG__
 				if(UNLIKELY((nodeDist > isect->t) || ((__float_as_uint(leaf.z) & visibility) == 0)))
@@ -296,7 +299,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								isect->num_traversal_steps++;
 #endif
 								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-								if(triangle_intersect(kg, &isect_precalc, isect, P, dir, visibility, object, primAddr)) {
+								if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) {
 									tfar = ssef(isect->t);
 									/* Shadow ray early termination. */
 									if(visibility == PATH_RAY_SHADOW_OPAQUE)
@@ -377,6 +380,10 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					traversalStack[stackPtr].dist = -FLT_MAX;
 
 					nodeAddr = kernel_tex_fetch(__object_node, object);
+
+#if defined(__KERNEL_DEBUG__)
+					isect->num_traversed_instances++;
+#endif
 				}
 			}
 #endif  /* FEATURE(BVH_INSTANCING) */
diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume.h b/intern/cycles/kernel/geom/geom_qbvh_volume.h
index 2c396e99fc4..d8cfa3a4061 100644
--- a/intern/cycles/kernel/geom/geom_qbvh_volume.h
+++ b/intern/cycles/kernel/geom/geom_qbvh_volume.h
@@ -95,10 +95,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 		do {
 			/* Traverse internal nodes. */
 			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
-#if defined(__KERNEL_DEBUG__)
-				isect->num_traversal_steps++;
-#endif
-
 				ssef dist;
 				int traverseChild = qbvh_node_intersect(kg,
 				                                        tnear,
@@ -208,7 +204,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 			/* If node is leaf, fetch triangle list. */
 			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_QNODE_SIZE+6);
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE);
 				int primAddr = __float_as_int(leaf.x);
 
 #if BVH_FEATURE(BVH_INSTANCING)
@@ -234,7 +230,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 									continue;
 								}
 								/* Intersect ray against primitive. */
-								triangle_intersect(kg, &isect_precalc, isect, P, dir, visibility, object, primAddr);
+								triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr);
 							}
 							break;
 						}
diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume_all.h b/intern/cycles/kernel/geom/geom_qbvh_volume_all.h
new file mode 100644
index 00000000000..d5131919944
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_qbvh_volume_all.h
@@ -0,0 +1,446 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect_array,
+                                             const uint max_hits)
+{
+	/* TODO(sergey):
+	 * - Test if pushing distance on the stack helps.
+	 * - Likely and unlikely for if() statements.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack in CUDA thread-local memory. */
+	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
+	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
+
+	/* Traversal variables in registers. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* Ray parameters in registers. */
+	const float tmax = ray->t;
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = tmax;
+
+	const uint visibility = PATH_RAY_ALL_VISIBILITY;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+#ifndef __KERNEL_SSE41__
+	if(!isfinite(P.x)) {
+		return false;
+	}
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+	int num_hits_in_instance = 0;
+#endif
+
+	uint num_hits = 0;
+	isect_array->t = tmax;
+
+	ssef tnear(0.0f), tfar(isect_t);
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+	/* Offsets to select the side that becomes the lower or upper bound. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop. */
+	do {
+		do {
+			/* Traverse internal nodes. */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				ssef dist;
+				int traverseChild = qbvh_node_intersect(kg,
+				                                        tnear,
+				                                        tfar,
+#ifdef __KERNEL_AVX2__
+				                                        P_idir4,
+#else
+				                                        org,
+#endif
+				                                        idir4,
+				                                        near_x, near_y, near_z,
+				                                        far_x, far_y, far_z,
+				                                        nodeAddr,
+				                                        &dist);
+
+				if(traverseChild != 0) {
+					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+
+					/* One child is hit, continue with that child. */
+					int r = __bscf(traverseChild);
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					float d0 = ((float*)&dist)[r];
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c0;
+							traversalStack[stackPtr].dist = d0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c1;
+							traversalStack[stackPtr].dist = d1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c1;
+					traversalStack[stackPtr].dist = d1;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c0;
+					traversalStack[stackPtr].dist = d0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = c2;
+						traversalStack[stackPtr].dist = d2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2]);
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c3;
+					traversalStack[stackPtr].dist = d3;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c2;
+					traversalStack[stackPtr].dist = d2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3]);
+				}
+
+				nodeAddr = traversalStack[stackPtr].addr;
+				--stackPtr;
+			}
+
+			/* If node is leaf, fetch triangle list. */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE);
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+					const uint p_type = type & PRIMITIVE_ALL;
+					bool hit;
+
+					/* Pop. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					--stackPtr;
+
+					/* Primitive intersection. */
+					switch(p_type) {
+						case PRIMITIVE_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Only primitives from volume object. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								/* Intersect ray against primitive. */
+								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, primAddr);
+								if(hit) {
+									/* Move on to next entry in intersections array. */
+									isect_array++;
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									isect_array->t = isect_t;
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_MOTION)
+										float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir));
+#else
+										Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+										float t_fac = len(transform_direction(&tfm, 1.0f/idir));
+#endif
+										for(int i = 0; i < num_hits_in_instance; i++) {
+											(isect_array-i-1)->t *= t_fac;
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+								}
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Only primitives from volume object. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								/* Intersect ray against primitive. */
+								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, primAddr);
+								if(hit) {
+									/* Move on to next entry in intersections array. */
+									isect_array++;
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									isect_array->t = isect_t;
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+#  if BVH_FEATURE(BVH_MOTION)
+										float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir));
+#  else
+										Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+										float t_fac = len(transform_direction(&tfm, 1.0f/idir));
+#endif
+										for(int i = 0; i < num_hits_in_instance; i++) {
+											(isect_array-i-1)->t *= t_fac;
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+								}
+							}
+							break;
+						}
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+						case PRIMITIVE_CURVE:
+						case PRIMITIVE_MOTION_CURVE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Only primitives from volume object. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								/* Intersect ray against primitive. */
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
+									hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								else
+									hit = bvh_curve_intersect(kg, isect_array, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								if(hit) {
+									/* Move on to next entry in intersections array. */
+									isect_array++;
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									isect_array->t = isect_t;
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+#  if BVH_FEATURE(BVH_MOTION)
+										float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir));
+#  else
+										Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+										float t_fac = len(transform_direction(&tfm, 1.0f/idir));
+#endif
+										for(int i = 0; i < num_hits_in_instance; i++) {
+											(isect_array-i-1)->t *= t_fac;
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+								}
+							}
+							break;
+						}
+#endif
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* Instance push. */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+					int object_flag = kernel_tex_fetch(__object_flag, object);
+
+					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
+#if BVH_FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
+#else
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+#endif
+
+						if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+						if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+						if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+						tfar = ssef(isect_t);
+						idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+						P_idir = P*idir;
+						P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+						org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+						triangle_intersect_precalc(dir, &isect_precalc);
+						num_hits_in_instance = 0;
+						isect_array->t = isect_t;
+
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* Pop. */
+						object = OBJECT_NONE;
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+					}
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			/* Instance pop. */
+			if(num_hits_in_instance) {
+				float t_fac;
+#if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_tfm);
+#else
+				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+#endif
+				triangle_intersect_precalc(dir, &isect_precalc);
+				/* Scale isect->t to adjust for instancing. */
+				for(int i = 0; i < num_hits_in_instance; i++) {
+					(isect_array-i-1)->t *= t_fac;
+				}
+			}
+			else {
+				float ignore_t = FLT_MAX;
+#if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_tfm);
+#else
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+#endif
+				triangle_intersect_precalc(dir, &isect_precalc);
+			}
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			tfar = ssef(isect_t);
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+			org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+			triangle_intersect_precalc(dir, &isect_precalc);
+			isect_t = tmax;
+			isect_array->t = isect_t;
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr].addr;
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return num_hits;
+}
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index dd3928682e3..995dfac5b09 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -27,14 +27,14 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
 {
 	/* load triangle vertices */
-	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
 
 	float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
 	float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
 	float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
 	
 	/* return normal */
-	if(sd->flag & SD_NEGATIVE_SCALE_APPLIED)
+	if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED)
 		return normalize(cross(v2 - v0, v1 - v0));
 	else
 		return normalize(cross(v1 - v0, v2 - v0));
@@ -94,7 +94,7 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo
 
 /* Ray differentials on triangle */
 
-ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, float3 *dPdu, float3 *dPdv)
+ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, ccl_addr_space float3 *dPdu, ccl_addr_space float3 *dPdv)
 {
 	/* fetch triangle vertex coordinates */
 	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -116,34 +116,34 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s
 		if(dx) *dx = 0.0f;
 		if(dy) *dy = 0.0f;
 
-		return kernel_tex_fetch(__attributes_float, offset + sd->prim);
+		return kernel_tex_fetch(__attributes_float, offset + ccl_fetch(sd, prim));
 	}
 	else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
-		float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+		float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
 
 		float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x));
 		float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y));
 		float f2 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.z));
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif
 
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
 	}
 	else if(elem == ATTR_ELEMENT_CORNER) {
-		int tri = offset + sd->prim*3;
+		int tri = offset + ccl_fetch(sd, prim)*3;
 		float f0 = kernel_tex_fetch(__attributes_float, tri + 0);
 		float f1 = kernel_tex_fetch(__attributes_float, tri + 1);
 		float f2 = kernel_tex_fetch(__attributes_float, tri + 2);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif
 
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
 	}
 	else {
 		if(dx) *dx = 0.0f;
@@ -159,24 +159,24 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + ccl_fetch(sd, prim)));
 	}
 	else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
-		float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+		float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
 
 		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x)));
 		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y)));
 		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z)));
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif
 
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
 	}
 	else if(elem == ATTR_ELEMENT_CORNER || elem == ATTR_ELEMENT_CORNER_BYTE) {
-		int tri = offset + sd->prim*3;
+		int tri = offset + ccl_fetch(sd, prim)*3;
 		float3 f0, f1, f2;
 
 		if(elem == ATTR_ELEMENT_CORNER) {
@@ -191,11 +191,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 		}
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif
 
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
 	}
 	else {
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index c9e30a451da..3ef918dc842 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-/* Triangle/Ray intersections .
+/* Triangle/Ray intersections.
  *
  * For BVH ray intersection we use a precomputed triangle storage to accelerate
  * intersection at the cost of more memory usage.
@@ -49,18 +49,27 @@ typedef struct IsectPrecalc {
 	float Sx, Sy, Sz;
 } IsectPrecalc;
 
-/* Workaround for CUDA toolkit 6.5.16. */
-#if defined(__KERNEL_CPU__) || !defined(__KERNEL_CUDA_EXPERIMENTAL__) || __CUDA_ARCH__ < 500
+#if defined(__KERNEL_CUDA__)
 #  if (defined(i386) || defined(_M_IX86))
+#    if __CUDA_ARCH__ > 500
 ccl_device_noinline
-#  else
+#    else  /* __CUDA_ARCH__ > 500 */
 ccl_device_inline
-#  endif
-#else
+#    endif  /* __CUDA_ARCH__ > 500 */
+#  else  /* (defined(i386) || defined(_M_IX86)) */
+#    if defined(__KERNEL_EXPERIMENTAL__) && (__CUDA_ARCH__ >= 500)
 ccl_device_noinline
-#endif
+#    else
+ccl_device_inline
+#    endif
+#  endif  /* (defined(i386) || defined(_M_IX86)) */
+#elif defined(__KERNEL_OPENCL_APPLE__)
+ccl_device_noinline
+#else  /* defined(__KERNEL_OPENCL_APPLE__) */
+ccl_device_inline
+#endif  /* defined(__KERNEL_OPENCL_APPLE__) */
 void triangle_intersect_precalc(float3 dir,
-                                 IsectPrecalc *isect_precalc)
+                                IsectPrecalc *isect_precalc)
 {
 	/* Calculate dimension where the ray direction is maximal. */
 	int kz = util_max_axis(make_float3(fabsf(dir.x),
@@ -77,10 +86,10 @@ void triangle_intersect_precalc(float3 dir,
 	}
 
 	/* Calculate the shear constants. */
-	float inf_dir_z = 1.0f / IDX(dir, kz);
-	isect_precalc->Sx = IDX(dir, kx) * inf_dir_z;
-	isect_precalc->Sy = IDX(dir, ky) * inf_dir_z;
-	isect_precalc->Sz = inf_dir_z;
+	float inv_dir_z = 1.0f / IDX(dir, kz);
+	isect_precalc->Sx = IDX(dir, kx) * inv_dir_z;
+	isect_precalc->Sy = IDX(dir, ky) * inv_dir_z;
+	isect_precalc->Sz = inv_dir_z;
 
 	/* Store the dimensions. */
 	isect_precalc->kx = kx;
@@ -98,7 +107,6 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
                                           const IsectPrecalc *isect_precalc,
                                           Intersection *isect,
                                           float3 P,
-                                          float3 dir,
                                           uint visibility,
                                           int object,
                                           int triAddr)
@@ -111,14 +119,12 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
 	const float Sz = isect_precalc->Sz;
 
 	/* Calculate vertices relative to ray origin. */
-	float3 tri[3];
-	tri[0] = float4_to_float3(kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0));
-	tri[1] = float4_to_float3(kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1));
-	tri[2] = float4_to_float3(kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2));
-
-	const float3 A = tri[0] - P;
-	const float3 B = tri[1] - P;
-	const float3 C = tri[2] - P;
+	const float4 tri_a = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0),
+	             tri_b = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1),
+	             tri_c = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
+	const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z);
+	const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z);
+	const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z);
 
 	const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz);
 	const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz);
@@ -155,8 +161,8 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
 	 */
 	const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz;
 	const float sign_T = xor_signmast(T, sign_mask);
-	if ((sign_T < 0.0f) ||
-	    (sign_T > isect->t * xor_signmast(det, sign_mask)))
+	if((sign_T < 0.0f) ||
+	   (sign_T > isect->t * xor_signmast(det, sign_mask)))
 	{
 		return false;
 	}
@@ -191,7 +197,6 @@ ccl_device_inline void triangle_intersect_subsurface(
         const IsectPrecalc *isect_precalc,
         Intersection *isect_array,
         float3 P,
-        float3 dir,
         int object,
         int triAddr,
         float tmax,
@@ -207,14 +212,12 @@ ccl_device_inline void triangle_intersect_subsurface(
 	const float Sz = isect_precalc->Sz;
 
 	/* Calculate vertices relative to ray origin. */
-	float3 tri[3];
-	tri[0] = float4_to_float3(kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0));
-	tri[1] = float4_to_float3(kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1));
-	tri[2] = float4_to_float3(kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2));
-
-	const float3 A = tri[0] - P;
-	const float3 B = tri[1] - P;
-	const float3 C = tri[2] - P;
+	const float4 tri_a = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0),
+	             tri_b = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1),
+	             tri_c = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
+	const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z);
+	const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z);
+	const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z);
 
 	const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz);
 	const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz);
@@ -249,13 +252,10 @@ ccl_device_inline void triangle_intersect_subsurface(
 	/* Calculate scaled z−coordinates of vertices and use them to calculate
 	 * the hit distance.
 	 */
-	const float Az = Sz * A_kz;
-	const float Bz = Sz * B_kz;
-	const float Cz = Sz * C_kz;
-	const float T = U * Az + V * Bz + W * Cz;
-
-	if ((xor_signmast(T, sign_mask) < 0.0f) ||
-	    (xor_signmast(T, sign_mask) > tmax * xor_signmast(det, sign_mask)))
+	const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz;
+	const float sign_T = xor_signmast(T, sign_mask);
+	if((sign_T < 0.0f) ||
+	   (sign_T > tmax * xor_signmast(det, sign_mask)))
 	{
 		return;
 	}
@@ -315,7 +315,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
 			return P;
 		}
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
+		Transform tfm = ccl_fetch(sd, ob_itfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
 #endif
@@ -327,14 +327,12 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
 
 	P = P + D*t;
 
-	float3 tri[3];
-	tri[0] = float4_to_float3(kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0));
-	tri[1] = float4_to_float3(kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+1));
-	tri[2] = float4_to_float3(kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+2));
-
-	float3 edge1 = tri[0] - tri[2];
-	float3 edge2 = tri[1] - tri[2];
-	float3 tvec = P - tri[2];
+	const float4 tri_a = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0),
+	             tri_b = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+1),
+	             tri_c = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+2);
+	float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z);
+	float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z);
+	float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
 	float3 qvec = cross(tvec, edge1);
 	float3 pvec = cross(D, edge2);
 	float rt = dot(edge2, qvec) / dot(edge1, pvec);
@@ -343,7 +341,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
+		Transform tfm = ccl_fetch(sd, ob_tfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
 #endif
@@ -372,7 +370,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
 #ifdef __INTERSECTION_REFINE__
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
+		Transform tfm = ccl_fetch(sd, ob_itfm);
 #else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -386,14 +384,12 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
 
 	P = P + D*t;
 
-	float3 tri[3];
-	tri[0] = float4_to_float3(kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0));
-	tri[1] = float4_to_float3(kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+1));
-	tri[2] = float4_to_float3(kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+2));
-
-	float3 edge1 = tri[0] - tri[2];
-	float3 edge2 = tri[1] - tri[2];
-	float3 tvec = P - tri[2];
+	const float4 tri_a = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0),
+	             tri_b = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+1),
+	             tri_c = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+2);
+	float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z);
+	float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z);
+	float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
 	float3 qvec = cross(tvec, edge1);
 	float3 pvec = cross(D, edge2);
 	float rt = dot(edge2, qvec) / dot(edge1, pvec);
@@ -402,7 +398,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
+		Transform tfm = ccl_fetch(sd, ob_tfm);
 #else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index c33509fbf4f..c72afa2a3a4 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -60,7 +60,7 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 #endif
 
 	if(dx) *dx = 0.0f;
-	if(dx) *dy = 0.0f;
+	if(dy) *dy = 0.0f;
 
 	/* todo: support float textures to lower memory usage for single floats */
 	return average(float4_to_float3(r));
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 369c615eade..2dc87fffcbc 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -176,7 +176,7 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
 #endif
 }
 
-ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, float3 *throughput,
+ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
 	BsdfEval *bsdf_eval, float bsdf_pdf, int bounce, int bsdf_label)
 {
 	float inverse_pdf = 1.0f/bsdf_pdf;
@@ -341,12 +341,12 @@ ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L)
 
 ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadiance *L)
 {
-	float3 L_sum, L_direct, L_indirect;
-	float clamp_direct = kernel_data.integrator.sample_clamp_direct;
-	float clamp_indirect = kernel_data.integrator.sample_clamp_indirect;
-
+	float3 L_sum;
 	/* Light Passes are used */
 #ifdef __PASSES__
+	float3 L_direct, L_indirect;
+	float clamp_direct = kernel_data.integrator.sample_clamp_direct;
+	float clamp_indirect = kernel_data.integrator.sample_clamp_indirect;
 	if(L->use_light_pass) {
 		path_radiance_sum_indirect(L);
 
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index 20d7a143c67..2fca83c615f 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -57,7 +57,7 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
 		/* sample subsurface scattering */
 		if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
 			/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
-			if (kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput))
+			if(kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput))
 				is_sss_sample = true;
 		}
 #endif
@@ -208,7 +208,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		filter_x = filter_y = 0.5f;
 	}
 	else {
-		path_rng_2D(kg, &rng, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_x);
+		path_rng_2D(kg, &rng, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y);
 	}
 
 	/* subpixel u/v offset */
@@ -259,7 +259,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		/* data passes */
 		case SHADER_EVAL_NORMAL:
 		{
-			if ((sd.flag & SD_HAS_BUMP)) {
+			if((sd.flag & SD_HAS_BUMP)) {
 				shader_eval_surface(kg, &sd, 0.f, 0, SHADER_CONTEXT_MAIN);
 			}
 
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index ded222e20ff..3ce5134181a 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -16,17 +16,6 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Workaround for explicit conversion from constant to private memory
- * pointer when using OpenCL.
- *
- * TODO(sergey): Find a real solution for this.
- */
-#ifdef __KERNEL_OPENCL__
-#  define __motion_as_decoupled_const_ptr(motion) ((motion))
-#else
-#  define __motion_as_decoupled_const_ptr(motion) ((const DecompMotionTransform*)(motion))
-#endif
-
 /* Perspective Camera */
 
 ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v)
@@ -50,7 +39,7 @@ ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v)
 	return bokeh;
 }
 
-ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
 {
 	/* create ray form raster position */
 	Transform rastertocamera = kernel_data.cam.rastertocamera;
@@ -80,9 +69,16 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo
 
 #ifdef __CAMERA_MOTION__
 	if(kernel_data.cam.have_motion) {
+#ifdef __KERNEL_OPENCL__
+		const MotionTransform tfm = kernel_data.cam.motion;
 		transform_motion_interpolate(&cameratoworld,
-		                             __motion_as_decoupled_const_ptr(&kernel_data.cam.motion),
+		                             ((const DecompMotionTransform*)&tfm),
 		                             ray->time);
+#else
+		transform_motion_interpolate(&cameratoworld,
+		                             ((const DecompMotionTransform*)&kernel_data.cam.motion),
+		                             ray->time);
+#endif
 	}
 #endif
 
@@ -112,8 +108,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo
 }
 
 /* Orthographic Camera */
-
-ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
 {
 	/* create ray form raster position */
 	Transform rastertocamera = kernel_data.cam.rastertocamera;
@@ -144,9 +139,16 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl
 
 #ifdef __CAMERA_MOTION__
 	if(kernel_data.cam.have_motion) {
+#ifdef __KERNEL_OPENCL__
+		const MotionTransform tfm = kernel_data.cam.motion;
 		transform_motion_interpolate(&cameratoworld,
-		                             __motion_as_decoupled_const_ptr(&kernel_data.cam.motion),
+		                             (const DecompMotionTransform*)&tfm,
 		                             ray->time);
+#else
+		transform_motion_interpolate(&cameratoworld,
+		                             (const DecompMotionTransform*)&kernel_data.cam.motion,
+		                             ray->time);
+#endif
 	}
 #endif
 
@@ -172,7 +174,7 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl
 
 /* Panorama Camera */
 
-ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
 {
 	Transform rastertocamera = kernel_data.cam.rastertocamera;
 	float3 Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f));
@@ -220,10 +222,18 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float
 	Transform cameratoworld = kernel_data.cam.cameratoworld;
 
 #ifdef __CAMERA_MOTION__
-	if(kernel_data.cam.have_motion)
+	if(kernel_data.cam.have_motion) {
+#ifdef __KERNEL_OPENCL__
+		const MotionTransform tfm = kernel_data.cam.motion;
 		transform_motion_interpolate(&cameratoworld,
-		                             __motion_as_decoupled_const_ptr(&kernel_data.cam.motion),
+		                             (const DecompMotionTransform*)&tfm,
 		                             ray->time);
+#else
+		transform_motion_interpolate(&cameratoworld,
+		                             (const DecompMotionTransform*)&kernel_data.cam.motion,
+		                             ray->time);
+#endif
+	}
 #endif
 
 	ray->P = transform_point(&cameratoworld, ray->P);
@@ -245,7 +255,7 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float
 /* Common */
 
 ccl_device void camera_sample(KernelGlobals *kg, int x, int y, float filter_u, float filter_v,
-	float lens_u, float lens_v, float time, Ray *ray)
+	float lens_u, float lens_v, float time, ccl_addr_space Ray *ray)
 {
 	/* pixel filter */
 	int filter_table_offset = kernel_data.film.filter_table_offset;
@@ -308,7 +318,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 {
 	if(kernel_data.cam.type != CAMERA_PANORAMA) {
 		/* perspective / ortho */
-		if(sd->object == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
+		if(ccl_fetch(sd, object) == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
 			P += camera_position(kg);
 
 		Transform tfm = kernel_data.cam.worldtondc;
@@ -318,7 +328,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 		/* panorama */
 		Transform tfm = kernel_data.cam.worldtocamera;
 
-		if(sd->object != OBJECT_NONE)
+		if(ccl_fetch(sd, object) != OBJECT_NONE)
 			P = normalize(transform_point(&tfm, P));
 		else
 			P = normalize(transform_direction(&tfm, P));
@@ -329,7 +339,4 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 	}
 }
 
-#undef __motion_as_decoupled_const_ptr
-
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 200667a0911..0bf1ed36d1e 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -24,6 +24,15 @@
  */
 #if defined(__GNUC__) && defined(NDEBUG)
 #  pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#  pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+
+/* Selective nodes compilation. */
+#ifndef __NODES_MAX_GROUP__
+#  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
+#endif
+#ifndef __NODES_FEATURES__
+#  define __NODES_FEATURES__ NODE_FEATURE_ALL
 #endif
 
 #include "util_debug.h"
@@ -32,6 +41,8 @@
 #include "util_half.h"
 #include "util_types.h"
 
+#define ccl_addr_space
+
 /* On x86_64, versions of glibc < 2.16 have an issue where expf is
  * much slower than the double version.  This was fixed in glibc 2.16.
  */
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 904736c190c..9fdd3abfec3 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -22,6 +22,14 @@
 #define CCL_NAMESPACE_BEGIN
 #define CCL_NAMESPACE_END
 
+/* Selective nodes compilation. */
+#ifndef __NODES_MAX_GROUP__
+#  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
+#endif
+#ifndef __NODES_FEATURES__
+#  define __NODES_FEATURES__ NODE_FEATURE_ALL
+#endif
+
 #include <cuda.h>
 #include <float.h>
 
@@ -33,6 +41,7 @@
 #define ccl_global
 #define ccl_constant
 #define ccl_may_alias
+#define ccl_addr_space
 
 /* No assert supported for CUDA */
 
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index d480ec0f270..e8b36d2605d 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -37,6 +37,22 @@
 #define ccl_may_alias
 #define ccl_constant __constant
 #define ccl_global __global
+#define ccl_local __local
+#define ccl_private __private
+
+#ifdef __SPLIT_KERNEL__
+#define ccl_addr_space __global
+#else
+#define ccl_addr_space
+#endif
+
+/* Selective nodes compilation. */
+#ifndef __NODES_MAX_GROUP__
+#  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
+#endif
+#ifndef __NODES_FEATURES__
+#  define __NODES_FEATURES__ NODE_FEATURE_ALL
+#endif
 
 /* no assert in opencl */
 #define kernel_assert(cond)
diff --git a/intern/cycles/kernel/kernel_debug.h b/intern/cycles/kernel/kernel_debug.h
index f532442ba41..24d6458567e 100644
--- a/intern/cycles/kernel/kernel_debug.h
+++ b/intern/cycles/kernel/kernel_debug.h
@@ -19,11 +19,13 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void debug_data_init(DebugData *debug_data)
 {
 	debug_data->num_bvh_traversal_steps = 0;
+	debug_data->num_bvh_traversed_instances = 0;
+	debug_data->num_ray_bounces = 0;
 }
 
 ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
                                                  ccl_global float *buffer,
-                                                 PathState *state,
+                                                 ccl_addr_space PathState *state,
                                                  DebugData *debug_data,
                                                  int sample)
 {
@@ -33,6 +35,16 @@ ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
 		                        sample,
 		                        debug_data->num_bvh_traversal_steps);
 	}
+	if(flag & PASS_BVH_TRAVERSED_INSTANCES) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances,
+		                        sample,
+		                        debug_data->num_bvh_traversed_instances);
+	}
+	if(flag & PASS_RAY_BOUNCES) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces,
+		                        sample,
+		                        debug_data->num_ray_bounces);
+	}
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_differential.h b/intern/cycles/kernel/kernel_differential.h
index e5fbd5b450e..ae1e70f0167 100644
--- a/intern/cycles/kernel/kernel_differential.h
+++ b/intern/cycles/kernel/kernel_differential.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* See "Tracing Ray Differentials", Homan Igehy, 1999. */
 
-ccl_device void differential_transfer(differential3 *dP_, const differential3 dP, float3 D, const differential3 dD, float3 Ng, float t)
+ccl_device void differential_transfer(ccl_addr_space differential3 *dP_, const differential3 dP, float3 D, const differential3 dD, float3 Ng, float t)
 {
 	/* ray differential transfer through homogeneous medium, to
 	 * compute dPdx/dy at a shading point from the incoming ray */
@@ -31,7 +31,7 @@ ccl_device void differential_transfer(differential3 *dP_, const differential3 dP
 	dP_->dy = tmpy - dot(tmpy, Ng)*tmp;
 }
 
-ccl_device void differential_incoming(differential3 *dI, const differential3 dD)
+ccl_device void differential_incoming(ccl_addr_space differential3 *dI, const differential3 dD)
 {
 	/* compute dIdx/dy at a shading point, we just need to negate the
 	 * differential of the ray direction */
@@ -40,7 +40,7 @@ ccl_device void differential_incoming(differential3 *dI, const differential3 dD)
 	dI->dy = -dD.dy;
 }
 
-ccl_device void differential_dudv(differential *du, differential *dv, float3 dPdu, float3 dPdv, differential3 dP, float3 Ng)
+ccl_device void differential_dudv(ccl_addr_space differential *du, ccl_addr_space differential *dv, float3 dPdu, float3 dPdv, differential3 dP, float3 Ng)
 {
 	/* now we have dPdx/dy from the ray differential transfer, and dPdu/dv
 	 * from the primitive, we can compute dudx/dy and dvdx/dy. these are
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index 7523105607f..de9e8d77ec8 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -17,12 +17,20 @@
 CCL_NAMESPACE_BEGIN
 
 /* Direction Emission */
-
 ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
-	LightSample *ls, float3 I, differential3 dI, float t, float time, int bounce, int transparent_bounce)
+	LightSample *ls, float3 I, differential3 dI, float t, float time, int bounce, int transparent_bounce
+#ifdef __SPLIT_KERNEL__
+	,ShaderData *sd_input
+#endif
+)
 {
 	/* setup shading at emitter */
-	ShaderData sd;
+#ifdef __SPLIT_KERNEL__
+	ShaderData *sd = sd_input;
+#else
+	ShaderData sd_object;
+	ShaderData *sd = &sd_object;
+#endif
 	float3 eval;
 
 #ifdef __BACKGROUND_MIS__
@@ -37,23 +45,23 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 		ray.dP = differential3_zero();
 		ray.dD = dI;
 
-		shader_setup_from_background(kg, &sd, &ray, bounce+1, transparent_bounce);
-		eval = shader_eval_background(kg, &sd, 0, SHADER_CONTEXT_EMISSION);
+		shader_setup_from_background(kg, sd, &ray, bounce+1, transparent_bounce);
+		eval = shader_eval_background(kg, sd, 0, SHADER_CONTEXT_EMISSION);
 	}
 	else
 #endif
 	{
-		shader_setup_from_sample(kg, &sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, bounce+1, transparent_bounce);
+		shader_setup_from_sample(kg, sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, bounce+1, transparent_bounce);
 
-		ls->Ng = sd.Ng;
+		ls->Ng = ccl_fetch(sd, Ng);
 
 		/* no path flag, we're evaluating this for all closures. that's weak but
 		 * we'd have to do multiple evaluations otherwise */
-		shader_eval_surface(kg, &sd, 0.0f, 0, SHADER_CONTEXT_EMISSION);
+		shader_eval_surface(kg, sd, 0.0f, 0, SHADER_CONTEXT_EMISSION);
 
 		/* evaluate emissive closure */
-		if(sd.flag & SD_EMISSION)
-			eval = shader_emissive_eval(kg, &sd);
+		if(ccl_fetch(sd, flag) & SD_EMISSION)
+			eval = shader_emissive_eval(kg, sd);
 		else
 			eval = make_float3(0.0f, 0.0f, 0.0f);
 	}
@@ -65,7 +73,11 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 
 ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 	LightSample *ls, Ray *ray, BsdfEval *eval, bool *is_lamp,
-	int bounce, int transparent_bounce)
+	int bounce, int transparent_bounce
+#ifdef __SPLIT_KERNEL__
+	, ShaderData *sd_DL
+#endif
+	)
 {
 	if(ls->pdf == 0.0f)
 		return false;
@@ -74,7 +86,14 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 	differential3 dD = differential3_zero();
 
 	/* evaluate closure */
-	float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, sd->time, bounce, transparent_bounce);
+
+	float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, ccl_fetch(sd, time),
+	                                         bounce,
+	                                         transparent_bounce
+#ifdef __SPLIT_KERNEL__
+	                                         ,sd_DL
+#endif
+	                                         );
 
 	if(is_zero(light_eval))
 		return false;
@@ -83,7 +102,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 	float bsdf_pdf;
 
 #ifdef __VOLUME__
-	if(sd->prim != PRIM_NONE)
+	if(ccl_fetch(sd, prim) != PRIM_NONE)
 		shader_bsdf_eval(kg, sd, ls->D, eval, &bsdf_pdf);
 	else
 		shader_volume_phase_eval(kg, sd, ls->D, eval, &bsdf_pdf);
@@ -118,8 +137,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 
 	if(ls->shader & SHADER_CAST_SHADOW) {
 		/* setup ray */
-		bool transmit = (dot(sd->Ng, ls->D) < 0.0f);
-		ray->P = ray_offset(sd->P, (transmit)? -sd->Ng: sd->Ng);
+		bool transmit = (dot(ccl_fetch(sd, Ng), ls->D) < 0.0f);
+		ray->P = ray_offset(ccl_fetch(sd, P), (transmit)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
 
 		if(ls->t == FLT_MAX) {
 			/* distant light */
@@ -132,7 +151,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 			ray->D = normalize_len(ray->D, &ray->t);
 		}
 
-		ray->dP = sd->dP;
+		ray->dP = ccl_fetch(sd, dP);
 		ray->dD = differential3_zero();
 	}
 	else {
@@ -154,14 +173,14 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
 	float3 L = shader_emissive_eval(kg, sd);
 
 #ifdef __HAIR__
-	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE))
+	if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS) && (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE))
 #else
-	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
+	if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS))
 #endif
 	{
 		/* multiple importance sampling, get triangle light pdf,
 		 * and compute weight with respect to BSDF pdf */
-		float pdf = triangle_light_pdf(kg, sd->Ng, sd->I, t);
+		float pdf = triangle_light_pdf(kg, ccl_fetch(sd, Ng), ccl_fetch(sd, I), t);
 		float mis_weight = power_heuristic(bsdf_pdf, pdf);
 
 		return L*mis_weight;
@@ -172,7 +191,11 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
 
 /* Indirect Lamp Emission */
 
-ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *state, Ray *ray, float3 *emission)
+ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *state, Ray *ray, float3 *emission
+#ifdef __SPLIT_KERNEL__
+                                                ,ShaderData *sd
+#endif
+                                                )
 {
 	bool hit_lamp = false;
 
@@ -188,14 +211,21 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st
 		/* use visibility flag to skip lights */
 		if(ls.shader & SHADER_EXCLUDE_ANY) {
 			if(((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
-			   ((ls.shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & PATH_RAY_REFLECT)) ||
+			   ((ls.shader & SHADER_EXCLUDE_GLOSSY) &&
+			    ((state->flag & (PATH_RAY_GLOSSY|PATH_RAY_REFLECT)) == (PATH_RAY_GLOSSY|PATH_RAY_REFLECT))) ||
 			   ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
 			   ((ls.shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
 				continue;
 		}
 #endif
 
-		float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time, state->bounce, state->transparent_bounce);
+		float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time,
+		                                state->bounce,
+		                                state->transparent_bounce
+#ifdef __SPLIT_KERNEL__
+		                                ,sd
+#endif
+		                                );
 
 #ifdef __VOLUME__
 		if(state->volume_stack[0].shader != SHADER_NONE) {
@@ -224,7 +254,11 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st
 
 /* Indirect Background */
 
-ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *state, Ray *ray)
+ccl_device_noinline float3 indirect_background(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space Ray *ray
+#ifdef __SPLIT_KERNEL__
+                                               ,ShaderData *sd_global
+#endif
+                                               )
 {
 #ifdef __BACKGROUND__
 	int shader = kernel_data.background.surface_shader;
@@ -232,18 +266,25 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *sta
 	/* use visibility flag to skip lights */
 	if(shader & SHADER_EXCLUDE_ANY) {
 		if(((shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
-		   ((shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & PATH_RAY_REFLECT)) ||
+		   ((shader & SHADER_EXCLUDE_GLOSSY) &&
+		    ((state->flag & (PATH_RAY_GLOSSY|PATH_RAY_REFLECT)) == (PATH_RAY_GLOSSY|PATH_RAY_REFLECT))) ||
 		   ((shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
 		   ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA)) ||
 		   ((shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
 			return make_float3(0.0f, 0.0f, 0.0f);
 	}
 
+#ifdef __SPLIT_KERNEL__
 	/* evaluate background closure */
+	Ray priv_ray = *ray;
+	shader_setup_from_background(kg, sd_global, &priv_ray, state->bounce+1, state->transparent_bounce);
+	float3 L = shader_eval_background(kg, sd_global, state->flag, SHADER_CONTEXT_EMISSION);
+#else
 	ShaderData sd;
 	shader_setup_from_background(kg, &sd, ray, state->bounce+1, state->transparent_bounce);
 
 	float3 L = shader_eval_background(kg, &sd, state->flag, SHADER_CONTEXT_EMISSION);
+#endif
 
 #ifdef __BACKGROUND_MIS__
 	/* check if background light exists or if we should skip pdf */
@@ -252,7 +293,7 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *sta
 	if(!(state->flag & PATH_RAY_MIS_SKIP) && res) {
 		/* multiple importance sampling, get background light pdf for ray
 		 * direction, and compute weight with respect to BSDF pdf */
-		float pdf = background_light_pdf(kg, ray->D);
+		float pdf = background_light_pdf(kg, ray->P, ray->D);
 		float mis_weight = power_heuristic(state->ray_pdf, pdf);
 
 		return L*mis_weight;
diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h
index 4668b40b86d..f9e9b413898 100644
--- a/intern/cycles/kernel/kernel_film.h
+++ b/intern/cycles/kernel/kernel_film.h
@@ -27,7 +27,7 @@ ccl_device float4 film_map(KernelGlobals *kg, float4 irradiance, float scale)
 	result.z = color_scene_linear_to_srgb(result.z*exposure);
 
 	/* clamp since alpha might be > 1.0 due to russian roulette */
-	result.w = clamp(result.w, 0.0f, 1.0f);
+	result.w = saturate(result.w);
 
 	return result;
 }
@@ -37,10 +37,10 @@ ccl_device uchar4 film_float_to_byte(float4 color)
 	uchar4 result;
 
 	/* simple float to byte conversion */
-	result.x = (uchar)clamp(color.x*255.0f, 0.0f, 255.0f);
-	result.y = (uchar)clamp(color.y*255.0f, 0.0f, 255.0f);
-	result.z = (uchar)clamp(color.z*255.0f, 0.0f, 255.0f);
-	result.w = (uchar)clamp(color.w*255.0f, 0.0f, 255.0f);
+	result.x = (uchar)(saturate(color.x)*255.0f);
+	result.y = (uchar)(saturate(color.y)*255.0f);
+	result.z = (uchar)(saturate(color.z)*255.0f);
+	result.w = (uchar)(saturate(color.w)*255.0f);
 
 	return result;
 }
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index 0a9753baca2..17fa18909c4 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -80,7 +80,7 @@ typedef struct KernelGlobals {} KernelGlobals;
 
 #ifdef __KERNEL_OPENCL__
 
-typedef struct KernelGlobals {
+typedef ccl_addr_space struct KernelGlobals {
 	ccl_constant KernelData *data;
 
 #define KERNEL_TEX(type, ttype, name) \
@@ -94,7 +94,7 @@ typedef struct KernelGlobals {
 
 ccl_device float lookup_table_read(KernelGlobals *kg, float x, int offset, int size)
 {
-	x = clamp(x, 0.0f, 1.0f)*(size-1);
+	x = saturate(x)*(size-1);
 
 	int index = min(float_to_int(x), size-1);
 	int nindex = min(index+1, size-1);
@@ -110,7 +110,7 @@ ccl_device float lookup_table_read(KernelGlobals *kg, float x, int offset, int s
 
 ccl_device float lookup_table_read_2D(KernelGlobals *kg, float x, float y, int offset, int xsize, int ysize)
 {
-	y = clamp(y, 0.0f, 1.0f)*(ysize-1);
+	y = saturate(y)*(ysize-1);
 
 	int index = min(float_to_int(y), ysize-1);
 	int nindex = min(index+1, ysize-1);
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index 6953f005ea9..9ba41635b9e 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -128,7 +128,7 @@ ccl_device_inline uint cmj_permute(uint i, uint l, uint p)
 			i *= 0xc860a3df;
 			i &= w;
 			i ^= i >> 5;
-		} while (i >= l);
+		} while(i >= l);
 
 		return (i + p) % l;
 	}
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index 76fa754b5fa..1badbc3b9f7 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -33,6 +33,98 @@ typedef struct LightSample {
 	LightType type;		/* type of light */
 } LightSample;
 
+/* Area light sampling */
+
+/* Uses the following paper:
+ *
+ * Carlos Urena et al.
+ * An Area-Preserving Parametrization for Spherical Rectangles.
+ *
+ * https://www.solidangle.com/research/egsr2013_spherical_rectangle.pdf
+ *
+ * Note: light_p is modified when sample_coord is true.
+ */
+ccl_device float area_light_sample(float3 P,
+                                   float3 *light_p,
+                                   float3 axisu, float3 axisv,
+                                   float randu, float randv,
+                                   bool sample_coord)
+{
+	/* In our name system we're using P for the center,
+	* which is o in the paper.
+	*/
+
+	float3 corner = *light_p - axisu * 0.5f - axisv * 0.5f;
+	float axisu_len, axisv_len;
+	/* Compute local reference system R. */
+	float3 x = normalize_len(axisu, &axisu_len);
+	float3 y = normalize_len(axisv, &axisv_len);
+	float3 z = cross(x, y);
+	/* Compute rectangle coords in local reference system. */
+	float3 dir = corner - P;
+	float z0 = dot(dir, z);
+	/* Flip 'z' to make it point against Q. */
+	if(z0 > 0.0f) {
+		z *= -1.0f;
+		z0 *= -1.0f;
+	}
+	float x0 = dot(dir, x);
+	float y0 = dot(dir, y);
+	float x1 = x0 + axisu_len;
+	float y1 = y0 + axisv_len;
+	/* Create vectors to four vertices. */
+	float3 v00 = make_float3(x0, y0, z0);
+	float3 v01 = make_float3(x0, y1, z0);
+	float3 v10 = make_float3(x1, y0, z0);
+	float3 v11 = make_float3(x1, y1, z0);
+	/* Compute normals to edges. */
+	float3 n0 = normalize(cross(v00, v10));
+	float3 n1 = normalize(cross(v10, v11));
+	float3 n2 = normalize(cross(v11, v01));
+	float3 n3 = normalize(cross(v01, v00));
+	/* Compute internal angles (gamma_i). */
+	float g0 = safe_acosf(-dot(n0, n1));
+	float g1 = safe_acosf(-dot(n1, n2));
+	float g2 = safe_acosf(-dot(n2, n3));
+	float g3 = safe_acosf(-dot(n3, n0));
+	/* Compute predefined constants. */
+	float b0 = n0.z;
+	float b1 = n2.z;
+	float b0sq = b0 * b0;
+	float k = M_2PI_F - g2 - g3;
+	/* Compute solid angle from internal angles. */
+	float S = g0 + g1 - k;
+
+	if(sample_coord) {
+		/* Compute cu. */
+		float au = randu * S + k;
+		float fu = (cosf(au) * b0 - b1) / sinf(au);
+		float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
+		cu = clamp(cu, -1.0f, 1.0f);
+		/* Compute xu. */
+		float xu = -(cu * z0) / sqrtf(1.0f - cu * cu);
+		xu = clamp(xu, x0, x1);
+		/* Compute yv. */
+		float z0sq = z0 * z0;
+		float y0sq = y0 * y0;
+		float y1sq = y1 * y1;
+		float d = sqrtf(xu * xu + z0sq);
+		float h0 = y0 / sqrtf(d * d + y0sq);
+		float h1 = y1 / sqrtf(d * d + y1sq);
+		float hv = h0 + randv * (h1 - h0), hv2 = hv * hv;
+		float yv = (hv2 < 1.0f - 1e-6f) ? (hv * d) / sqrtf(1.0f - hv2) : y1;
+
+		/* Transform (xu, yv, z0) to world coords. */
+		*light_p = P + xu * x + yv * y + z0 * z;
+	}
+
+	/* return pdf */
+	if(S != 0.0f)
+		return 1.0f / S;
+	else
+		return 0.0f;
+}
+
 /* Background Light */
 
 #ifdef __BACKGROUND_MIS__
@@ -46,7 +138,7 @@ ccl_device_noinline
 #else
 ccl_device
 #endif
-float3 background_light_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
+float3 background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
 {
 	/* for the following, the CDF values are actually a pair of floats, with the
 	 * function value as X and the actual CDF as Y.  The last entry's function
@@ -116,10 +208,8 @@ float3 background_light_sample(KernelGlobals *kg, float randu, float randv, floa
 	else
 		*pdf = (cdf_u.x * cdf_v.x)/(M_2PI_F * M_PI_F * sin_theta * denom);
 
-	*pdf *= kernel_data.integrator.pdf_lights;
-
 	/* compute direction */
-	return -equirectangular_to_direction(u, v);
+	return equirectangular_to_direction(u, v);
 }
 
 /* TODO(sergey): Same as above, after the release we should consider using
@@ -130,7 +220,7 @@ ccl_device_noinline
 #else
 ccl_device
 #endif
-float background_light_pdf(KernelGlobals *kg, float3 direction)
+float background_map_pdf(KernelGlobals *kg, float3 direction)
 {
 	float2 uv = direction_to_equirectangular(direction);
 	int res = kernel_data.integrator.pdf_background_res;
@@ -156,9 +246,223 @@ float background_light_pdf(KernelGlobals *kg, float3 direction)
 	float2 cdf_u = kernel_tex_fetch(__light_background_conditional_cdf, index_v * (res + 1) + index_u);
 	float2 cdf_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v);
 
-	float pdf = (cdf_u.x * cdf_v.x)/(M_2PI_F * M_PI_F * sin_theta * denom);
+	return (cdf_u.x * cdf_v.x)/(M_2PI_F * M_PI_F * sin_theta * denom);
+}
+
+ccl_device_inline bool background_portal_data_fetch_and_check_side(KernelGlobals *kg,
+                                                                   float3 P,
+                                                                   int index,
+                                                                   float3 *lightpos,
+                                                                   float3 *dir)
+{
+	float4 data0 = kernel_tex_fetch(__light_data, (index + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 0);
+	float4 data3 = kernel_tex_fetch(__light_data, (index + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 3);
+
+	*lightpos = make_float3(data0.y, data0.z, data0.w);
+	*dir = make_float3(data3.y, data3.z, data3.w);
+
+	/* Check whether portal is on the right side. */
+	if(dot(*dir, P - *lightpos) > 1e-5f)
+		return true;
+
+	return false;
+}
+
+ccl_device float background_portal_pdf(KernelGlobals *kg,
+                                       float3 P,
+                                       float3 direction,
+                                       int ignore_portal,
+                                       bool *is_possible)
+{
+	float portal_pdf = 0.0f;
+
+	for(int p = 0; p < kernel_data.integrator.num_portals; p++) {
+		if(p == ignore_portal)
+			continue;
+
+		float3 lightpos, dir;
+		if(!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
+			continue;
+
+		if(is_possible) {
+			/* There's a portal that could be sampled from this position. */
+			*is_possible = true;
+		}
+
+		float t = -(dot(P, dir) - dot(lightpos, dir)) / dot(direction, dir);
+		if(t <= 1e-5f) {
+			/* Either behind the portal or too close. */
+			continue;
+		}
 
-	return pdf * kernel_data.integrator.pdf_lights;
+		float4 data1 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 1);
+		float4 data2 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 2);
+
+		float3 axisu = make_float3(data1.y, data1.z, data1.w);
+		float3 axisv = make_float3(data2.y, data2.z, data2.w);
+
+		float3 hit = P + t*direction;
+		float3 inplane = hit - lightpos;
+		/* Skip if the the ray doesn't pass through portal. */
+		if(fabsf(dot(inplane, axisu) / dot(axisu, axisu)) > 0.5f)
+			continue;
+		if(fabsf(dot(inplane, axisv) / dot(axisv, axisv)) > 0.5f)
+			continue;
+
+		portal_pdf += area_light_sample(P, &lightpos, axisu, axisv, 0.0f, 0.0f, false);
+	}
+
+	return kernel_data.integrator.num_portals? portal_pdf / kernel_data.integrator.num_portals: 0.0f;
+}
+
+ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P)
+{
+	int num_possible_portals = 0;
+	for(int p = 0; p < kernel_data.integrator.num_portals; p++) {
+		float3 lightpos, dir;
+		if(background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
+			num_possible_portals++;
+	}
+	return num_possible_portals;
+}
+
+ccl_device float3 background_portal_sample(KernelGlobals *kg,
+                                           float3 P,
+                                           float randu,
+                                           float randv,
+                                           int num_possible,
+                                           int *sampled_portal,
+                                           float *pdf)
+{
+	/* Pick a portal, then re-normalize randv. */
+	randv *= num_possible;
+	int portal = (int)randv;
+	randv -= portal;
+
+	/* TODO(sergey): Some smarter way of finding portal to sample
+	 * is welcome.
+	 */
+	for(int p = 0; p < kernel_data.integrator.num_portals; p++) {
+		/* Search for the sampled portal. */
+		float3 lightpos, dir;
+		if(!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
+			continue;
+
+		if(portal == 0) {
+			/* p is the portal to be sampled. */
+			float4 data1 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 1);
+			float4 data2 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 2);
+			float3 axisu = make_float3(data1.y, data1.z, data1.w);
+			float3 axisv = make_float3(data2.y, data2.z, data2.w);
+
+			*pdf = area_light_sample(P, &lightpos,
+			                         axisu, axisv,
+			                         randu, randv,
+			                         true);
+
+			*pdf /= num_possible;
+			*sampled_portal = p;
+			return normalize(lightpos - P);
+		}
+
+		portal--;
+	}
+
+	return make_float3(0.0f, 0.0f, 0.0f);
+}
+
+ccl_device float3 background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf)
+{
+	/* Probability of sampling portals instead of the map. */
+	float portal_sampling_pdf = kernel_data.integrator.portal_pdf;
+
+	/* Check if there are portals in the scene which we can sample. */
+	if(portal_sampling_pdf > 0.0f) {
+		int num_portals = background_num_possible_portals(kg, P);
+		if(num_portals > 0) {
+			if(portal_sampling_pdf == 1.0f || randu < portal_sampling_pdf) {
+				if(portal_sampling_pdf < 1.0f) {
+					randu /= portal_sampling_pdf;
+				}
+				int portal;
+				float3 D = background_portal_sample(kg, P, randu, randv, num_portals, &portal, pdf);
+				if(num_portals > 1) {
+					/* Ignore the chosen portal, its pdf is already included. */
+					*pdf += background_portal_pdf(kg, P, D, portal, NULL);
+				}
+				/* We could also have sampled the map, so combine with MIS. */
+				if(portal_sampling_pdf < 1.0f) {
+					float cdf_pdf = background_map_pdf(kg, D);
+					*pdf = (portal_sampling_pdf * (*pdf)
+					     + (1.0f - portal_sampling_pdf) * cdf_pdf);
+				}
+				return D;
+			} else {
+				/* Sample map, but with nonzero portal_sampling_pdf for MIS. */
+				randu = (randu - portal_sampling_pdf) / (1.0f - portal_sampling_pdf);
+			}
+		} else {
+			/* We can't sample a portal.
+			 * Check if we can sample the map instead.
+			 */
+			if(portal_sampling_pdf == 1.0f) {
+				/* Use uniform as a fallback if we can't sample the map. */
+				*pdf = 1.0f / M_4PI_F;
+				return sample_uniform_sphere(randu, randv);
+			}
+			else {
+				portal_sampling_pdf = 0.0f;
+			}
+		}
+	}
+
+	float3 D = background_map_sample(kg, randu, randv, pdf);
+	/* Use MIS if portals could be sampled as well. */
+	if(portal_sampling_pdf > 0.0f) {
+		float portal_pdf = background_portal_pdf(kg, P, D, -1, NULL);
+		*pdf = (portal_sampling_pdf * portal_pdf
+		     + (1.0f - portal_sampling_pdf) * (*pdf));
+	}
+	return D;
+}
+
+ccl_device float background_light_pdf(KernelGlobals *kg, float3 P, float3 direction)
+{
+	/* Probability of sampling portals instead of the map. */
+	float portal_sampling_pdf = kernel_data.integrator.portal_pdf;
+
+	if(portal_sampling_pdf > 0.0f) {
+		bool is_possible = false;
+		float portal_pdf = background_portal_pdf(kg, P, direction, -1, &is_possible);
+		if(portal_pdf == 0.0f) {
+			if(portal_sampling_pdf == 1.0f) {
+				/* If there are no possible portals at this point,
+				 * the fallback sampling would have been used.
+				 * Otherwise, the direction would not be sampled at all => pdf = 0
+				 */
+				return is_possible? 0.0f: kernel_data.integrator.pdf_lights / M_4PI_F;
+			}
+			else {
+				/* We can only sample the map. */
+				return background_map_pdf(kg, direction) * kernel_data.integrator.pdf_lights;
+			}
+		} else {
+			if(portal_sampling_pdf == 1.0f) {
+				/* We can only sample portals. */
+				return portal_pdf * kernel_data.integrator.pdf_lights;
+			}
+			else {
+				/* We can sample both, so combine with MIS. */
+				return (background_map_pdf(kg, direction) * (1.0f - portal_sampling_pdf)
+				      + portal_pdf * portal_sampling_pdf) * kernel_data.integrator.pdf_lights;
+			}
+		}
+	}
+
+	/* No portals in the scene, so must sample the map.
+	 * At least one of them is always possible if we have a LIGHT_BACKGROUND.
+	 */
+	return background_map_pdf(kg, direction) * kernel_data.integrator.pdf_lights;
 }
 #endif
 
@@ -184,96 +488,6 @@ ccl_device float3 sphere_light_sample(float3 P, float3 center, float radius, flo
 	return disk_light_sample(normalize(P - center), randu, randv)*radius;
 }
 
-/* Uses the following paper:
- *
- * Carlos Urena et al.
- * An Area-Preserving Parametrization for Spherical Rectangles.
- *
- * https://www.solidangle.com/research/egsr2013_spherical_rectangle.pdf
- *
- * Note: light_p is modified when sample_coord is true.
- */
-ccl_device float area_light_sample(float3 P,
-                                   float3 *light_p,
-                                   float3 axisu, float3 axisv,
-                                   float randu, float randv,
-                                   bool sample_coord)
-{
-	/* In our name system we're using P for the center,
-	 * which is o in the paper.
-	 */
-
-	float3 corner = *light_p - axisu * 0.5f - axisv * 0.5f;
-	float axisu_len, axisv_len;
-	/* Compute local reference system R. */
-	float3 x = normalize_len(axisu, &axisu_len);
-	float3 y = normalize_len(axisv, &axisv_len);
-	float3 z = cross(x, y);
-	/* Compute rectangle coords in local reference system. */
-	float3 dir = corner - P;
-	float z0 = dot(dir, z);
-	/* Flip 'z' to make it point against Q. */
-	if(z0 > 0.0f) {
-		z *= -1.0f;
-		z0 *= -1.0f;
-	}
-	float x0 = dot(dir, x);
-	float y0 = dot(dir, y);
-	float x1 = x0 + axisu_len;
-	float y1 = y0 + axisv_len;
-	/* Create vectors to four vertices. */
-	float3 v00 = make_float3(x0, y0, z0);
-	float3 v01 = make_float3(x0, y1, z0);
-	float3 v10 = make_float3(x1, y0, z0);
-	float3 v11 = make_float3(x1, y1, z0);
-	/* Compute normals to edges. */
-	float3 n0 = normalize(cross(v00, v10));
-	float3 n1 = normalize(cross(v10, v11));
-	float3 n2 = normalize(cross(v11, v01));
-	float3 n3 = normalize(cross(v01, v00));
-	/* Compute internal angles (gamma_i). */
-	float g0 = safe_acosf(-dot(n0, n1));
-	float g1 = safe_acosf(-dot(n1, n2));
-	float g2 = safe_acosf(-dot(n2, n3));
-	float g3 = safe_acosf(-dot(n3, n0));
-	/* Compute predefined constants. */
-	float b0 = n0.z;
-	float b1 = n2.z;
-	float b0sq = b0 * b0;
-	float k = M_2PI_F - g2 - g3;
-	/* Compute solid angle from internal angles. */
-	float S = g0 + g1 - k;
-
-	if(sample_coord) {
-		/* Compute cu. */
-		float au = randu * S + k;
-		float fu = (cosf(au) * b0 - b1) / sinf(au);
-		float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
-		cu = clamp(cu, -1.0f, 1.0f);
-		/* Compute xu. */
-		float xu = -(cu * z0) / sqrtf(1.0f - cu * cu);
-		xu = clamp(xu, x0, x1);
-		/* Compute yv. */
-		float z0sq = z0 * z0;
-		float y0sq = y0 * y0;
-		float y1sq = y1 * y1;
-		float d = sqrtf(xu * xu + z0sq);
-		float h0 = y0 / sqrtf(d * d + y0sq);
-		float h1 = y1 / sqrtf(d * d + y1sq);
-		float hv = h0 + randv * (h1 - h0), hv2 = hv * hv;
-		float yv = (hv2 < 1.0f - 1e-6f) ? (hv * d) / sqrtf(1.0f - hv2) : y1;
-
-		/* Transform (xu, yv, z0) to world coords. */
-		*light_p = P + xu * x + yv * y + z0 * z;
-	}
-
-	/* return pdf */
-	if(S != 0.0f)
-		return 1.0f / S;
-	else
-		return 0.0f;
-}
-
 ccl_device float spot_light_attenuation(float4 data1, float4 data2, LightSample *ls)
 {
 	float3 dir = make_float3(data2.y, data2.z, data2.w);
@@ -344,13 +558,14 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
 #ifdef __BACKGROUND_MIS__
 	else if(type == LIGHT_BACKGROUND) {
 		/* infinite area light (e.g. light dome or env light) */
-		float3 D = background_light_sample(kg, randu, randv, &ls->pdf);
+		float3 D = -background_light_sample(kg, P, randu, randv, &ls->pdf);
 
 		ls->P = D;
 		ls->Ng = D;
 		ls->D = -D;
 		ls->t = FLT_MAX;
 		ls->eval_fac = 1.0f;
+		ls->pdf *= kernel_data.integrator.pdf_lights;
 	}
 #endif
 	else {
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 6bb39ee485d..20cf3fa931b 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -19,23 +19,49 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value)
 {
 	ccl_global float *buf = buffer;
+#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+	atomic_add_float(buf, value);
+#else
 	*buf = (sample == 0)? value: *buf + value;
+#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
 }
 
 ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value)
 {
+#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+	ccl_global float *buf_x = buffer + 0;
+	ccl_global float *buf_y = buffer + 1;
+	ccl_global float *buf_z = buffer + 2;
+
+	atomic_add_float(buf_x, value.x);
+	atomic_add_float(buf_y, value.y);
+	atomic_add_float(buf_z, value.z);
+#else
 	ccl_global float3 *buf = (ccl_global float3*)buffer;
 	*buf = (sample == 0)? value: *buf + value;
+#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
 }
 
 ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value)
 {
+#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+	ccl_global float *buf_x = buffer + 0;
+	ccl_global float *buf_y = buffer + 1;
+	ccl_global float *buf_z = buffer + 2;
+	ccl_global float *buf_w = buffer + 3;
+
+	atomic_add_float(buf_x, value.x);
+	atomic_add_float(buf_y, value.y);
+	atomic_add_float(buf_z, value.z);
+	atomic_add_float(buf_w, value.w);
+#else
 	ccl_global float4 *buf = (ccl_global float4*)buffer;
 	*buf = (sample == 0)? value: *buf + value;
+#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
 }
 
 ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
-	ShaderData *sd, int sample, PathState *state, float3 throughput)
+	ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
 {
 #ifdef __PASSES__
 	int path_flag = state->flag;
@@ -49,18 +75,18 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 		return;
 	
 	if(!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) {
-		if(!(sd->flag & SD_TRANSPARENT) ||
+		if(!(ccl_fetch(sd, flag) & SD_TRANSPARENT) ||
 		   kernel_data.film.pass_alpha_threshold == 0.0f ||
 		   average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold)
 		{
 
 			if(sample == 0) {
 				if(flag & PASS_DEPTH) {
-					float depth = camera_distance(kg, sd->P);
+					float depth = camera_distance(kg, ccl_fetch(sd, P));
 					kernel_write_pass_float(buffer + kernel_data.film.pass_depth, sample, depth);
 				}
 				if(flag & PASS_OBJECT_ID) {
-					float id = object_pass_id(kg, sd->object);
+					float id = object_pass_id(kg, ccl_fetch(sd, object));
 					kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, sample, id);
 				}
 				if(flag & PASS_MATERIAL_ID) {
@@ -70,7 +96,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 			}
 
 			if(flag & PASS_NORMAL) {
-				float3 normal = sd->N;
+				float3 normal = ccl_fetch(sd, N);
 				kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal);
 			}
 			if(flag & PASS_UV) {
@@ -101,8 +127,8 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 		float mist_start = kernel_data.film.mist_start;
 		float mist_inv_depth = kernel_data.film.mist_inv_depth;
 
-		float depth = camera_distance(kg, sd->P);
-		float mist = clamp((depth - mist_start)*mist_inv_depth, 0.0f, 1.0f);
+		float depth = camera_distance(kg, ccl_fetch(sd, P));
+		float mist = saturate((depth - mist_start)*mist_inv_depth);
 
 		/* falloff */
 		float mist_falloff = kernel_data.film.mist_falloff;
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 9b9495644dd..9794ad1d180 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -42,6 +42,7 @@
 #include "kernel_path_state.h"
 #include "kernel_shadow.h"
 #include "kernel_emission.h"
+#include "kernel_path_common.h"
 #include "kernel_path_surface.h"
 #include "kernel_path_volume.h"
 
@@ -273,8 +274,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 				float bssrdf_u, bssrdf_v;
 				path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
 				subsurface_scatter_step(kg, &sd, state.flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
-
-				state.flag |= PATH_RAY_BSSRDF_ANCESTOR;
 			}
 		}
 #endif
@@ -307,17 +306,17 @@ ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *
 
 	sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
 
-	if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
+	if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
 		Ray light_ray;
 		float3 ao_shadow;
 
-		light_ray.P = ray_offset(sd->P, sd->Ng);
+		light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
 		light_ray.D = ao_D;
 		light_ray.t = kernel_data.background.ao_distance;
 #ifdef __OBJECT_MOTION__
-		light_ray.time = sd->time;
+		light_ray.time = ccl_fetch(sd, time);
 #endif
-		light_ray.dP = sd->dP;
+		light_ray.dP = ccl_fetch(sd, dP);
 		light_ray.dD = differential3_zero();
 
 		if(!shadow_blocked(kg, state, &light_ray, &ao_shadow))
@@ -325,70 +324,8 @@ ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *
 	}
 }
 
-ccl_device void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, float3 throughput)
-{
-	int num_samples = kernel_data.integrator.ao_samples;
-	float num_samples_inv = 1.0f/num_samples;
-	float ao_factor = kernel_data.background.ao_factor;
-	float3 ao_N;
-	float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
-	float3 ao_alpha = shader_bsdf_alpha(kg, sd);
-
-	for(int j = 0; j < num_samples; j++) {
-		float bsdf_u, bsdf_v;
-		path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
-		float3 ao_D;
-		float ao_pdf;
-
-		sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
-		if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
-			Ray light_ray;
-			float3 ao_shadow;
-
-			light_ray.P = ray_offset(sd->P, sd->Ng);
-			light_ray.D = ao_D;
-			light_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
-			light_ray.time = sd->time;
-#endif
-			light_ray.dP = sd->dP;
-			light_ray.dD = differential3_zero();
-
-			if(!shadow_blocked(kg, state, &light_ray, &ao_shadow))
-				path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
-		}
-	}
-}
-
 #ifdef __SUBSURFACE__
 
-#ifdef __VOLUME__
-ccl_device void kernel_path_subsurface_update_volume_stack(KernelGlobals *kg,
-                                                           Ray *ray,
-                                                           VolumeStack *stack)
-{
-	kernel_assert(kernel_data.integrator.use_volumes);
-
-	Ray volume_ray = *ray;
-	Intersection isect;
-	int step = 0;
-	while(step < VOLUME_STACK_SIZE &&
-	      scene_intersect_volume(kg, &volume_ray, &isect))
-	{
-		ShaderData sd;
-		shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0);
-		kernel_volume_stack_enter_exit(kg, &sd, stack);
-
-		/* Move ray forward. */
-		volume_ray.P = ray_offset(sd.P, -sd.Ng);
-		volume_ray.t -= sd.ray_length;
-		++step;
-	}
-}
-#endif
-
 ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, Ray *ray, float3 *throughput)
 {
 	float bssrdf_probability;
@@ -408,7 +345,7 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
 #ifdef __VOLUME__
 		Ray volume_ray = *ray;
 		bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
-		                                sd->flag & SD_OBJECT_INTERSECTS_VOLUME;
+		                                ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
 #endif
 
 		/* compute lighting with the BSDF closure */
@@ -417,7 +354,6 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
 			PathState hit_state = *state;
 			Ray hit_ray = *ray;
 
-			hit_state.flag |= PATH_RAY_BSSRDF_ANCESTOR;
 			hit_state.rng_offset += PRNG_BOUNCE_NUM;
 			
 			kernel_path_surface_connect_light(kg, rng, &bssrdf_sd[hit], tp, state, L);
@@ -433,7 +369,7 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
 					volume_ray.D = normalize_len(hit_ray.P - volume_ray.P,
 					                             &volume_ray.t);
 
-					kernel_path_subsurface_update_volume_stack(
+					kernel_volume_stack_update_for_subsurface(
 					    kg,
 					    &volume_ray,
 					    hit_state.volume_stack);
@@ -503,7 +439,9 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 #ifdef __KERNEL_DEBUG__
 		if(state.flag & PATH_RAY_CAMERA) {
 			debug_data.num_bvh_traversal_steps += isect.num_traversal_steps;
+			debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
 		}
+		debug_data.num_ray_bounces++;
 #endif
 
 #ifdef __LAMP_MIS__
@@ -733,474 +671,6 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
 }
 
-#ifdef __BRANCHED_PATH__
-
-/* branched path tracing: bounce off surface and integrate indirect light */
-ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
-	RNG *rng, ShaderData *sd, float3 throughput, float num_samples_adjust,
-	PathState *state, PathRadiance *L)
-{
-	for(int i = 0; i< sd->num_closure; i++) {
-		const ShaderClosure *sc = &sd->closure[i];
-
-		if(!CLOSURE_IS_BSDF(sc->type))
-			continue;
-		/* transparency is not handled here, but in outer loop */
-		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
-			continue;
-
-		int num_samples;
-
-		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
-			num_samples = kernel_data.integrator.diffuse_samples;
-		else if(CLOSURE_IS_BSDF_BSSRDF(sc->type))
-			num_samples = 1;
-		else if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
-			num_samples = kernel_data.integrator.glossy_samples;
-		else
-			num_samples = kernel_data.integrator.transmission_samples;
-
-		num_samples = ceil_to_int(num_samples_adjust*num_samples);
-
-		float num_samples_inv = num_samples_adjust/num_samples;
-		RNG bsdf_rng = cmj_hash(*rng, i);
-
-		for(int j = 0; j < num_samples; j++) {
-			PathState ps = *state;
-			float3 tp = throughput;
-			Ray bsdf_ray;
-
-			if(!kernel_branched_path_surface_bounce(kg, &bsdf_rng, sd, sc, j, num_samples, &tp, &ps, L, &bsdf_ray))
-				continue;
-
-			kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L);
-
-			/* for render passes, sum and reset indirect light pass variables
-			 * for the next samples */
-			path_radiance_sum_indirect(L);
-			path_radiance_reset_indirect(L);
-		}
-	}
-}
-
-#ifdef __SUBSURFACE__
-ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
-                                                        ShaderData *sd,
-                                                        PathRadiance *L,
-                                                        PathState *state,
-                                                        RNG *rng,
-                                                        Ray *ray,
-                                                        float3 throughput)
-{
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
-
-		if(!CLOSURE_IS_BSSRDF(sc->type))
-			continue;
-
-		/* set up random number generator */
-		uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
-		int num_samples = kernel_data.integrator.subsurface_samples;
-		float num_samples_inv = 1.0f/num_samples;
-		RNG bssrdf_rng = cmj_hash(*rng, i);
-
-		state->flag |= PATH_RAY_BSSRDF_ANCESTOR;
-
-		/* do subsurface scatter step with copy of shader data, this will
-		 * replace the BSSRDF with a diffuse BSDF closure */
-		for(int j = 0; j < num_samples; j++) {
-			ShaderData bssrdf_sd[BSSRDF_MAX_HITS];
-			float bssrdf_u, bssrdf_v;
-			path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-			int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
-#ifdef __VOLUME__
-			Ray volume_ray = *ray;
-			bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
-			                                sd->flag & SD_OBJECT_INTERSECTS_VOLUME;
-#endif
-
-			/* compute lighting with the BSDF closure */
-			for(int hit = 0; hit < num_hits; hit++) {
-				PathState hit_state = *state;
-
-				path_state_branch(&hit_state, j, num_samples);
-
-#ifdef __VOLUME__
-				if(need_update_volume_stack) {
-					/* Setup ray from previous surface point to the new one. */
-					float3 P = ray_offset(bssrdf_sd[hit].P, -bssrdf_sd[hit].Ng);
-					volume_ray.D = normalize_len(P - volume_ray.P,
-					                             &volume_ray.t);
-
-					kernel_path_subsurface_update_volume_stack(
-					    kg,
-					    &volume_ray,
-					    hit_state.volume_stack);
-
-					/* Move volume ray forward. */
-					volume_ray.P = P;
-				}
-#endif
-
-#if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
-				/* direct light */
-				if(kernel_data.integrator.use_direct_light) {
-					bool all = kernel_data.integrator.sample_all_lights_direct;
-					kernel_branched_path_surface_connect_light(kg, rng,
-						&bssrdf_sd[hit], &hit_state, throughput, num_samples_inv, L, all);
-				}
-#endif
-
-				/* indirect light */
-				kernel_branched_path_surface_indirect_light(kg, rng,
-					&bssrdf_sd[hit], throughput, num_samples_inv,
-					&hit_state, L);
-			}
-		}
-
-		state->flag &= ~PATH_RAY_BSSRDF_ANCESTOR;
-	}
-}
-#endif
-
-ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
-{
-	/* initialize */
-	PathRadiance L;
-	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-	float L_transparent = 0.0f;
-
-	path_radiance_init(&L, kernel_data.film.use_light_pass);
-
-	PathState state;
-	path_state_init(kg, &state, rng, sample, &ray);
-
-#ifdef __KERNEL_DEBUG__
-	DebugData debug_data;
-	debug_data_init(&debug_data);
-#endif
-
-	for(;;) {
-		/* intersect scene */
-		Intersection isect;
-		uint visibility = path_state_ray_visibility(kg, &state);
-
-#ifdef __HAIR__
-		float difl = 0.0f, extmax = 0.0f;
-		uint lcg_state = 0;
-
-		if(kernel_data.bvh.have_curves) {
-			if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {	
-				float3 pixdiff = ray.dD.dx + ray.dD.dy;
-				/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
-				difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
-			}
-
-			extmax = kernel_data.curve.maximum_width;
-			lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
-		}
-
-		bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax);
-#else
-		bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#endif
-
-#ifdef __KERNEL_DEBUG__
-		if(state.flag & PATH_RAY_CAMERA) {
-			debug_data.num_bvh_traversal_steps += isect.num_traversal_steps;
-		}
-#endif
-
-#ifdef __VOLUME__
-		/* volume attenuation, emission, scatter */
-		if(state.volume_stack[0].shader != SHADER_NONE) {
-			Ray volume_ray = ray;
-			volume_ray.t = (hit)? isect.t: FLT_MAX;
-			
-			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
-
-#ifdef __VOLUME_DECOUPLED__
-			/* decoupled ray marching only supported on CPU */
-
-			/* cache steps along volume for repeated sampling */
-			VolumeSegment volume_segment;
-			ShaderData volume_sd;
-
-			shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
-			kernel_volume_decoupled_record(kg, &state,
-				&volume_ray, &volume_sd, &volume_segment, heterogeneous);
-
-			/* direct light sampling */
-			if(volume_segment.closure_flag & SD_SCATTER) {
-				volume_segment.sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
-
-				bool all = kernel_data.integrator.sample_all_lights_direct;
-
-				kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
-					throughput, &state, &L, all, &volume_ray, &volume_segment);
-
-				/* indirect light sampling */
-				int num_samples = kernel_data.integrator.volume_samples;
-				float num_samples_inv = 1.0f/num_samples;
-
-				for(int j = 0; j < num_samples; j++) {
-					/* workaround to fix correlation bug in T38710, can find better solution
-					 * in random number generator later, for now this is done here to not impact
-					 * performance of rendering without volumes */
-					RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
-
-					PathState ps = state;
-					Ray pray = ray;
-					float3 tp = throughput;
-
-					/* branch RNG state */
-					path_state_branch(&ps, j, num_samples);
-
-					/* scatter sample. if we use distance sampling and take just one
-					 * sample for direct and indirect light, we could share this
-					 * computation, but makes code a bit complex */
-					float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE);
-					float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE);
-
-					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
-						&ps, &pray, &volume_sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
-						
-					(void)result;
-					kernel_assert(result == VOLUME_PATH_SCATTERED);
-
-					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
-						kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
-
-						/* for render passes, sum and reset indirect light pass variables
-						 * for the next samples */
-						path_radiance_sum_indirect(&L);
-						path_radiance_reset_indirect(&L);
-					}
-				}
-			}
-
-			/* emission and transmittance */
-			if(volume_segment.closure_flag & SD_EMISSION)
-				path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
-			throughput *= volume_segment.accum_transmittance;
-
-			/* free cached steps */
-			kernel_volume_decoupled_free(kg, &volume_segment);
-#else
-			/* GPU: no decoupled ray marching, scatter probalistically */
-			int num_samples = kernel_data.integrator.volume_samples;
-			float num_samples_inv = 1.0f/num_samples;
-
-			/* todo: we should cache the shader evaluations from stepping
-			 * through the volume, for now we redo them multiple times */
-
-			for(int j = 0; j < num_samples; j++) {
-				PathState ps = state;
-				Ray pray = ray;
-				ShaderData volume_sd;
-				float3 tp = throughput * num_samples_inv;
-
-				/* branch RNG state */
-				path_state_branch(&ps, j, num_samples);
-
-				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &ps, &volume_sd, &volume_ray, &L, &tp, rng, heterogeneous);
-				
-#ifdef __VOLUME_SCATTER__
-				if(result == VOLUME_PATH_SCATTERED) {
-					/* todo: support equiangular, MIS and all light sampling.
-					 * alternatively get decoupled ray marching working on the GPU */
-					kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L);
-
-					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
-						kernel_path_indirect(kg, rng, pray, tp, num_samples, ps, &L);
-
-						/* for render passes, sum and reset indirect light pass variables
-						 * for the next samples */
-						path_radiance_sum_indirect(&L);
-						path_radiance_reset_indirect(&L);
-					}
-				}
-#endif
-			}
-
-			/* todo: avoid this calculation using decoupled ray marching */
-			kernel_volume_shadow(kg, &state, &volume_ray, &throughput);
-#endif
-		}
-#endif
-
-		if(!hit) {
-			/* eval background shader if nothing hit */
-			if(kernel_data.background.transparent) {
-				L_transparent += average(throughput);
-
-#ifdef __PASSES__
-				if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif
-					break;
-			}
-
-#ifdef __BACKGROUND__
-			/* sample background shader */
-			float3 L_background = indirect_background(kg, &state, &ray);
-			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
-#endif
-
-			break;
-		}
-
-		/* setup shading */
-		ShaderData sd;
-		shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce);
-		shader_eval_surface(kg, &sd, 0.0f, state.flag, SHADER_CONTEXT_MAIN);
-		shader_merge_closures(&sd);
-
-		/* holdout */
-#ifdef __HOLDOUT__
-		if(sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK)) {
-			if(kernel_data.background.transparent) {
-				float3 holdout_weight;
-				
-				if(sd.flag & SD_HOLDOUT_MASK)
-					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
-				else
-					holdout_weight = shader_holdout_eval(kg, &sd);
-
-				/* any throughput is ok, should all be identical here */
-				L_transparent += average(holdout_weight*throughput);
-			}
-
-			if(sd.flag & SD_HOLDOUT_MASK)
-				break;
-		}
-#endif
-
-		/* holdout mask objects do not write data passes */
-		kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
-
-#ifdef __EMISSION__
-		/* emission */
-		if(sd.flag & SD_EMISSION) {
-			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
-			path_radiance_accum_emission(&L, throughput, emission, state.bounce);
-		}
-#endif
-
-		/* transparency termination */
-		if(state.flag & PATH_RAY_TRANSPARENT) {
-			/* path termination. this is a strange place to put the termination, it's
-			 * mainly due to the mixed in MIS that we use. gives too many unneeded
-			 * shader evaluations, only need emission if we are going to terminate */
-			float probability = path_state_terminate_probability(kg, &state, throughput);
-
-			if(probability == 0.0f) {
-				break;
-			}
-			else if(probability != 1.0f) {
-				float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
-
-				if(terminate >= probability)
-					break;
-
-				throughput /= probability;
-			}
-		}
-
-#ifdef __AO__
-		/* ambient occlusion */
-		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
-			kernel_branched_path_ao(kg, &sd, &L, &state, rng, throughput);
-		}
-#endif
-
-#ifdef __SUBSURFACE__
-		/* bssrdf scatter to a different location on the same object */
-		if(sd.flag & SD_BSSRDF) {
-			kernel_branched_path_subsurface_scatter(kg, &sd, &L, &state,
-			                                        rng, &ray, throughput);
-		}
-#endif
-
-		if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
-			PathState hit_state = state;
-
-#ifdef __EMISSION__
-			/* direct light */
-			if(kernel_data.integrator.use_direct_light) {
-				bool all = kernel_data.integrator.sample_all_lights_direct;
-				kernel_branched_path_surface_connect_light(kg, rng,
-					&sd, &hit_state, throughput, 1.0f, &L, all);
-			}
-#endif
-
-			/* indirect light */
-			kernel_branched_path_surface_indirect_light(kg, rng,
-				&sd, throughput, 1.0f, &hit_state, &L);
-
-			/* continue in case of transparency */
-			throughput *= shader_bsdf_transparency(kg, &sd);
-
-			if(is_zero(throughput))
-				break;
-		}
-
-		path_state_next(kg, &state, LABEL_TRANSPARENT);
-		ray.P = ray_offset(sd.P, -sd.Ng);
-		ray.t -= sd.ray_length; /* clipping works through transparent */
-
-
-#ifdef __RAY_DIFFERENTIALS__
-		ray.dP = sd.dP;
-		ray.dD.dx = -sd.dI.dx;
-		ray.dD.dy = -sd.dI.dy;
-#endif
-
-#ifdef __VOLUME__
-		/* enter/exit volume */
-		kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
-#endif
-	}
-
-	float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
-
-	kernel_write_light_passes(kg, buffer, &L, sample);
-
-#ifdef __KERNEL_DEBUG__
-	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
-#endif
-
-	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
-}
-
-#endif
-
-ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int x, int y, RNG *rng, Ray *ray)
-{
-	float filter_u;
-	float filter_v;
-
-	int num_samples = kernel_data.integrator.aa_samples;
-
-	path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v);
-
-	/* sample camera ray */
-
-	float lens_u = 0.0f, lens_v = 0.0f;
-
-	if(kernel_data.cam.aperturesize > 0.0f)
-		path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
-
-	float time = 0.0f;
-
-#ifdef __CAMERA_MOTION__
-	if(kernel_data.cam.shuttertime != -1.0f)
-		time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME);
-#endif
-
-	camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
-}
-
 ccl_device void kernel_path_trace(KernelGlobals *kg,
 	ccl_global float *buffer, ccl_global uint *rng_state,
 	int sample, int x, int y, int offset, int stride)
@@ -1232,38 +702,5 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
 	path_rng_end(kg, rng_state, rng);
 }
 
-#ifdef __BRANCHED_PATH__
-ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
-	ccl_global float *buffer, ccl_global uint *rng_state,
-	int sample, int x, int y, int offset, int stride)
-{
-	/* buffer offset */
-	int index = offset + x + y*stride;
-	int pass_stride = kernel_data.film.pass_stride;
-
-	rng_state += index;
-	buffer += index*pass_stride;
-
-	/* initialize random numbers and ray */
-	RNG rng;
-	Ray ray;
-
-	kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
-
-	/* integrate */
-	float4 L;
-
-	if(ray.t != 0.0f)
-		L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer);
-	else
-		L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
-	/* accumulate result in output buffer */
-	kernel_write_pass_float4(buffer, sample, L);
-
-	path_rng_end(kg, rng_state, rng);
-}
-#endif
-
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
new file mode 100644
index 00000000000..b6d64985f6a
--- /dev/null
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -0,0 +1,534 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __BRANCHED_PATH__
+
+ccl_device void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, float3 throughput)
+{
+	int num_samples = kernel_data.integrator.ao_samples;
+	float num_samples_inv = 1.0f/num_samples;
+	float ao_factor = kernel_data.background.ao_factor;
+	float3 ao_N;
+	float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
+	float3 ao_alpha = shader_bsdf_alpha(kg, sd);
+
+	for(int j = 0; j < num_samples; j++) {
+		float bsdf_u, bsdf_v;
+		path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+
+		float3 ao_D;
+		float ao_pdf;
+
+		sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
+
+		if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+			Ray light_ray;
+			float3 ao_shadow;
+
+			light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+			light_ray.D = ao_D;
+			light_ray.t = kernel_data.background.ao_distance;
+#ifdef __OBJECT_MOTION__
+			light_ray.time = ccl_fetch(sd, time);
+#endif
+			light_ray.dP = ccl_fetch(sd, dP);
+			light_ray.dD = differential3_zero();
+
+			if(!shadow_blocked(kg, state, &light_ray, &ao_shadow))
+				path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+		}
+	}
+}
+
+
+/* bounce off surface and integrate indirect light */
+ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
+	RNG *rng, ShaderData *sd, float3 throughput, float num_samples_adjust,
+	PathState *state, PathRadiance *L)
+{
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		const ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
+
+		if(!CLOSURE_IS_BSDF(sc->type))
+			continue;
+		/* transparency is not handled here, but in outer loop */
+		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
+			continue;
+
+		int num_samples;
+
+		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
+			num_samples = kernel_data.integrator.diffuse_samples;
+		else if(CLOSURE_IS_BSDF_BSSRDF(sc->type))
+			num_samples = 1;
+		else if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
+			num_samples = kernel_data.integrator.glossy_samples;
+		else
+			num_samples = kernel_data.integrator.transmission_samples;
+
+		num_samples = ceil_to_int(num_samples_adjust*num_samples);
+
+		float num_samples_inv = num_samples_adjust/num_samples;
+		RNG bsdf_rng = cmj_hash(*rng, i);
+
+		for(int j = 0; j < num_samples; j++) {
+			PathState ps = *state;
+			float3 tp = throughput;
+			Ray bsdf_ray;
+
+			if(!kernel_branched_path_surface_bounce(kg, &bsdf_rng, sd, sc, j, num_samples, &tp, &ps, L, &bsdf_ray))
+				continue;
+
+			kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L);
+
+			/* for render passes, sum and reset indirect light pass variables
+			 * for the next samples */
+			path_radiance_sum_indirect(L);
+			path_radiance_reset_indirect(L);
+		}
+	}
+}
+
+#ifdef __SUBSURFACE__
+ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
+                                                        ShaderData *sd,
+                                                        PathRadiance *L,
+                                                        PathState *state,
+                                                        RNG *rng,
+                                                        Ray *ray,
+                                                        float3 throughput)
+{
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
+
+		if(!CLOSURE_IS_BSSRDF(sc->type))
+			continue;
+
+		/* set up random number generator */
+		uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
+		int num_samples = kernel_data.integrator.subsurface_samples;
+		float num_samples_inv = 1.0f/num_samples;
+		RNG bssrdf_rng = cmj_hash(*rng, i);
+
+		/* do subsurface scatter step with copy of shader data, this will
+		 * replace the BSSRDF with a diffuse BSDF closure */
+		for(int j = 0; j < num_samples; j++) {
+			ShaderData bssrdf_sd[BSSRDF_MAX_HITS];
+			float bssrdf_u, bssrdf_v;
+			path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+			int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
+#ifdef __VOLUME__
+			Ray volume_ray = *ray;
+			bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
+			                                ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
+#endif
+
+			/* compute lighting with the BSDF closure */
+			for(int hit = 0; hit < num_hits; hit++) {
+				PathState hit_state = *state;
+
+				path_state_branch(&hit_state, j, num_samples);
+
+#ifdef __VOLUME__
+				if(need_update_volume_stack) {
+					/* Setup ray from previous surface point to the new one. */
+					float3 P = ray_offset(bssrdf_sd[hit].P, -bssrdf_sd[hit].Ng);
+					volume_ray.D = normalize_len(P - volume_ray.P,
+					                             &volume_ray.t);
+
+					kernel_volume_stack_update_for_subsurface(
+					    kg,
+					    &volume_ray,
+					    hit_state.volume_stack);
+
+					/* Move volume ray forward. */
+					volume_ray.P = P;
+				}
+#endif
+
+#ifdef __EMISSION__
+				/* direct light */
+				if(kernel_data.integrator.use_direct_light) {
+					bool all = kernel_data.integrator.sample_all_lights_direct;
+					kernel_branched_path_surface_connect_light(kg, rng,
+						&bssrdf_sd[hit], &hit_state, throughput, num_samples_inv, L, all);
+				}
+#endif
+
+				/* indirect light */
+				kernel_branched_path_surface_indirect_light(kg, rng,
+					&bssrdf_sd[hit], throughput, num_samples_inv,
+					&hit_state, L);
+			}
+		}
+	}
+}
+#endif
+
+ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
+{
+	/* initialize */
+	PathRadiance L;
+	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+	float L_transparent = 0.0f;
+
+	path_radiance_init(&L, kernel_data.film.use_light_pass);
+
+	PathState state;
+	path_state_init(kg, &state, rng, sample, &ray);
+
+#ifdef __KERNEL_DEBUG__
+	DebugData debug_data;
+	debug_data_init(&debug_data);
+#endif
+
+	/* Main Loop
+	 * Here we only handle transparency intersections from the camera ray.
+	 * Indirect bounces are handled in kernel_branched_path_surface_indirect_light().
+	 */
+	for(;;) {
+		/* intersect scene */
+		Intersection isect;
+		uint visibility = path_state_ray_visibility(kg, &state);
+
+#ifdef __HAIR__
+		float difl = 0.0f, extmax = 0.0f;
+		uint lcg_state = 0;
+
+		if(kernel_data.bvh.have_curves) {
+			if(kernel_data.cam.resolution == 1) {
+				float3 pixdiff = ray.dD.dx + ray.dD.dy;
+				/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
+				difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
+			}
+
+			extmax = kernel_data.curve.maximum_width;
+			lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
+		}
+
+		bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax);
+#else
+		bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
+#endif
+
+#ifdef __KERNEL_DEBUG__
+		debug_data.num_bvh_traversal_steps += isect.num_traversal_steps;
+		debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
+		debug_data.num_ray_bounces++;
+#endif
+
+#ifdef __VOLUME__
+		/* volume attenuation, emission, scatter */
+		if(state.volume_stack[0].shader != SHADER_NONE) {
+			Ray volume_ray = ray;
+			volume_ray.t = (hit)? isect.t: FLT_MAX;
+			
+			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
+
+#ifdef __VOLUME_DECOUPLED__
+			/* decoupled ray marching only supported on CPU */
+
+			/* cache steps along volume for repeated sampling */
+			VolumeSegment volume_segment;
+			ShaderData volume_sd;
+
+			shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
+			kernel_volume_decoupled_record(kg, &state,
+				&volume_ray, &volume_sd, &volume_segment, heterogeneous);
+
+			/* direct light sampling */
+			if(volume_segment.closure_flag & SD_SCATTER) {
+				volume_segment.sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
+
+				bool all = kernel_data.integrator.sample_all_lights_direct;
+
+				kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
+					throughput, &state, &L, all, &volume_ray, &volume_segment);
+
+				/* indirect light sampling */
+				int num_samples = kernel_data.integrator.volume_samples;
+				float num_samples_inv = 1.0f/num_samples;
+
+				for(int j = 0; j < num_samples; j++) {
+					/* workaround to fix correlation bug in T38710, can find better solution
+					 * in random number generator later, for now this is done here to not impact
+					 * performance of rendering without volumes */
+					RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
+
+					PathState ps = state;
+					Ray pray = ray;
+					float3 tp = throughput;
+
+					/* branch RNG state */
+					path_state_branch(&ps, j, num_samples);
+
+					/* scatter sample. if we use distance sampling and take just one
+					 * sample for direct and indirect light, we could share this
+					 * computation, but makes code a bit complex */
+					float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE);
+					float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE);
+
+					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+						&ps, &pray, &volume_sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
+						
+					(void)result;
+					kernel_assert(result == VOLUME_PATH_SCATTERED);
+
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
+						kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
+
+						/* for render passes, sum and reset indirect light pass variables
+						 * for the next samples */
+						path_radiance_sum_indirect(&L);
+						path_radiance_reset_indirect(&L);
+					}
+				}
+			}
+
+			/* emission and transmittance */
+			if(volume_segment.closure_flag & SD_EMISSION)
+				path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+			throughput *= volume_segment.accum_transmittance;
+
+			/* free cached steps */
+			kernel_volume_decoupled_free(kg, &volume_segment);
+#else
+			/* GPU: no decoupled ray marching, scatter probalistically */
+			int num_samples = kernel_data.integrator.volume_samples;
+			float num_samples_inv = 1.0f/num_samples;
+
+			/* todo: we should cache the shader evaluations from stepping
+			 * through the volume, for now we redo them multiple times */
+
+			for(int j = 0; j < num_samples; j++) {
+				PathState ps = state;
+				Ray pray = ray;
+				ShaderData volume_sd;
+				float3 tp = throughput * num_samples_inv;
+
+				/* branch RNG state */
+				path_state_branch(&ps, j, num_samples);
+
+				VolumeIntegrateResult result = kernel_volume_integrate(
+					kg, &ps, &volume_sd, &volume_ray, &L, &tp, rng, heterogeneous);
+				
+#ifdef __VOLUME_SCATTER__
+				if(result == VOLUME_PATH_SCATTERED) {
+					/* todo: support equiangular, MIS and all light sampling.
+					 * alternatively get decoupled ray marching working on the GPU */
+					kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L);
+
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
+						kernel_path_indirect(kg, rng, pray, tp, num_samples, ps, &L);
+
+						/* for render passes, sum and reset indirect light pass variables
+						 * for the next samples */
+						path_radiance_sum_indirect(&L);
+						path_radiance_reset_indirect(&L);
+					}
+				}
+#endif
+			}
+
+			/* todo: avoid this calculation using decoupled ray marching */
+			kernel_volume_shadow(kg, &state, &volume_ray, &throughput);
+#endif
+		}
+#endif
+
+		if(!hit) {
+			/* eval background shader if nothing hit */
+			if(kernel_data.background.transparent) {
+				L_transparent += average(throughput);
+
+#ifdef __PASSES__
+				if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
+#endif
+					break;
+			}
+
+#ifdef __BACKGROUND__
+			/* sample background shader */
+			float3 L_background = indirect_background(kg, &state, &ray);
+			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
+#endif
+
+			break;
+		}
+
+		/* setup shading */
+		ShaderData sd;
+		shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce);
+		shader_eval_surface(kg, &sd, 0.0f, state.flag, SHADER_CONTEXT_MAIN);
+		shader_merge_closures(&sd);
+
+		/* holdout */
+#ifdef __HOLDOUT__
+		if(sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK)) {
+			if(kernel_data.background.transparent) {
+				float3 holdout_weight;
+				
+				if(sd.flag & SD_HOLDOUT_MASK)
+					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
+				else
+					holdout_weight = shader_holdout_eval(kg, &sd);
+
+				/* any throughput is ok, should all be identical here */
+				L_transparent += average(holdout_weight*throughput);
+			}
+
+			if(sd.flag & SD_HOLDOUT_MASK)
+				break;
+		}
+#endif
+
+		/* holdout mask objects do not write data passes */
+		kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
+
+#ifdef __EMISSION__
+		/* emission */
+		if(sd.flag & SD_EMISSION) {
+			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
+			path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+		}
+#endif
+
+		/* transparency termination */
+		if(state.flag & PATH_RAY_TRANSPARENT) {
+			/* path termination. this is a strange place to put the termination, it's
+			 * mainly due to the mixed in MIS that we use. gives too many unneeded
+			 * shader evaluations, only need emission if we are going to terminate */
+			float probability = path_state_terminate_probability(kg, &state, throughput);
+
+			if(probability == 0.0f) {
+				break;
+			}
+			else if(probability != 1.0f) {
+				float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
+
+				if(terminate >= probability)
+					break;
+
+				throughput /= probability;
+			}
+		}
+
+#ifdef __AO__
+		/* ambient occlusion */
+		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
+			kernel_branched_path_ao(kg, &sd, &L, &state, rng, throughput);
+		}
+#endif
+
+#ifdef __SUBSURFACE__
+		/* bssrdf scatter to a different location on the same object */
+		if(sd.flag & SD_BSSRDF) {
+			kernel_branched_path_subsurface_scatter(kg, &sd, &L, &state,
+			                                        rng, &ray, throughput);
+		}
+#endif
+
+		if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
+			PathState hit_state = state;
+
+#ifdef __EMISSION__
+			/* direct light */
+			if(kernel_data.integrator.use_direct_light) {
+				bool all = kernel_data.integrator.sample_all_lights_direct;
+				kernel_branched_path_surface_connect_light(kg, rng,
+					&sd, &hit_state, throughput, 1.0f, &L, all);
+			}
+#endif
+
+			/* indirect light */
+			kernel_branched_path_surface_indirect_light(kg, rng,
+				&sd, throughput, 1.0f, &hit_state, &L);
+
+			/* continue in case of transparency */
+			throughput *= shader_bsdf_transparency(kg, &sd);
+
+			if(is_zero(throughput))
+				break;
+		}
+
+		/* Update Path State */
+		state.flag |= PATH_RAY_TRANSPARENT;
+		state.transparent_bounce++;
+
+		ray.P = ray_offset(sd.P, -sd.Ng);
+		ray.t -= sd.ray_length; /* clipping works through transparent */
+
+
+#ifdef __RAY_DIFFERENTIALS__
+		ray.dP = sd.dP;
+		ray.dD.dx = -sd.dI.dx;
+		ray.dD.dy = -sd.dI.dy;
+#endif
+
+#ifdef __VOLUME__
+		/* enter/exit volume */
+		kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
+#endif
+	}
+
+	float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
+
+	kernel_write_light_passes(kg, buffer, &L, sample);
+
+#ifdef __KERNEL_DEBUG__
+	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
+#endif
+
+	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
+}
+
+ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
+	ccl_global float *buffer, ccl_global uint *rng_state,
+	int sample, int x, int y, int offset, int stride)
+{
+	/* buffer offset */
+	int index = offset + x + y*stride;
+	int pass_stride = kernel_data.film.pass_stride;
+
+	rng_state += index;
+	buffer += index*pass_stride;
+
+	/* initialize random numbers and ray */
+	RNG rng;
+	Ray ray;
+
+	kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
+
+	/* integrate */
+	float4 L;
+
+	if(ray.t != 0.0f)
+		L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer);
+	else
+		L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+	/* accumulate result in output buffer */
+	kernel_write_pass_float4(buffer, sample, L);
+
+	path_rng_end(kg, rng_state, rng);
+}
+
+#endif  /* __BRANCHED_PATH__ */
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h
new file mode 100644
index 00000000000..1912dfa16ed
--- /dev/null
+++ b/intern/cycles/kernel/kernel_path_common.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg,
+                                               ccl_global uint *rng_state,
+                                               int sample,
+                                               int x, int y,
+                                               ccl_addr_space RNG *rng,
+                                               ccl_addr_space Ray *ray)
+{
+	float filter_u;
+	float filter_v;
+
+	int num_samples = kernel_data.integrator.aa_samples;
+
+	path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v);
+
+	/* sample camera ray */
+
+	float lens_u = 0.0f, lens_v = 0.0f;
+
+	if(kernel_data.cam.aperturesize > 0.0f)
+		path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
+
+	float time = 0.0f;
+
+#ifdef __CAMERA_MOTION__
+	if(kernel_data.cam.shuttertime != -1.0f)
+		time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME);
+#endif
+
+	camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index ab146c72cd0..15efb2371de 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -16,7 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG *rng, int sample, Ray *ray)
+ccl_device_inline void path_state_init(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space RNG *rng, int sample, ccl_addr_space Ray *ray)
 {
 	state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP;
 
@@ -51,7 +51,7 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG
 #endif
 }
 
-ccl_device_inline void path_state_next(KernelGlobals *kg, PathState *state, int label)
+ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathState *state, int label)
 {
 	/* ray through transparent keeps same flags from previous ray and is
 	 * not counted as a regular bounce, transparent has separate max */
@@ -106,7 +106,7 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, PathState *state, int
 			state->flag &= ~(PATH_RAY_GLOSSY|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP);
 		}
 		else if(label & LABEL_GLOSSY) {
-			state->flag |= PATH_RAY_GLOSSY|PATH_RAY_GLOSSY_ANCESTOR;
+			state->flag |= PATH_RAY_GLOSSY;
 			state->flag &= ~(PATH_RAY_DIFFUSE|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP);
 		}
 		else {
@@ -138,7 +138,7 @@ ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *s
 	return flag;
 }
 
-ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, PathState *state, const float3 throughput)
+ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_addr_space PathState *state, const float3 throughput)
 {
 	if(state->flag & PATH_RAY_TRANSPARENT) {
 		/* transparent rays treated separately */
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index f0d4e98c5e0..fe85a6b6e4b 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -24,7 +24,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 {
 #ifdef __EMISSION__
 	/* sample illumination from lights to find path contribution */
-	if(!(sd->flag & SD_BSDF_HAS_EVAL))
+	if(!(ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))
 		return;
 
 	Ray light_ray;
@@ -32,7 +32,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 	bool is_lamp;
 
 #ifdef __OBJECT_MOTION__
-	light_ray.time = sd->time;
+	light_ray.time = ccl_fetch(sd, time);
 #endif
 
 	if(sample_all_lights) {
@@ -53,7 +53,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
 
 				LightSample ls;
-				lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls);
+				lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls);
 
 				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 					/* trace shadow ray */
@@ -85,7 +85,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 					light_t = 0.5f*light_t;
 
 				LightSample ls;
-				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
+				light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
 
 				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 					/* trace shadow ray */
@@ -106,7 +106,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
 
 		LightSample ls;
-		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
+		light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
 
 		/* sample random light */
 		if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
@@ -149,15 +149,15 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 	path_state_next(kg, state, label);
 
 	/* setup ray */
-	ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
+	ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
 	ray->D = bsdf_omega_in;
 	ray->t = FLT_MAX;
 #ifdef __RAY_DIFFERENTIALS__
-	ray->dP = sd->dP;
+	ray->dP = ccl_fetch(sd, dP);
 	ray->dD = bsdf_domega_in;
 #endif
 #ifdef __OBJECT_MOTION__
-	ray->time = sd->time;
+	ray->time = ccl_fetch(sd, time);
 #endif
 
 #ifdef __VOLUME__
@@ -181,12 +181,13 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 
 #endif
 
+#ifndef __SPLIT_KERNEL__
 /* path tracing: connect path directly to position on a light and add it to L */
-ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L)
+ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_addr_space RNG *rng,
+	ShaderData *sd, float3 throughput, ccl_addr_space PathState *state, PathRadiance *L)
 {
 #ifdef __EMISSION__
-	if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)))
+	if(!(kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)))
 		return;
 
 	/* sample illumination from lights to find path contribution */
@@ -199,11 +200,11 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
 	bool is_lamp;
 
 #ifdef __OBJECT_MOTION__
-	light_ray.time = sd->time;
+	light_ray.time = ccl_fetch(sd, time);
 #endif
 
 	LightSample ls;
-	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
+	light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
 
 	if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 		/* trace shadow ray */
@@ -216,13 +217,14 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
 	}
 #endif
 }
+#endif
 
 /* path tracing: bounce off or through surface to with new direction stored in ray */
-ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
+ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, ccl_addr_space RNG *rng,
+	ShaderData *sd, ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, PathRadiance *L, ccl_addr_space Ray *ray)
 {
 	/* no BSDF? we can stop here */
-	if(sd->flag & SD_BSDF) {
+	if(ccl_fetch(sd, flag) & SD_BSDF) {
 		/* sample BSDF */
 		float bsdf_pdf;
 		BsdfEval bsdf_eval;
@@ -254,16 +256,16 @@ ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 		path_state_next(kg, state, label);
 
 		/* setup ray */
-		ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
+		ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
 		ray->D = bsdf_omega_in;
 
 		if(state->bounce == 0)
-			ray->t -= sd->ray_length; /* clipping works through transparent */
+			ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
 		else
 			ray->t = FLT_MAX;
 
 #ifdef __RAY_DIFFERENTIALS__
-		ray->dP = sd->dP;
+		ray->dP = ccl_fetch(sd, dP);
 		ray->dD = bsdf_domega_in;
 #endif
 
@@ -275,21 +277,21 @@ ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 		return true;
 	}
 #ifdef __VOLUME__
-	else if(sd->flag & SD_HAS_ONLY_VOLUME) {
+	else if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) {
 		/* no surface shader but have a volume shader? act transparent */
 
 		/* update path state, count as transparent */
 		path_state_next(kg, state, LABEL_TRANSPARENT);
 
 		if(state->bounce == 0)
-			ray->t -= sd->ray_length; /* clipping works through transparent */
+			ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
 		else
 			ray->t = FLT_MAX;
 
 		/* setup ray position, direction stays unchanged */
-		ray->P = ray_offset(sd->P, -sd->Ng);
+		ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng));
 #ifdef __RAY_DIFFERENTIALS__
-		ray->dP = sd->dP;
+		ray->dP = ccl_fetch(sd, dP);
 #endif
 
 		/* enter/exit volume */
diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h
index bd18fd21354..62922df3286 100644
--- a/intern/cycles/kernel/kernel_projection.h
+++ b/intern/cycles/kernel/kernel_projection.h
@@ -163,6 +163,10 @@ ccl_device float3 mirrorball_to_direction(float u, float v)
 
 	dir.x = 2.0f*u - 1.0f;
 	dir.z = 2.0f*v - 1.0f;
+
+	if(dir.x*dir.x + dir.z*dir.z > 1.0f)
+		return make_float3(0.0f, 0.0f, 0.0f);
+
 	dir.y = -sqrtf(max(1.0f - dir.x*dir.x - dir.z*dir.z, 0.0f));
 
 	/* reflection */
@@ -191,6 +195,8 @@ ccl_device float3 panorama_to_direction(KernelGlobals *kg, float u, float v)
 	switch(kernel_data.cam.panorama_type) {
 		case PANORAMA_EQUIRECTANGULAR:
 			return equirectangular_range_to_direction(u, v, kernel_data.cam.equirectangular_range);
+		case PANORAMA_MIRRORBALL:
+			return mirrorball_to_direction(u, v);
 		case PANORAMA_FISHEYE_EQUIDISTANT:
 			return fisheye_to_direction(u, v, kernel_data.cam.fisheye_fov);
 		case PANORAMA_FISHEYE_EQUISOLID:
@@ -205,6 +211,8 @@ ccl_device float2 direction_to_panorama(KernelGlobals *kg, float3 dir)
 	switch(kernel_data.cam.panorama_type) {
 		case PANORAMA_EQUIRECTANGULAR:
 			return direction_to_equirectangular_range(dir, kernel_data.cam.equirectangular_range);
+		case PANORAMA_MIRRORBALL:
+			return direction_to_mirrorball(dir);
 		case PANORAMA_FISHEYE_EQUIDISTANT:
 			return direction_to_fisheye(dir, kernel_data.cam.fisheye_fov);
 		case PANORAMA_FISHEYE_EQUISOLID:
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
new file mode 100644
index 00000000000..9e65e2b0768
--- /dev/null
+++ b/intern/cycles/kernel/kernel_queues.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_QUEUE_H__
+#define __KERNEL_QUEUE_H__
+
+/*
+ * Queue utility functions for split kernel
+ */
+
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+
+/*
+ * Enqueue ray index into the queue
+ */
+ccl_device void enqueue_ray_index (
+              int ray_index,               /* Ray index to be enqueued */
+              int queue_number,            /* Queue in which the ray index should be enqueued*/
+              ccl_global int *queues,      /* Buffer of all queues */
+              int queue_size,              /* Size of each queue */
+              ccl_global int *queue_index /* Array of size num_queues; Used for atomic increment */
+              )
+{
+	/* This thread's queue index */
+	int my_queue_index = atomic_inc(&queue_index[queue_number]) + (queue_number * queue_size);
+	queues[my_queue_index] = ray_index;
+}
+
+/*
+ * Get the ray index for this thread
+ * Returns a positive ray_index for threads that have to do some work;
+ * Returns 'QUEUE_EMPTY_SLOT' for threads that don't have any work
+ * i.e All ray's in the queue has been successfully allocated and there
+ * is no more ray to allocate to other threads.
+ */
+ccl_device int get_ray_index (
+              int thread_index,       /* Global thread index */
+              int queue_number,       /* Queue to operate on */
+              ccl_global int *queues, /* Buffer of all queues */
+              int queuesize,          /* Size of a queue */
+              int empty_queue         /* Empty the queue slot as soon as we fetch the ray index */
+              )
+{
+	int ray_index = queues[queue_number * queuesize + thread_index];
+
+	if(empty_queue && ray_index != QUEUE_EMPTY_SLOT) {
+		queues[queue_number * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+	}
+
+	return ray_index;
+}
+
+/* The following functions are to realize Local memory variant of enqueue ray index function */
+
+/* All threads should call this function */
+ccl_device void enqueue_ray_index_local(
+              int ray_index,                               /* Ray index to enqueue*/
+              int queue_number,                            /* Queue in which to enqueue ray index */
+              char enqueue_flag,                           /* True for threads whose ray index has to be enqueued */
+              int queuesize,                               /* queue size */
+              ccl_local unsigned int *local_queue_atomics,   /* To to local queue atomics */
+              ccl_global int *Queue_data,                  /* Queues */
+              ccl_global int *Queue_index                  /* To do global queue atomics */
+              )
+{
+	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+
+	/* Get local queue id */
+	unsigned int lqidx;
+	if(enqueue_flag) {
+		lqidx = atomic_inc(local_queue_atomics);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* Get global queue offset */
+	if(lidx == 0) {
+		*local_queue_atomics = atomic_add(&Queue_index[queue_number], *local_queue_atomics);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* Get global queue index and enqueue ray */
+	if(enqueue_flag) {
+		unsigned int my_gqidx = queue_number * queuesize + (*local_queue_atomics) + lqidx;
+		Queue_data[my_gqidx] = ray_index;
+	}
+}
+
+ccl_device unsigned int get_local_queue_index(
+               int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */
+               ccl_local unsigned int *local_queue_atomics
+               )
+{
+	int my_lqidx = atomic_inc(&local_queue_atomics[queue_number]);
+	return my_lqidx;
+}
+
+ccl_device unsigned int get_global_per_queue_offset(
+               int queue_number,
+               ccl_local unsigned int *local_queue_atomics,
+               ccl_global int* global_queue_atomics
+               )
+{
+	unsigned int queue_offset = atomic_add((&global_queue_atomics[queue_number]), local_queue_atomics[queue_number]);
+	return queue_offset;
+}
+
+ccl_device unsigned int get_global_queue_index(
+               int queue_number,
+               int queuesize,
+               unsigned int lqidx,
+               ccl_local unsigned int * global_per_queue_offset
+               )
+{
+	int my_gqidx = queuesize * queue_number + lqidx + global_per_queue_offset[queue_number];
+	return my_gqidx;
+}
+
+#endif // __KERNEL_QUEUE_H__
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index 40767bac013..631a2cb75de 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -98,7 +98,7 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons
 	return index;
 }
 
-ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
+ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
 {
 #ifdef __CMJ__
 	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -132,7 +132,7 @@ ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int
 #endif
 }
 
-ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_inline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
 {
 #ifdef __CMJ__
 	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -149,7 +149,7 @@ ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int
 	}
 }
 
-ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
+ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, ccl_addr_space RNG *rng, int x, int y, float *fx, float *fy)
 {
 #ifdef __SOBOL_FULL_SCREEN__
 	uint px, py;
@@ -261,12 +261,12 @@ ccl_device uint lcg_init(uint seed)
  * For branches in the path we must be careful not to reuse the same number
  * in a sequence and offset accordingly. */
 
-ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
 {
 	return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension);
 }
 
-ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
 {
 	/* the rng_offset is not increased for transparent bounces. if we do then
 	 * fully transparent objects can become subtly visible by the different
@@ -279,23 +279,23 @@ ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *r
 	return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension);
 }
 
-ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension, float *fx, float *fy)
+ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy)
 {
 	path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy);
 }
 
-ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
 {
 	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension);
 }
 
-ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
 {
 	int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
 	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension);
 }
 
-ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
+ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
 {
 	path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy);
 }
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 0f3b09a9555..94e13028599 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -37,13 +37,13 @@ CCL_NAMESPACE_BEGIN
 #ifdef __OBJECT_MOTION__
 ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
 {
-	if(sd->flag & SD_OBJECT_MOTION) {
-		sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time);
-		sd->ob_itfm = transform_quick_inverse(sd->ob_tfm);
+	if(ccl_fetch(sd, flag) & SD_OBJECT_MOTION) {
+		ccl_fetch(sd, ob_tfm) = object_fetch_transform_motion(kg, ccl_fetch(sd, object), time);
+		ccl_fetch(sd, ob_itfm) = transform_quick_inverse(ccl_fetch(sd, ob_tfm));
 	}
 	else {
-		sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
-		sd->ob_itfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+		ccl_fetch(sd, ob_tfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+		ccl_fetch(sd, ob_itfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
 	}
 }
 #endif
@@ -52,55 +52,55 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
 	const Intersection *isect, const Ray *ray, int bounce, int transparent_bounce)
 {
 #ifdef __INSTANCING__
-	sd->object = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
+	ccl_fetch(sd, object) = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
 #endif
 
-	sd->type = isect->type;
-	sd->flag = kernel_tex_fetch(__object_flag, sd->object);
+	ccl_fetch(sd, type) = isect->type;
+	ccl_fetch(sd, flag) = kernel_tex_fetch(__object_flag, ccl_fetch(sd, object));
 
 	/* matrices and time */
 #ifdef __OBJECT_MOTION__
 	shader_setup_object_transforms(kg, sd, ray->time);
-	sd->time = ray->time;
+	ccl_fetch(sd, time) = ray->time;
 #endif
 
-	sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
-	sd->ray_length = isect->t;
-	sd->ray_depth = bounce;
-	sd->transparent_depth = transparent_bounce;
+	ccl_fetch(sd, prim) = kernel_tex_fetch(__prim_index, isect->prim);
+	ccl_fetch(sd, ray_length) = isect->t;
+	ccl_fetch(sd, ray_depth) = bounce;
+	ccl_fetch(sd, transparent_depth) = transparent_bounce;
 
 #ifdef __UV__
-	sd->u = isect->u;
-	sd->v = isect->v;
+	ccl_fetch(sd, u) = isect->u;
+	ccl_fetch(sd, v) = isect->v;
 #endif
 
 #ifdef __HAIR__
-	if(sd->type & PRIMITIVE_ALL_CURVE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 		/* curve */
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
 
-		sd->shader = __float_as_int(curvedata.z);
-		sd->P = bvh_curve_refine(kg, sd, isect, ray);
+		ccl_fetch(sd, shader) = __float_as_int(curvedata.z);
+		ccl_fetch(sd, P) = bvh_curve_refine(kg, sd, isect, ray);
 	}
 	else
 #endif
-	if(sd->type & PRIMITIVE_TRIANGLE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
 		/* static triangle */
 		float3 Ng = triangle_normal(kg, sd);
-		sd->shader =  kernel_tex_fetch(__tri_shader, sd->prim);
+		ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
 
 		/* vectors */
-		sd->P = triangle_refine(kg, sd, isect, ray);
-		sd->Ng = Ng;
-		sd->N = Ng;
+		ccl_fetch(sd, P) = triangle_refine(kg, sd, isect, ray);
+		ccl_fetch(sd, Ng) = Ng;
+		ccl_fetch(sd, N) = Ng;
 		
 		/* smooth normal */
-		if(sd->shader & SHADER_SMOOTH_NORMAL)
-			sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL)
+			ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));
 
 #ifdef __DPDU__
 		/* dPdu/dPdv */
-		triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+		triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));
 #endif
 	}
 	else {
@@ -108,40 +108,40 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
 		motion_triangle_shader_setup(kg, sd, isect, ray, false);
 	}
 
-	sd->I = -ray->D;
+	ccl_fetch(sd, I) = -ray->D;
 
-	sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
+	ccl_fetch(sd, flag) |= kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2);
 
 #ifdef __INSTANCING__
 	if(isect->object != OBJECT_NONE) {
 		/* instance transform */
-		object_normal_transform(kg, sd, &sd->N);
-		object_normal_transform(kg, sd, &sd->Ng);
+		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
+		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng));
 #ifdef __DPDU__
-		object_dir_transform(kg, sd, &sd->dPdu);
-		object_dir_transform(kg, sd, &sd->dPdv);
+		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
+		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
 #endif
 	}
 #endif
 
 	/* backfacing test */
-	bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
+	bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);
 
 	if(backfacing) {
-		sd->flag |= SD_BACKFACING;
-		sd->Ng = -sd->Ng;
-		sd->N = -sd->N;
+		ccl_fetch(sd, flag) |= SD_BACKFACING;
+		ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
+		ccl_fetch(sd, N) = -ccl_fetch(sd, N);
 #ifdef __DPDU__
-		sd->dPdu = -sd->dPdu;
-		sd->dPdv = -sd->dPdv;
+		ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
+		ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
 #endif
 	}
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* differentials */
-	differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t);
-	differential_incoming(&sd->dI, ray->dD);
-	differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
+	differential_transfer(&ccl_fetch(sd, dP), ray->dP, ray->D, ray->dD, ccl_fetch(sd, Ng), isect->t);
+	differential_incoming(&ccl_fetch(sd, dI), ray->dD);
+	differential_dudv(&ccl_fetch(sd, du), &ccl_fetch(sd, dv), ccl_fetch(sd, dPdu), ccl_fetch(sd, dPdv), ccl_fetch(sd, dP), ccl_fetch(sd, Ng));
 #endif
 }
 
@@ -166,7 +166,7 @@ ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderDat
 	/* fetch triangle data */
 	if(sd->type == PRIMITIVE_TRIANGLE) {
 		float3 Ng = triangle_normal(kg, sd);
-		sd->shader =  kernel_tex_fetch(__tri_shader, sd->prim);
+		sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
 
 		/* static triangle */
 		sd->P = triangle_refine_subsurface(kg, sd, isect, ray);
@@ -230,105 +230,105 @@ ccl_device void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
 	int shader, int object, int prim, float u, float v, float t, float time, int bounce, int transparent_bounce)
 {
 	/* vectors */
-	sd->P = P;
-	sd->N = Ng;
-	sd->Ng = Ng;
-	sd->I = I;
-	sd->shader = shader;
-	sd->type = (prim == PRIM_NONE)? PRIMITIVE_NONE: PRIMITIVE_TRIANGLE;
+	ccl_fetch(sd, P) = P;
+	ccl_fetch(sd, N) = Ng;
+	ccl_fetch(sd, Ng) = Ng;
+	ccl_fetch(sd, I) = I;
+	ccl_fetch(sd, shader) = shader;
+	ccl_fetch(sd, type) = (prim == PRIM_NONE)? PRIMITIVE_NONE: PRIMITIVE_TRIANGLE;
 
 	/* primitive */
 #ifdef __INSTANCING__
-	sd->object = object;
+	ccl_fetch(sd, object) = object;
 #endif
 	/* currently no access to bvh prim index for strand sd->prim*/
-	sd->prim = prim;
+	ccl_fetch(sd, prim) = prim;
 #ifdef __UV__
-	sd->u = u;
-	sd->v = v;
+	ccl_fetch(sd, u) = u;
+	ccl_fetch(sd, v) = v;
 #endif
-	sd->ray_length = t;
-	sd->ray_depth = bounce;
-	sd->transparent_depth = transparent_bounce;
+	ccl_fetch(sd, ray_length) = t;
+	ccl_fetch(sd, ray_depth) = bounce;
+	ccl_fetch(sd, transparent_depth) = transparent_bounce;
 
 	/* detect instancing, for non-instanced the object index is -object-1 */
 #ifdef __INSTANCING__
 	bool instanced = false;
 
-	if(sd->prim != PRIM_NONE) {
-		if(sd->object >= 0)
+	if(ccl_fetch(sd, prim) != PRIM_NONE) {
+		if(ccl_fetch(sd, object) >= 0)
 			instanced = true;
 		else
 #endif
-			sd->object = ~sd->object;
+			ccl_fetch(sd, object) = ~ccl_fetch(sd, object);
 #ifdef __INSTANCING__
 	}
 #endif
 
-	sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
-	if(sd->object != OBJECT_NONE) {
-		sd->flag |= kernel_tex_fetch(__object_flag, sd->object);
+	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2);
+	if(ccl_fetch(sd, object) != OBJECT_NONE) {
+		ccl_fetch(sd, flag) |= kernel_tex_fetch(__object_flag, ccl_fetch(sd, object));
 
 #ifdef __OBJECT_MOTION__
 		shader_setup_object_transforms(kg, sd, time);
 	}
 
-	sd->time = time;
+	ccl_fetch(sd, time) = time;
 #else
 	}
 #endif
 
-	if(sd->type & PRIMITIVE_TRIANGLE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
 		/* smooth normal */
-		if(sd->shader & SHADER_SMOOTH_NORMAL) {
-			sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
+			ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));
 
 #ifdef __INSTANCING__
 			if(instanced)
-				object_normal_transform(kg, sd, &sd->N);
+				object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
 #endif
 		}
 
 		/* dPdu/dPdv */
 #ifdef __DPDU__
-		triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+		triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));
 
 #ifdef __INSTANCING__
 		if(instanced) {
-			object_dir_transform(kg, sd, &sd->dPdu);
-			object_dir_transform(kg, sd, &sd->dPdv);
+			object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
+			object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
 		}
 #endif
 #endif
 	}
 	else {
 #ifdef __DPDU__
-		sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
-		sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
+		ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
+		ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 	}
 
 	/* backfacing test */
-	if(sd->prim != PRIM_NONE) {
-		bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
+	if(ccl_fetch(sd, prim) != PRIM_NONE) {
+		bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);
 
 		if(backfacing) {
-			sd->flag |= SD_BACKFACING;
-			sd->Ng = -sd->Ng;
-			sd->N = -sd->N;
+			ccl_fetch(sd, flag) |= SD_BACKFACING;
+			ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
+			ccl_fetch(sd, N) = -ccl_fetch(sd, N);
 #ifdef __DPDU__
-			sd->dPdu = -sd->dPdu;
-			sd->dPdv = -sd->dPdv;
+			ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
+			ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
 #endif
 		}
 	}
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* no ray differentials here yet */
-	sd->dP = differential3_zero();
-	sd->dI = differential3_zero();
-	sd->du = differential_zero();
-	sd->dv = differential_zero();
+	ccl_fetch(sd, dP) = differential3_zero();
+	ccl_fetch(sd, dI) = differential3_zero();
+	ccl_fetch(sd, du) = differential_zero();
+	ccl_fetch(sd, dv) = differential_zero();
 #endif
 }
 
@@ -355,45 +355,46 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
 ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce, int transparent_bounce)
 {
 	/* vectors */
-	sd->P = ray->D;
-	sd->N = -ray->D;
-	sd->Ng = -ray->D;
-	sd->I = -ray->D;
-	sd->shader = kernel_data.background.surface_shader;
-	sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
+	ccl_fetch(sd, P) = ray->D;
+	ccl_fetch(sd, N) = -ray->D;
+	ccl_fetch(sd, Ng) = -ray->D;
+	ccl_fetch(sd, I) = -ray->D;
+	ccl_fetch(sd, shader) = kernel_data.background.surface_shader;
+	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2);
 #ifdef __OBJECT_MOTION__
-	sd->time = ray->time;
+	ccl_fetch(sd, time) = ray->time;
 #endif
-	sd->ray_length = 0.0f;
-	sd->ray_depth = bounce;
-	sd->transparent_depth = transparent_bounce;
+	ccl_fetch(sd, ray_length) = 0.0f;
+	ccl_fetch(sd, ray_depth) = bounce;
+	ccl_fetch(sd, transparent_depth) = transparent_bounce;
 
 #ifdef __INSTANCING__
-	sd->object = PRIM_NONE;
+	ccl_fetch(sd, object) = PRIM_NONE;
 #endif
-	sd->prim = PRIM_NONE;
+	ccl_fetch(sd, prim) = PRIM_NONE;
 #ifdef __UV__
-	sd->u = 0.0f;
-	sd->v = 0.0f;
+	ccl_fetch(sd, u) = 0.0f;
+	ccl_fetch(sd, v) = 0.0f;
 #endif
 
 #ifdef __DPDU__
 	/* dPdu/dPdv */
-	sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
-	sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
+	ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
+	ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* differentials */
-	sd->dP = ray->dD;
-	differential_incoming(&sd->dI, sd->dP);
-	sd->du = differential_zero();
-	sd->dv = differential_zero();
+	ccl_fetch(sd, dP) = ray->dD;
+	differential_incoming(&ccl_fetch(sd, dI), ccl_fetch(sd, dP));
+	ccl_fetch(sd, du) = differential_zero();
+	ccl_fetch(sd, dv) = differential_zero();
 #endif
 }
 
 /* ShaderData setup from point inside volume */
 
+#ifdef __VOLUME__
 ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce, int transparent_bounce)
 {
 	/* vectors */
@@ -439,6 +440,7 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s
 	sd->ray_P = ray->P;
 	sd->ray_dP = ray->dP;
 }
+#endif
 
 /* Merging */
 
@@ -478,6 +480,7 @@ ccl_device void shader_merge_closures(ShaderData *sd)
 			}
 
 			sd->num_closure--;
+			kernel_assert(sd->num_closure >= 0);
 			j--;
 		}
 	}
@@ -491,11 +494,11 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, const ShaderDa
 {
 	/* this is the veach one-sample model with balance heuristic, some pdf
 	 * factors drop out when using balance heuristic weighting */
-	for(int i = 0; i< sd->num_closure; i++) {
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
 		if(i == skip_bsdf)
 			continue;
 
-		const ShaderClosure *sc = &sd->closure[i];
+		const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSDF(sc->type)) {
 			float bsdf_pdf = 0.0f;
@@ -513,7 +516,7 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, const ShaderDa
 	*pdf = (sum_sample_weight > 0.0f)? sum_pdf/sum_sample_weight: 0.0f;
 }
 
-ccl_device void shader_bsdf_eval(KernelGlobals *kg, const ShaderData *sd,
+ccl_device void shader_bsdf_eval(KernelGlobals *kg, ShaderData *sd,
 	const float3 omega_in, BsdfEval *eval, float *pdf)
 {
 	bsdf_eval_init(eval, NBUILTIN_CLOSURES, make_float3(0.0f, 0.0f, 0.0f), kernel_data.film.use_light_pass);
@@ -527,22 +530,22 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
 {
 	int sampled = 0;
 
-	if(sd->num_closure > 1) {
+	if(ccl_fetch(sd, num_closure) > 1) {
 		/* pick a BSDF closure based on sample weights */
 		float sum = 0.0f;
 
-		for(sampled = 0; sampled < sd->num_closure; sampled++) {
-			const ShaderClosure *sc = &sd->closure[sampled];
+		for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
+			const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
 			
 			if(CLOSURE_IS_BSDF(sc->type))
 				sum += sc->sample_weight;
 		}
 
-		float r = sd->randb_closure*sum;
+		float r = ccl_fetch(sd, randb_closure)*sum;
 		sum = 0.0f;
 
-		for(sampled = 0; sampled < sd->num_closure; sampled++) {
-			const ShaderClosure *sc = &sd->closure[sampled];
+		for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
+			const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
 			
 			if(CLOSURE_IS_BSDF(sc->type)) {
 				sum += sc->sample_weight;
@@ -552,13 +555,14 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
 			}
 		}
 
-		if(sampled == sd->num_closure) {
+		if(sampled == ccl_fetch(sd, num_closure)) {
 			*pdf = 0.0f;
 			return LABEL_NONE;
 		}
 	}
 
-	const ShaderClosure *sc = &sd->closure[sampled];
+	const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
+
 	int label;
 	float3 eval;
 
@@ -568,7 +572,7 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
 	if(*pdf != 0.0f) {
 		bsdf_eval_init(bsdf_eval, sc->type, eval*sc->weight, kernel_data.film.use_light_pass);
 
-		if(sd->num_closure > 1) {
+		if(ccl_fetch(sd, num_closure) > 1) {
 			float sweight = sc->sample_weight;
 			_shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight);
 		}
@@ -595,8 +599,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, const ShaderData *s
 
 ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness)
 {
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSDF(sc->type))
 			bsdf_blur(kg, sc, roughness);
@@ -605,13 +609,13 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn
 
 ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
 {
-	if(sd->flag & SD_HAS_ONLY_VOLUME)
+	if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME)
 		return make_float3(1.0f, 1.0f, 1.0f);
 
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl
 			eval += sc->weight;
@@ -634,8 +638,8 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
 			eval += sc->weight;
@@ -648,8 +652,8 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
 			eval += sc->weight;
@@ -662,8 +666,8 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
 			eval += sc->weight;
@@ -676,8 +680,8 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type))
 			eval += sc->weight;
@@ -691,8 +695,8 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 	float3 N = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
 			eval += sc->weight*ao_factor;
@@ -700,12 +704,12 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
 		}
 		else if(CLOSURE_IS_AMBIENT_OCCLUSION(sc->type)) {
 			eval += sc->weight;
-			N += sd->N*average(sc->weight);
+			N += ccl_fetch(sd, N)*average(sc->weight);
 		}
 	}
 
 	if(is_zero(N))
-		N = sd->N;
+		N = ccl_fetch(sd, N);
 	else
 		N = normalize(N);
 
@@ -719,8 +723,8 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 	float3 N = make_float3(0.0f, 0.0f, 0.0f);
 	float texture_blur = 0.0f, weight_sum = 0.0f;
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSSRDF(sc->type)) {
 			float avg_weight = fabsf(average(sc->weight));
@@ -733,7 +737,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 	}
 
 	if(N_)
-		*N_ = (is_zero(N))? sd->N: normalize(N);
+		*N_ = (is_zero(N))? ccl_fetch(sd, N): normalize(N);
 
 	if(texture_blur_)
 		*texture_blur_ = texture_blur/weight_sum;
@@ -745,7 +749,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 
 ccl_device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *sc)
 {
-	return emissive_simple_eval(sd->Ng, sd->I);
+	return emissive_simple_eval(ccl_fetch(sd, Ng), ccl_fetch(sd, I));
 }
 
 ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
@@ -753,8 +757,8 @@ ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
 	float3 eval;
 	eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_EMISSION(sc->type))
 			eval += emissive_eval(kg, sd, sc)*sc->weight;
@@ -769,8 +773,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 weight = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_HOLDOUT(sc->type))
 			weight += sc->weight;
@@ -784,8 +788,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
 	float randb, int path_flag, ShaderContext ctx)
 {
-	sd->num_closure = 0;
-	sd->randb_closure = randb;
+	ccl_fetch(sd, num_closure) = 0;
+	ccl_fetch(sd, randb_closure) = randb;
 
 #ifdef __OSL__
 	if(kg->osl)
@@ -796,9 +800,11 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
 #ifdef __SVM__
 		svm_eval_nodes(kg, sd, SHADER_TYPE_SURFACE, path_flag);
 #else
-		sd->closure->weight = make_float3(0.8f, 0.8f, 0.8f);
-		sd->closure->N = sd->N;
-		sd->flag |= bsdf_diffuse_setup(&sd->closure);
+		ccl_fetch_array(sd, closure, 0)->weight = make_float3(0.8f, 0.8f, 0.8f);
+		ccl_fetch_array(sd, closure, 0)->N = ccl_fetch(sd, N);
+		ccl_fetch_array(sd, closure, 0)->data0 = 0.0f;
+		ccl_fetch_array(sd, closure, 0)->data1 = 0.0f;
+		ccl_fetch(sd, flag) |= bsdf_diffuse_setup(ccl_fetch_array(sd, closure, 0));
 #endif
 	}
 }
@@ -807,8 +813,8 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
 
 ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int path_flag, ShaderContext ctx)
 {
-	sd->num_closure = 0;
-	sd->randb_closure = 0.0f;
+	ccl_fetch(sd, num_closure) = 0;
+	ccl_fetch(sd, randb_closure) = 0.0f;
 
 #ifdef __OSL__
 	if(kg->osl) {
@@ -823,8 +829,8 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int
 
 		float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-		for(int i = 0; i< sd->num_closure; i++) {
-			const ShaderClosure *sc = &sd->closure[i];
+		for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+			const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 			if(CLOSURE_IS_BACKGROUND(sc->type))
 				eval += sc->weight;
@@ -844,7 +850,7 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int
 ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, const float3 omega_in, float *pdf,
 	int skip_phase, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight)
 {
-	for(int i = 0; i< sd->num_closure; i++) {
+	for(int i = 0; i < sd->num_closure; i++) {
 		if(i == skip_phase)
 			continue;
 
@@ -997,8 +1003,8 @@ ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd,
 
 ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx)
 {
-	sd->num_closure = 0;
-	sd->randb_closure = 0.0f;
+	ccl_fetch(sd, num_closure) = 0;
+	ccl_fetch(sd, randb_closure) = 0.0f;
 
 	/* this will modify sd->P */
 #ifdef __SVM__
diff --git a/intern/cycles/kernel/kernel_shaderdata_vars.h b/intern/cycles/kernel/kernel_shaderdata_vars.h
new file mode 100644
index 00000000000..b157b82e023
--- /dev/null
+++ b/intern/cycles/kernel/kernel_shaderdata_vars.h
@@ -0,0 +1,99 @@
+/*
+* Copyright 2011-2015 Blender Foundation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef SD_VAR
+#define SD_VAR(type, what)
+#endif
+#ifndef SD_CLOSURE_VAR
+#define SD_CLOSURE_VAR(type, what, max_closure)
+#endif
+
+/* position */
+SD_VAR(float3, P)
+/* smooth normal for shading */
+SD_VAR(float3, N)
+/* true geometric normal */
+SD_VAR(float3, Ng)
+/* view/incoming direction */
+SD_VAR(float3, I)
+/* shader id */
+SD_VAR(int, shader)
+/* booleans describing shader, see ShaderDataFlag */
+SD_VAR(int, flag)
+
+/* primitive id if there is one, ~0 otherwise */
+SD_VAR(int, prim)
+
+/* combined type and curve segment for hair */
+SD_VAR(int, type)
+
+/* parametric coordinates
+* - barycentric weights for triangles */
+SD_VAR(float, u)
+SD_VAR(float, v)
+/* object id if there is one, ~0 otherwise */
+SD_VAR(int, object)
+
+/* motion blur sample time */
+SD_VAR(float, time)
+
+/* length of the ray being shaded */
+SD_VAR(float, ray_length)
+
+/* ray bounce depth */
+SD_VAR(int, ray_depth)
+
+/* ray transparent depth */
+SD_VAR(int, transparent_depth)
+
+#ifdef __RAY_DIFFERENTIALS__
+/* differential of P. these are orthogonal to Ng, not N */
+SD_VAR(differential3, dP)
+/* differential of I */
+SD_VAR(differential3, dI)
+/* differential of u, v */
+SD_VAR(differential, du)
+SD_VAR(differential, dv)
+#endif
+#ifdef __DPDU__
+/* differential of P w.r.t. parametric coordinates. note that dPdu is
+* not readily suitable as a tangent for shading on triangles. */
+SD_VAR(float3, dPdu)
+SD_VAR(float3, dPdv)
+#endif
+
+#ifdef __OBJECT_MOTION__
+/* object <-> world space transformations, cached to avoid
+* re-interpolating them constantly for shading */
+SD_VAR(Transform, ob_tfm)
+SD_VAR(Transform, ob_itfm)
+#endif
+
+/* Closure data, we store a fixed array of closures */
+SD_CLOSURE_VAR(ShaderClosure, closure, MAX_CLOSURE)
+SD_VAR(int, num_closure)
+SD_VAR(float, randb_closure)
+
+/* ray start position, only set for backgrounds */
+SD_VAR(float3, ray_P)
+SD_VAR(differential3, ray_dP)
+
+#ifdef __OSL__
+SD_VAR(struct KernelGlobals *, osl_globals)
+#endif
+
+#undef SD_VAR
+#undef SD_CLOSURE_VAR
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index 8923fcebee5..2811a8348ca 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -39,19 +39,6 @@ CCL_NAMESPACE_BEGIN
  * This is CPU only because of qsort, and malloc or high stack space usage to
  * record all these intersections. */
 
-ccl_device_noinline int shadow_intersections_compare(const void *a, const void *b)
-{
-	const Intersection *isect_a = (const Intersection*)a;
-	const Intersection *isect_b = (const Intersection*)b;
-
-	if(isect_a->t < isect_b->t)
-		return -1;
-	else if(isect_a->t > isect_b->t)
-		return 1;
-	else
-		return 0;
-}
-
 #define STACK_MAX_HITS 64
 
 ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ray, float3 *shadow)
@@ -95,7 +82,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 			PathState ps = *state;
 #endif
 
-			qsort(hits, num_hits, sizeof(Intersection), shadow_intersections_compare);
+			qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
 
 			for(int hit = 0; hit < num_hits; hit++, isect++) {
 				/* adjust intersection distance for moving ray forward */
@@ -193,19 +180,36 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
  * potentially transparent, and only in that case start marching. this gives
  * one extra ray cast for the cases were we do want transparency. */
 
-ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ray, float3 *shadow)
+ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space Ray *ray_input, float3 *shadow
+#ifdef __SPLIT_KERNEL__
+                                      , ShaderData *sd_mem, Intersection *isect_mem
+#endif
+                                      )
 {
 	*shadow = make_float3(1.0f, 1.0f, 1.0f);
 
-	if(ray->t == 0.0f)
+	if(ray_input->t == 0.0f)
 		return false;
 
-	Intersection isect;
-	bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
+#ifdef __SPLIT_KERNEL__
+	Ray private_ray = *ray_input;
+	Ray *ray = &private_ray;
+#else
+	Ray *ray = ray_input;
+#endif
+
+#ifdef __SPLIT_KERNEL__
+	Intersection *isect = isect_mem;
+#else
+	Intersection isect_object;
+	Intersection *isect = &isect_object;
+#endif
+
+	bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);
 
 #ifdef __TRANSPARENT_SHADOWS__
 	if(blocked && kernel_data.integrator.transparent_shadows) {
-		if(shader_transparent_shadow(kg, &isect)) {
+		if(shader_transparent_shadow(kg, isect)) {
 			float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
 			float3 Pend = ray->P + ray->D*ray->t;
 			int bounce = state->transparent_bounce;
@@ -217,9 +221,8 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 				if(bounce >= kernel_data.integrator.transparent_max_bounce)
 					return true;
 
-				if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect, NULL, 0.0f, 0.0f))
+				if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f))
 				{
-
 #ifdef __VOLUME__
 					/* attenuation for last line segment towards light */
 					if(ps.volume_stack[0].shader != SHADER_NONE)
@@ -231,39 +234,44 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 					return false;
 				}
 
-				if(!shader_transparent_shadow(kg, &isect))
+				if(!shader_transparent_shadow(kg, isect))
 					return true;
 
 #ifdef __VOLUME__
 				/* attenuation between last surface and next surface */
 				if(ps.volume_stack[0].shader != SHADER_NONE) {
 					Ray segment_ray = *ray;
-					segment_ray.t = isect.t;
+					segment_ray.t = isect->t;
 					kernel_volume_shadow(kg, &ps, &segment_ray, &throughput);
 				}
 #endif
 
 				/* setup shader data at surface */
-				ShaderData sd;
-				shader_setup_from_ray(kg, &sd, &isect, ray, state->bounce+1, bounce);
+#ifdef __SPLIT_KERNEL__
+				ShaderData *sd = sd_mem;
+#else
+				ShaderData sd_object;
+				ShaderData *sd = &sd_object;
+#endif
+				shader_setup_from_ray(kg, sd, isect, ray, state->bounce+1, bounce);
 
 				/* attenuation from transparent surface */
-				if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
-					shader_eval_surface(kg, &sd, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
-					throughput *= shader_bsdf_transparency(kg, &sd);
+				if(!(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME)) {
+					shader_eval_surface(kg, sd, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
+					throughput *= shader_bsdf_transparency(kg, sd);
 				}
 
 				if(is_zero(throughput))
 					return true;
 
 				/* move ray forward */
-				ray->P = ray_offset(sd.P, -sd.Ng);
+				ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng));
 				if(ray->t != FLT_MAX)
 					ray->D = normalize_len(Pend - ray->P, &ray->t);
 
 #ifdef __VOLUME__
 				/* exit/enter volume */
-				kernel_volume_stack_enter_exit(kg, &sd, ps.volume_stack);
+				kernel_volume_stack_enter_exit(kg, sd, ps.volume_stack);
 #endif
 
 				bounce++;
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index 374dc6d1dd9..f545a056cc8 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -24,6 +24,7 @@
 
 /* bvh */
 KERNEL_TEX(float4, texture_float4, __bvh_nodes)
+KERNEL_TEX(float4, texture_float4, __bvh_leaf_nodes)
 KERNEL_TEX(float4, texture_float4, __tri_woop)
 KERNEL_TEX(uint, texture_uint, __prim_type)
 KERNEL_TEX(uint, texture_uint, __prim_visibility)
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 238b4b0bfdc..2a70bfcb8f0 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -24,6 +24,13 @@
 #define __KERNEL_CPU__
 #endif
 
+/* TODO(sergey): This is only to make it possible to include this header
+ * from outside of the kernel. but this could be done somewhat cleaner?
+ */
+#ifndef ccl_addr_space
+#define ccl_addr_space
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 /* constants */
@@ -38,12 +45,6 @@ CCL_NAMESPACE_BEGIN
 #define BSSRDF_MIN_RADIUS			1e-8f
 #define BSSRDF_MAX_HITS				4
 
-#define BB_DRAPER				800.0f
-#define BB_MAX_TABLE_RANGE		12000.0f
-#define BB_TABLE_XPOWER			1.5f
-#define BB_TABLE_YPOWER			5.0f
-#define BB_TABLE_SPACING		2.0f
-
 #define BECKMANN_TABLE_SIZE		256
 
 #define TEX_NUM_FLOAT_IMAGES	5
@@ -72,6 +73,7 @@ CCL_NAMESPACE_BEGIN
 #define __VOLUME_DECOUPLED__
 #define __VOLUME_SCATTER__
 #define __SHADOW_RECORD_ALL__
+#define __VOLUME_RECORD_ALL__
 #endif
 
 #ifdef __KERNEL_CUDA__
@@ -82,7 +84,7 @@ CCL_NAMESPACE_BEGIN
 #define __VOLUME_SCATTER__
 
 /* Experimental on GPU */
-#ifdef __KERNEL_CUDA_EXPERIMENTAL__
+#ifdef __KERNEL_EXPERIMENTAL__
 #define __SUBSURFACE__
 #define __CMJ__
 #endif
@@ -94,38 +96,44 @@ CCL_NAMESPACE_BEGIN
 /* keep __KERNEL_ADV_SHADING__ in sync with opencl_kernel_use_advanced_shading! */
 
 #ifdef __KERNEL_OPENCL_NVIDIA__
-#define __KERNEL_SHADING__
-#define __KERNEL_ADV_SHADING__
+#  define __KERNEL_SHADING__
+#  define __KERNEL_ADV_SHADING__
+#  ifdef __KERNEL_EXPERIMENTAL__
+#    define __CMJ__
+#  endif
 #endif
 
 #ifdef __KERNEL_OPENCL_APPLE__
-#define __KERNEL_SHADING__
+#  define __KERNEL_SHADING__
 //#define __KERNEL_ADV_SHADING__
 #endif
 
 #ifdef __KERNEL_OPENCL_AMD__
-#define __CL_USE_NATIVE__
-#define __KERNEL_SHADING__
-//__KERNEL_ADV_SHADING__
-#define __MULTI_CLOSURE__
-#define __TRANSPARENT_SHADOWS__
-#define __PASSES__
-#define __BACKGROUND_MIS__
-#define __LAMP_MIS__
-#define __AO__
-//#define __CAMERA_MOTION__
-//#define __OBJECT_MOTION__
-//#define __HAIR__
-//end __KERNEL_ADV_SHADING__
+#  define __CL_USE_NATIVE__
+#  define __KERNEL_SHADING__
+#  define __MULTI_CLOSURE__
+#  define __PASSES__
+#  define __BACKGROUND_MIS__
+#  define __LAMP_MIS__
+#  define __AO__
+#  define __CAMERA_MOTION__
+#  define __OBJECT_MOTION__
+#  define __HAIR__
+#  ifdef __KERNEL_EXPERIMENTAL__
+#    define __TRANSPARENT_SHADOWS__
+#  endif
 #endif
 
 #ifdef __KERNEL_OPENCL_INTEL_CPU__
-#define __CL_USE_NATIVE__
-#define __KERNEL_SHADING__
-#define __KERNEL_ADV_SHADING__
+#  define __CL_USE_NATIVE__
+#  define __KERNEL_SHADING__
+#  define __KERNEL_ADV_SHADING__
+#  ifdef __KERNEL_EXPERIMENTAL__
+#    define __CMJ__
+#  endif
 #endif
 
-#endif
+#endif // __KERNEL_OPENCL__
 
 /* kernel features */
 #define __SOBOL__
@@ -164,6 +172,17 @@ CCL_NAMESPACE_BEGIN
 #  define __KERNEL_DEBUG__
 #endif
 
+/* Scene-based selective featrues compilation/ */
+#ifdef __NO_CAMERA_MOTION__
+#  undef __CAMERA_MOTION__
+#endif
+#ifdef __NO_OBJECT_MOTION__
+#  undef __OBJECT_MOTION__
+#endif
+#ifdef __NO_HAIR__
+#  undef __HAIR__
+#endif
+
 /* Random Numbers */
 
 typedef uint RNG;
@@ -269,9 +288,7 @@ enum PathRayFlag {
 
 	PATH_RAY_MIS_SKIP = 2048,
 	PATH_RAY_DIFFUSE_ANCESTOR = 4096,
-	PATH_RAY_GLOSSY_ANCESTOR = 8192,
-	PATH_RAY_BSSRDF_ANCESTOR = 16384,
-	PATH_RAY_SINGLE_PASS_DONE = 32768,
+	PATH_RAY_SINGLE_PASS_DONE = 8192,
 
 	/* we need layer member flags to be the 20 upper bits */
 	PATH_RAY_LAYER_SHIFT = (32-20)
@@ -322,6 +339,8 @@ typedef enum PassType {
 	PASS_LIGHT = (1 << 25), /* no real pass, used to force use_light_pass */
 #ifdef __KERNEL_DEBUG__
 	PASS_BVH_TRAVERSAL_STEPS = (1 << 26),
+	PASS_BVH_TRAVERSED_INSTANCES = (1 << 27),
+	PASS_RAY_BOUNCES = (1 << 28),
 #endif
 } PassType;
 
@@ -329,7 +348,7 @@ typedef enum PassType {
 
 #ifdef __PASSES__
 
-typedef struct PathRadiance {
+typedef ccl_addr_space struct PathRadiance {
 	int use_light_pass;
 
 	float3 emission;
@@ -381,7 +400,7 @@ typedef struct BsdfEval {
 
 #else
 
-typedef float3 PathRadiance;
+typedef ccl_addr_space float3 PathRadiance;
 typedef float3 BsdfEval;
 
 #endif
@@ -426,6 +445,7 @@ enum CameraType {
 
 enum PanoramaType {
 	PANORAMA_EQUIRECTANGULAR,
+	PANORAMA_MIRRORBALL,
 	PANORAMA_FISHEYE_EQUIDISTANT,
 	PANORAMA_FISHEYE_EQUISOLID
 };
@@ -445,10 +465,26 @@ typedef struct differential {
 /* Ray */
 
 typedef struct Ray {
+/* TODO(sergey): This is only needed because current AMD
+ * compiler has hard time building the kernel with this
+ * reshuffle. And at the same time reshuffle will cause
+ * less optimal CPU code in certain places.
+ *
+ * We'll get rid of this nasty exception once AMD compiler
+ * is fixed.
+ */
+#ifndef __KERNEL_OPENCL_AMD__
 	float3 P;		/* origin */
 	float3 D;		/* direction */
+
 	float t;		/* length of the ray */
 	float time;		/* time (for motion blur) */
+#else
+	float t;		/* length of the ray */
+	float time;		/* time (for motion blur) */
+	float3 P;		/* origin */
+	float3 D;		/* direction */
+#endif
 
 #ifdef __RAY_DIFFERENTIALS__
 	differential3 dP;
@@ -458,7 +494,7 @@ typedef struct Ray {
 
 /* Intersection */
 
-typedef struct Intersection {
+typedef ccl_addr_space struct Intersection {
 	float t, u, v;
 	int prim;
 	int object;
@@ -466,6 +502,7 @@ typedef struct Intersection {
 
 #ifdef __KERNEL_DEBUG__
 	int num_traversal_steps;
+	int num_traversed_instances;
 #endif
 } Intersection;
 
@@ -543,7 +580,11 @@ typedef enum AttributeStandard {
 /* Closure data */
 
 #ifdef __MULTI_CLOSURE__
-#define MAX_CLOSURE 64
+#  ifndef __MAX_CLOSURE__
+#     define MAX_CLOSURE 64
+#  else
+#    define MAX_CLOSURE __MAX_CLOSURE__
+#  endif
 #else
 #define MAX_CLOSURE 1
 #endif
@@ -553,7 +594,7 @@ typedef enum AttributeStandard {
  *   does not put own padding trying to align this members.
  * - We make sure OSL pointer is also 16 bytes aligned.
  */
-typedef struct ShaderClosure {
+typedef ccl_addr_space struct ShaderClosure {
 	float3 weight;
 	float3 N;
 	float3 T;
@@ -638,78 +679,23 @@ enum ShaderDataFlag {
 
 struct KernelGlobals;
 
-typedef struct ShaderData {
-	/* position */
-	float3 P;
-	/* smooth normal for shading */
-	float3 N;
-	/* true geometric normal */
-	float3 Ng;
-	/* view/incoming direction */
-	float3 I;
-	/* shader id */
-	int shader;
-	/* booleans describing shader, see ShaderDataFlag */
-	int flag;
-
-	/* primitive id if there is one, ~0 otherwise */
-	int prim;
-
-	/* combined type and curve segment for hair */
-	int type;
-
-	/* parametric coordinates
-	 * - barycentric weights for triangles */
-	float u, v;
-	/* object id if there is one, ~0 otherwise */
-	int object;
-
-	/* motion blur sample time */
-	float time;
-	
-	/* length of the ray being shaded */
-	float ray_length;
-	
-	/* ray bounce depth */
-	int ray_depth;
-
-	/* ray transparent depth */
-	int transparent_depth;
-
-#ifdef __RAY_DIFFERENTIALS__
-	/* differential of P. these are orthogonal to Ng, not N */
-	differential3 dP;
-	/* differential of I */
-	differential3 dI;
-	/* differential of u, v */
-	differential du;
-	differential dv;
-#endif
-#ifdef __DPDU__
-	/* differential of P w.r.t. parametric coordinates. note that dPdu is
-	 * not readily suitable as a tangent for shading on triangles. */
-	float3 dPdu, dPdv;
-#endif
-
-#ifdef __OBJECT_MOTION__
-	/* object <-> world space transformations, cached to avoid
-	 * re-interpolating them constantly for shading */
-	Transform ob_tfm;
-	Transform ob_itfm;
+#ifdef __SPLIT_KERNEL__
+#define SD_VAR(type, what) ccl_global type *what;
+#define SD_CLOSURE_VAR(type, what, max_closure) type *what;
+#define TIDX (get_global_id(1) * get_global_size(0) + get_global_id(0))
+#define ccl_fetch(s, t) (s->t[TIDX])
+#define ccl_fetch_array(s, t, index) (&s->t[TIDX * MAX_CLOSURE + index])
+#else
+#define SD_VAR(type, what) type what;
+#define SD_CLOSURE_VAR(type, what, max_closure) type what[max_closure];
+#define ccl_fetch(s, t) (s->t)
+#define ccl_fetch_array(s, t, index) (&s->t[index])
 #endif
 
-	/* Closure data, we store a fixed array of closures */
-	ShaderClosure closure[MAX_CLOSURE];
-	int num_closure;
-	float randb_closure;
+typedef ccl_addr_space struct ShaderData {
 
-	/* ray start position, only set for backgrounds */
-	float3 ray_P;
-	differential3 ray_dP;
+#include "kernel_shaderdata_vars.h"
 
-#ifdef __OSL__
-	struct KernelGlobals *osl_globals;
-#endif
 } ShaderData;
 
 /* Path State */
@@ -867,7 +853,9 @@ typedef struct KernelFilm {
 
 #ifdef __KERNEL_DEBUG__
 	int pass_bvh_traversal_steps;
-	int pass_pad3, pass_pad4, pass_pad5;
+	int pass_bvh_traversed_instances;
+	int pass_ray_bounces;
+	int pass_pad3;
 #endif
 } KernelFilm;
 
@@ -895,6 +883,11 @@ typedef struct KernelIntegrator {
 	float inv_pdf_lights;
 	int pdf_background_res;
 
+	/* light portals */
+	float portal_pdf;
+	int num_portals;
+	int portal_offset;
+
 	/* bounces */
 	int min_bounce;
 	int max_bounce;
@@ -947,6 +940,8 @@ typedef struct KernelIntegrator {
 	int volume_max_steps;
 	float volume_step_size;
 	int volume_samples;
+
+	int pad;
 } KernelIntegrator;
 
 typedef struct KernelBVH {
@@ -980,9 +975,8 @@ typedef struct KernelCurves {
 } KernelCurves;
 
 typedef struct KernelTables {
-	int blackbody_offset;
 	int beckmann_offset;
-	int pad1, pad2;
+	int pad1, pad2, pad3;
 } KernelTables;
 
 typedef struct KernelData {
@@ -996,13 +990,64 @@ typedef struct KernelData {
 } KernelData;
 
 #ifdef __KERNEL_DEBUG__
-typedef struct DebugData {
+typedef ccl_addr_space struct DebugData {
 	// Total number of BVH node traversal steps and primitives intersections
 	// for the camera rays.
 	int num_bvh_traversal_steps;
+	int num_bvh_traversed_instances;
+	int num_ray_bounces;
 } DebugData;
 #endif
 
+/* Declarations required for split kernel */
+
+/* Macro for queues */
+/* Value marking queue's empty slot */
+#define QUEUE_EMPTY_SLOT -1
+
+/*
+* Queue 1 - Active rays
+* Queue 2 - Background queue
+* Queue 3 - Shadow ray cast kernel - AO
+* Queeu 4 - Shadow ray cast kernel - direct lighting
+*/
+#define NUM_QUEUES 4
+
+/* Queue names */
+enum QueueNumber {
+	QUEUE_ACTIVE_AND_REGENERATED_RAYS,         /* All active rays and regenerated rays are enqueued here */
+	QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,      /* All
+	                                            * 1.Background-hit rays,
+	                                            * 2.Rays that has exited path-iteration but needs to update output buffer
+	                                            * 3.Rays to be regenerated
+	                                            * are enqueued here */
+	QUEUE_SHADOW_RAY_CAST_AO_RAYS,             /* All rays for which a shadow ray should be cast to determine radiance
+	                                              contribution for AO are enqueued here */
+	QUEUE_SHADOW_RAY_CAST_DL_RAYS,             /* All rays for which a shadow ray should be cast to determine radiance
+	                                              contributuin for direct lighting are enqueued here */
+};
+
+/* We use RAY_STATE_MASK to get ray_state (enums 0 to 5) */
+#define RAY_STATE_MASK 0x007
+#define RAY_FLAG_MASK 0x0F8
+enum RayState {
+	RAY_ACTIVE = 0,             // Denotes ray is actively involved in path-iteration
+	RAY_INACTIVE = 1,           // Denotes ray has completed processing all samples and is inactive
+	RAY_UPDATE_BUFFER = 2,      // Denoted ray has exited path-iteration and needs to update output buffer
+	RAY_HIT_BACKGROUND = 3,     // Donotes ray has hit background
+	RAY_TO_REGENERATE = 4,      // Denotes ray has to be regenerated
+	RAY_REGENERATED = 5,        // Denotes ray has been regenerated
+	RAY_SKIP_DL = 6,            // Denotes ray should skip direct lighting
+	RAY_SHADOW_RAY_CAST_AO = 16, // Flag's ray has to execute shadow blocked function in AO part
+	RAY_SHADOW_RAY_CAST_DL = 32 // Flag's ray has to execute shadow blocked function in direct lighting part
+};
+
+#define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state))
+#define IS_STATE(ray_state, ray_index, state) ((ray_state[ray_index] & RAY_STATE_MASK) == state)
+#define ADD_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] | flag))
+#define REMOVE_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] & (~flag)))
+#define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag)
+
 CCL_NAMESPACE_END
 
 #endif /*  __KERNEL_TYPES_H__ */
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index 0300e1d4c7f..e06568457c6 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -627,7 +627,7 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 		step_size = kernel_data.integrator.volume_step_size;
 		/* compute exact steps in advance for malloc */
 		max_steps = max((int)ceilf(ray->t/step_size), 1);
-		if (max_steps > global_max_steps) {
+		if(max_steps > global_max_steps) {
 			max_steps = global_max_steps;
 			step_size = ray->t / (float)max_steps;
 		}
@@ -993,6 +993,48 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
 	volume_ray.t = FLT_MAX;
 
 	int stack_index = 0, enclosed_index = 0;
+
+#ifdef __VOLUME_RECORD_ALL__
+	Intersection hits[2*VOLUME_STACK_SIZE];
+	uint num_hits = scene_intersect_volume_all(kg,
+	                                           &volume_ray,
+	                                           hits,
+	                                           2*VOLUME_STACK_SIZE);
+	if(num_hits > 0) {
+		int enclosed_volumes[VOLUME_STACK_SIZE];
+		Intersection *isect = hits;
+
+		qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
+
+		for(uint hit = 0; hit < num_hits; ++hit, ++isect) {
+			ShaderData sd;
+			shader_setup_from_ray(kg, &sd, isect, &volume_ray, 0, 0);
+			if(sd.flag & SD_BACKFACING) {
+				/* If ray exited the volume and never entered to that volume
+				 * it means that camera is inside such a volume.
+				 */
+				bool is_enclosed = false;
+				for(int i = 0; i < enclosed_index; ++i) {
+					if(enclosed_volumes[i] == sd.object) {
+						is_enclosed = true;
+						break;
+					}
+				}
+				if(is_enclosed == false) {
+					stack[stack_index].object = sd.object;
+					stack[stack_index].shader = sd.shader;
+					++stack_index;
+				}
+			}
+			else {
+				/* If ray from camera enters the volume, this volume shouldn't
+				 * be added to the stack on exit.
+				 */
+				enclosed_volumes[enclosed_index++] = sd.object;
+			}
+		}
+	}
+#else
 	int enclosed_volumes[VOLUME_STACK_SIZE];
 	int step = 0;
 
@@ -1035,6 +1077,7 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
 		volume_ray.P = ray_offset(sd.P, -sd.Ng);
 		++step;
 	}
+#endif
 	/* stack_index of 0 means quick checks outside of the kernel gave false
 	 * positive, nothing to worry about, just we've wasted quite a few of
 	 * ticks just to come into conclusion that camera is in the air.
@@ -1097,4 +1140,49 @@ ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd
 	}
 }
 
+#ifdef __SUBSURFACE__
+ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg,
+                                                          Ray *ray,
+                                                          VolumeStack *stack)
+{
+	kernel_assert(kernel_data.integrator.use_volumes);
+
+	Ray volume_ray = *ray;
+
+#ifdef __VOLUME_RECORD_ALL__
+	Intersection hits[2*VOLUME_STACK_SIZE];
+	uint num_hits = scene_intersect_volume_all(kg,
+	                                           &volume_ray,
+	                                           hits,
+	                                           2*VOLUME_STACK_SIZE);
+	if(num_hits > 0) {
+		Intersection *isect = hits;
+
+		qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
+
+		for(uint hit = 0; hit < num_hits; ++hit, ++isect) {
+			ShaderData sd;
+			shader_setup_from_ray(kg, &sd, isect, &volume_ray, 0, 0);
+			kernel_volume_stack_enter_exit(kg, &sd, stack);
+		}
+	}
+#else
+	Intersection isect;
+	int step = 0;
+	while(step < 2 * VOLUME_STACK_SIZE &&
+	      scene_intersect_volume(kg, &volume_ray, &isect))
+	{
+		ShaderData sd;
+		shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0);
+		kernel_volume_stack_enter_exit(kg, &sd, stack);
+
+		/* Move ray forward. */
+		volume_ray.P = ray_offset(sd.P, -sd.Ng);
+		volume_ray.t -= sd.ray_length;
+		++step;
+	}
+#endif
+}
+#endif
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
new file mode 100644
index 00000000000..9b83d972e97
--- /dev/null
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_WORK_STEALING_H__
+#define __KERNEL_WORK_STEALING_H__
+
+/*
+ * Utility functions for work stealing
+ */
+
+#ifdef __WORK_STEALING__
+
+#ifdef __KERNEL_OPENCL__
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#endif
+
+uint get_group_id_with_ray_index(uint ray_index,
+                                 uint tile_dim_x,
+                                 uint tile_dim_y,
+                                 uint parallel_samples,
+                                 int dim)
+{
+	if(dim == 0) {
+		uint x_span = ray_index % (tile_dim_x * parallel_samples);
+		return x_span / get_local_size(0);
+	}
+	else /*if(dim == 1)*/ {
+		kernel_assert(dim == 1);
+		uint y_span = ray_index / (tile_dim_x * parallel_samples);
+		return y_span / get_local_size(1);
+	}
+}
+
+uint get_total_work(uint tile_dim_x,
+                    uint tile_dim_y,
+                    uint grp_idx,
+                    uint grp_idy,
+                    uint num_samples)
+{
+	uint threads_within_tile_border_x =
+		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
+		                                     : get_local_size(0);
+	uint threads_within_tile_border_y =
+		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
+		                                     : get_local_size(1);
+
+	threads_within_tile_border_x =
+		(threads_within_tile_border_x == 0) ? get_local_size(0)
+		                                    : threads_within_tile_border_x;
+	threads_within_tile_border_y =
+		(threads_within_tile_border_y == 0) ? get_local_size(1)
+		                                    : threads_within_tile_border_y;
+
+	return threads_within_tile_border_x *
+	       threads_within_tile_border_y *
+	       num_samples;
+}
+
+/* Returns 0 in case there is no next work available */
+/* Returns 1 in case work assigned is valid */
+int get_next_work(ccl_global uint *work_pool,
+                  ccl_private uint *my_work,
+                  uint tile_dim_x,
+                  uint tile_dim_y,
+                  uint num_samples,
+                  uint parallel_samples,
+                  uint ray_index)
+{
+	uint grp_idx = get_group_id_with_ray_index(ray_index,
+	                                           tile_dim_x,
+	                                           tile_dim_y,
+	                                           parallel_samples,
+	                                           0);
+	uint grp_idy = get_group_id_with_ray_index(ray_index,
+	                                           tile_dim_x,
+	                                           tile_dim_y,
+	                                           parallel_samples,
+	                                           1);
+	uint total_work = get_total_work(tile_dim_x,
+	                                 tile_dim_y,
+	                                 grp_idx,
+	                                 grp_idy,
+	                                 num_samples);
+	uint group_index = grp_idy * get_num_groups(0) + grp_idx;
+	*my_work = atomic_inc(&work_pool[group_index]);
+	return (*my_work < total_work) ? 1 : 0;
+}
+
+/* This function assumes that the passed my_work is valid. */
+/* Decode sample number w.r.t. assigned my_work. */
+uint get_my_sample(uint my_work,
+                   uint tile_dim_x,
+                   uint tile_dim_y,
+                   uint parallel_samples,
+                   uint ray_index)
+{
+	uint grp_idx = get_group_id_with_ray_index(ray_index,
+	                                           tile_dim_x,
+	                                           tile_dim_y,
+	                                           parallel_samples,
+	                                           0);
+	uint grp_idy = get_group_id_with_ray_index(ray_index,
+	                                           tile_dim_x,
+	                                           tile_dim_y,
+	                                           parallel_samples,
+	                                           1);
+	uint threads_within_tile_border_x =
+		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
+		                                     : get_local_size(0);
+	uint threads_within_tile_border_y =
+		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
+		                                     : get_local_size(1);
+
+	threads_within_tile_border_x =
+		(threads_within_tile_border_x == 0) ? get_local_size(0)
+		                                    : threads_within_tile_border_x;
+	threads_within_tile_border_y =
+		(threads_within_tile_border_y == 0) ? get_local_size(1)
+		                                    : threads_within_tile_border_y;
+
+	return my_work /
+	       (threads_within_tile_border_x * threads_within_tile_border_y);
+}
+
+/* Decode pixel and tile position w.r.t. assigned my_work. */
+void get_pixel_tile_position(ccl_private uint *pixel_x,
+                             ccl_private uint *pixel_y,
+                             ccl_private uint *tile_x,
+                             ccl_private uint *tile_y,
+                             uint my_work,
+                             uint tile_dim_x,
+                             uint tile_dim_y,
+                             uint tile_offset_x,
+                             uint tile_offset_y,
+                             uint parallel_samples,
+                             uint ray_index)
+{
+	uint grp_idx = get_group_id_with_ray_index(ray_index,
+	                                           tile_dim_x,
+	                                           tile_dim_y,
+	                                           parallel_samples,
+	                                           0);
+	uint grp_idy = get_group_id_with_ray_index(ray_index,
+	                                           tile_dim_x,
+	                                           tile_dim_y,
+	                                           parallel_samples,
+	                                           1);
+	uint threads_within_tile_border_x =
+		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
+		                                     : get_local_size(0);
+	uint threads_within_tile_border_y =
+		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
+		                                     : get_local_size(1);
+
+	threads_within_tile_border_x =
+		(threads_within_tile_border_x == 0) ? get_local_size(0)
+		                                    : threads_within_tile_border_x;
+	threads_within_tile_border_y =
+		(threads_within_tile_border_y == 0) ? get_local_size(1)
+		                                    : threads_within_tile_border_y;
+
+	uint total_associated_pixels =
+		threads_within_tile_border_x * threads_within_tile_border_y;
+	uint work_group_pixel_index = my_work % total_associated_pixels;
+	uint work_group_pixel_x =
+		work_group_pixel_index % threads_within_tile_border_x;
+	uint work_group_pixel_y =
+		work_group_pixel_index / threads_within_tile_border_x;
+
+	*pixel_x =
+		tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x;
+	*pixel_y =
+		tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y;
+	*tile_x = *pixel_x - tile_offset_x;
+	*tile_y = *pixel_y - tile_offset_y;
+}
+
+#endif  /* __WORK_STEALING__ */
+
+#endif  /* __KERNEL_WORK_STEALING_H__ */
diff --git a/intern/cycles/kernel/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp
index 013eeff57fa..37a73ab2f04 100644
--- a/intern/cycles/kernel/kernel.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp
@@ -23,6 +23,7 @@
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
+#include "kernel_path_branched.h"
 #include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
@@ -55,7 +56,7 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
 		int id = atoi(name + strlen("__tex_image_float_"));
 		int array_index = id;
 
-		if (array_index >= 0 && array_index < MAX_FLOAT_IMAGES) {
+		if(array_index >= 0 && array_index < MAX_FLOAT_IMAGES) {
 			tex = &kg->texture_float_images[array_index];
 		}
 
@@ -70,7 +71,7 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
 		int id = atoi(name + strlen("__tex_image_"));
 		int array_index = id - MAX_FLOAT_IMAGES;
 
-		if (array_index >= 0 && array_index < MAX_BYTE_IMAGES) {
+		if(array_index >= 0 && array_index < MAX_BYTE_IMAGES) {
 			tex = &kg->texture_byte_images[array_index];
 		}
 
diff --git a/intern/cycles/kernel/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
index f1027ad413d..df77bedc729 100644
--- a/intern/cycles/kernel/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
@@ -38,6 +38,7 @@
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
+#include "kernel_path_branched.h"
 #include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
index b2f16ff54d8..b3192369794 100644
--- a/intern/cycles/kernel/kernel_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
@@ -39,6 +39,7 @@
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
+#include "kernel_path_branched.h"
 #include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
index cc8c603e8f8..f9c5134e442 100644
--- a/intern/cycles/kernel/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
@@ -34,6 +34,7 @@
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
+#include "kernel_path_branched.h"
 #include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
index 20919a4f26e..2dbe4b81821 100644
--- a/intern/cycles/kernel/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
@@ -36,6 +36,7 @@
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
+#include "kernel_path_branched.h"
 #include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
index 48579d3b7e5..5c57ad01181 100644
--- a/intern/cycles/kernel/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
@@ -37,6 +37,7 @@
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
+#include "kernel_path_branched.h"
 #include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 64069fc049f..bcd55b8c676 100644
--- a/intern/cycles/kernel/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -16,13 +16,14 @@
 
 /* CUDA kernel entry points */
 
-#include "kernel_compat_cuda.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_film.h"
-#include "kernel_path.h"
-#include "kernel_bake.h"
+#include "../../kernel_compat_cuda.h"
+#include "../../kernel_math.h"
+#include "../../kernel_types.h"
+#include "../../kernel_globals.h"
+#include "../../kernel_film.h"
+#include "../../kernel_path.h"
+#include "../../kernel_path_branched.h"
+#include "../../kernel_bake.h"
 
 /* device data taken from CUDA occupancy calculator */
 
diff --git a/intern/cycles/kernel/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index 5a47260a4ee..15fb34cfe3b 100644
--- a/intern/cycles/kernel/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -16,14 +16,17 @@
 
 /* OpenCL kernel entry points - unfinished */
 
-#include "kernel_compat_opencl.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
+#include "../../kernel_compat_opencl.h"
+#include "../../kernel_math.h"
+#include "../../kernel_types.h"
+#include "../../kernel_globals.h"
 
-#include "kernel_film.h"
-#include "kernel_path.h"
-#include "kernel_bake.h"
+#include "../../kernel_film.h"
+#include "../../kernel_path.h"
+#include "../../kernel_path_branched.h"
+#include "../../kernel_bake.h"
+
+#ifdef __COMPILE_ONLY_MEGAKERNEL__
 
 __kernel void kernel_ocl_path_trace(
 	ccl_constant KernelData *data,
@@ -32,7 +35,7 @@ __kernel void kernel_ocl_path_trace(
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
 	int sample,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -43,7 +46,7 @@ __kernel void kernel_ocl_path_trace(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
 	int x = sx + get_global_id(0);
 	int y = sy + get_global_id(1);
@@ -52,17 +55,18 @@ __kernel void kernel_ocl_path_trace(
 		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
 }
 
-__kernel void kernel_ocl_convert_to_byte(
+#else // __COMPILE_ONLY_MEGAKERNEL__
+
+__kernel void kernel_ocl_shader(
 	ccl_constant KernelData *data,
-	ccl_global uchar4 *rgba,
-	ccl_global float *buffer,
+	ccl_global uint4 *input,
+	ccl_global float4 *output,
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
-	float sample_scale,
-	int sx, int sy, int sw, int sh, int offset, int stride)
+	int type, int sx, int sw, int offset, int sample)
 {
 	KernelGlobals kglobals, *kg = &kglobals;
 
@@ -70,26 +74,24 @@ __kernel void kernel_ocl_convert_to_byte(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
 	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
 
-	if(x < sx + sw && y < sy + sh)
-		kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+	if(x < sx + sw)
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
 }
 
-__kernel void kernel_ocl_convert_to_half_float(
+__kernel void kernel_ocl_bake(
 	ccl_constant KernelData *data,
-	ccl_global uchar4 *rgba,
-	ccl_global float *buffer,
+	ccl_global uint4 *input,
+	ccl_global float4 *output,
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
-	float sample_scale,
-	int sx, int sy, int sw, int sh, int offset, int stride)
+	int type, int sx, int sw, int offset, int sample)
 {
 	KernelGlobals kglobals, *kg = &kglobals;
 
@@ -97,25 +99,36 @@ __kernel void kernel_ocl_convert_to_half_float(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
 	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
 
-	if(x < sx + sw && y < sy + sh)
-		kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+	if(x < sx + sw) {
+#if defined(__KERNEL_OPENCL_NVIDIA__) && __COMPUTE_CAPABILITY__ < 300
+		/* NVidia compiler is spending infinite amount of time trying
+		 * to deal with kernel_bake_evaluate() on architectures prior
+		 * to sm_30.
+		 * For now we disable baking kernel for those devices, so at
+		 * least rendering with split kernel could be compiled.
+		 */
+		output[x] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+#else
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample);
+#endif
+	}
 }
 
-__kernel void kernel_ocl_shader(
+__kernel void kernel_ocl_convert_to_byte(
 	ccl_constant KernelData *data,
-	ccl_global uint4 *input,
-	ccl_global float4 *output,
+	ccl_global uchar4 *rgba,
+	ccl_global float *buffer,
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
-	int type, int sx, int sw, int offset, int sample)
+	float sample_scale,
+	int sx, int sy, int sw, int sh, int offset, int stride)
 {
 	KernelGlobals kglobals, *kg = &kglobals;
 
@@ -123,24 +136,26 @@ __kernel void kernel_ocl_shader(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
 	int x = sx + get_global_id(0);
+	int y = sy + get_global_id(1);
 
-	if(x < sx + sw)
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
+	if(x < sx + sw && y < sy + sh)
+		kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
-__kernel void kernel_ocl_bake(
+__kernel void kernel_ocl_convert_to_half_float(
 	ccl_constant KernelData *data,
-	ccl_global uint4 *input,
-	ccl_global float4 *output,
+	ccl_global uchar4 *rgba,
+	ccl_global float *buffer,
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
-	int type, int sx, int sw, int offset, int sample)
+	float sample_scale,
+	int sx, int sy, int sw, int sh, int offset, int stride)
 {
 	KernelGlobals kglobals, *kg = &kglobals;
 
@@ -148,11 +163,13 @@ __kernel void kernel_ocl_bake(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
 	int x = sx + get_global_id(0);
+	int y = sy + get_global_id(1);
 
-	if(x < sx + sw)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample);
+	if(x < sx + sw && y < sy + sh)
+		kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
+#endif // __COMPILE_ONLY_MEGAKERNEL__
+\ No newline at end of file
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
new file mode 100644
index 00000000000..eff77b89a0a
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
@@ -0,0 +1,128 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_background_buffer_update.h"
+
+__kernel void kernel_ocl_path_trace_background_buffer_update(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_state,
+        ccl_global uint *rng_coop,             /* Required for buffer Update */
+        ccl_global float3 *throughput_coop,    /* Required for background hit processing */
+        PathRadiance *PathRadiance_coop,       /* Required for background hit processing and buffer Update */
+        ccl_global Ray *Ray_coop,              /* Required for background hit processing */
+        ccl_global PathState *PathState_coop,  /* Required for background hit processing */
+        ccl_global float *L_transparent_coop,  /* Required for background hit processing and buffer Update */
+        ccl_global char *ray_state,            /* Stores information on the current state of a ray */
+        int sw, int sh, int sx, int sy, int stride,
+        int rng_state_offset_x,
+        int rng_state_offset_y,
+        int rng_state_stride,
+        ccl_global unsigned int *work_array,   /* Denotes work of each ray */
+        ccl_global int *Queue_data,            /* Queues memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize,                         /* Size (capacity) of each queue */
+        int end_sample,
+        int start_sample,
+#ifdef __WORK_STEALING__
+        ccl_global unsigned int *work_pool_wgs,
+        unsigned int num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples)                  /* Number of samples to be processed in parallel */
+{
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	if(ray_index == 0) {
+		/* We will empty this queue in this kernel. */
+		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+	}
+	char enqueue_flag = 0;
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          1);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices,
+	 * expect all threads to execute barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		enqueue_flag =
+			kernel_background_buffer_update(globals,
+			                                data,
+			                                shader_data,
+			                                per_sample_output_buffers,
+			                                rng_state,
+			                                rng_coop,
+			                                throughput_coop,
+			                                PathRadiance_coop,
+			                                Ray_coop,
+			                                PathState_coop,
+			                                L_transparent_coop,
+			                                ray_state,
+			                                sw, sh, sx, sy, stride,
+			                                rng_state_offset_x,
+			                                rng_state_offset_y,
+			                                rng_state_stride,
+			                                work_array,
+			                                end_sample,
+			                                start_sample,
+#ifdef __WORK_STEALING__
+			                                work_pool_wgs,
+			                                num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+			                                debugdata_coop,
+#endif
+			                                parallel_samples,
+			                                ray_index);
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	 * These rays will be made active during next SceneIntersectkernel.
+	 */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
new file mode 100644
index 00000000000..c3277676029
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
@@ -0,0 +1,241 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_data_init.h"
+
+__kernel void kernel_ocl_path_trace_data_init(
+        ccl_global char *globals,
+        ccl_global char *shader_data_sd,                  /* Arguments related to ShaderData */
+        ccl_global char *shader_data_sd_DL_shadow,        /* Arguments related to ShaderData */
+
+        ccl_global float3 *P_sd,
+        ccl_global float3 *P_sd_DL_shadow,
+
+        ccl_global float3 *N_sd,
+        ccl_global float3 *N_sd_DL_shadow,
+
+        ccl_global float3 *Ng_sd,
+        ccl_global float3 *Ng_sd_DL_shadow,
+
+        ccl_global float3 *I_sd,
+        ccl_global float3 *I_sd_DL_shadow,
+
+        ccl_global int *shader_sd,
+        ccl_global int *shader_sd_DL_shadow,
+
+        ccl_global int *flag_sd,
+        ccl_global int *flag_sd_DL_shadow,
+
+        ccl_global int *prim_sd,
+        ccl_global int *prim_sd_DL_shadow,
+
+        ccl_global int *type_sd,
+        ccl_global int *type_sd_DL_shadow,
+
+        ccl_global float *u_sd,
+        ccl_global float *u_sd_DL_shadow,
+
+        ccl_global float *v_sd,
+        ccl_global float *v_sd_DL_shadow,
+
+        ccl_global int *object_sd,
+        ccl_global int *object_sd_DL_shadow,
+
+        ccl_global float *time_sd,
+        ccl_global float *time_sd_DL_shadow,
+
+        ccl_global float *ray_length_sd,
+        ccl_global float *ray_length_sd_DL_shadow,
+
+        ccl_global int *ray_depth_sd,
+        ccl_global int *ray_depth_sd_DL_shadow,
+
+        ccl_global int *transparent_depth_sd,
+        ccl_global int *transparent_depth_sd_DL_shadow,
+
+        /* Ray differentials. */
+        ccl_global differential3 *dP_sd,
+        ccl_global differential3 *dP_sd_DL_shadow,
+
+        ccl_global differential3 *dI_sd,
+        ccl_global differential3 *dI_sd_DL_shadow,
+
+        ccl_global differential *du_sd,
+        ccl_global differential *du_sd_DL_shadow,
+
+        ccl_global differential *dv_sd,
+        ccl_global differential *dv_sd_DL_shadow,
+
+        /* Dp/Du */
+        ccl_global float3 *dPdu_sd,
+        ccl_global float3 *dPdu_sd_DL_shadow,
+
+        ccl_global float3 *dPdv_sd,
+        ccl_global float3 *dPdv_sd_DL_shadow,
+
+        /* Object motion. */
+        ccl_global Transform *ob_tfm_sd,
+        ccl_global Transform *ob_tfm_sd_DL_shadow,
+
+        ccl_global Transform *ob_itfm_sd,
+        ccl_global Transform *ob_itfm_sd_DL_shadow,
+
+        ShaderClosure *closure_sd,
+        ShaderClosure *closure_sd_DL_shadow,
+
+        ccl_global int *num_closure_sd,
+        ccl_global int *num_closure_sd_DL_shadow,
+
+        ccl_global float *randb_closure_sd,
+        ccl_global float *randb_closure_sd_DL_shadow,
+
+        ccl_global float3 *ray_P_sd,
+        ccl_global float3 *ray_P_sd_DL_shadow,
+
+        ccl_global differential3 *ray_dP_sd,
+        ccl_global differential3 *ray_dP_sd_DL_shadow,
+
+        ccl_constant KernelData *data,
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_state,
+        ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
+        ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
+        ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
+        PathRadiance *PathRadiance_coop,             /* PathRadiance array to store PathRadiance values for all rays */
+        ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
+        ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
+        ccl_global char *ray_state,                  /* Stores information on current state of a ray */
+
+#define KERNEL_TEX(type, ttype, name)                                   \
+        ccl_global type *name,
+#include "../../kernel_textures.h"
+
+        int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
+        int rng_state_offset_x,
+        int rng_state_offset_y,
+        int rng_state_stride,
+        ccl_global int *Queue_data,                  /* Memory for queues */
+        ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
+        int queuesize,                               /* size (capacity) of the queue */
+        ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
+        ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
+#ifdef __WORK_STEALING__
+        ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
+        unsigned int num_samples,                    /* Total number of samples per pixel */
+#endif
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples)                        /* Number of samples to be processed in parallel */
+{
+	kernel_data_init(globals,
+	                 shader_data_sd,
+	                 shader_data_sd_DL_shadow,
+	                 P_sd,
+	                 P_sd_DL_shadow,
+	                 N_sd,
+	                 N_sd_DL_shadow,
+	                 Ng_sd,
+	                 Ng_sd_DL_shadow,
+	                 I_sd,
+	                 I_sd_DL_shadow,
+	                 shader_sd,
+	                 shader_sd_DL_shadow,
+	                 flag_sd,
+	                 flag_sd_DL_shadow,
+	                 prim_sd,
+	                 prim_sd_DL_shadow,
+	                 type_sd,
+	                 type_sd_DL_shadow,
+	                 u_sd,
+	                 u_sd_DL_shadow,
+	                 v_sd,
+	                 v_sd_DL_shadow,
+	                 object_sd,
+	                 object_sd_DL_shadow,
+	                 time_sd,
+	                 time_sd_DL_shadow,
+	                 ray_length_sd,
+	                 ray_length_sd_DL_shadow,
+	                 ray_depth_sd,
+	                 ray_depth_sd_DL_shadow,
+	                 transparent_depth_sd,
+	                 transparent_depth_sd_DL_shadow,
+
+	                 /* Ray differentials. */
+	                 dP_sd,
+	                 dP_sd_DL_shadow,
+	                 dI_sd,
+	                 dI_sd_DL_shadow,
+	                 du_sd,
+	                 du_sd_DL_shadow,
+	                 dv_sd,
+	                 dv_sd_DL_shadow,
+
+	                 /* Dp/Du */
+	                 dPdu_sd,
+	                 dPdu_sd_DL_shadow,
+	                 dPdv_sd,
+	                 dPdv_sd_DL_shadow,
+
+	                 /* Object motion. */
+	                 ob_tfm_sd,
+	                 ob_tfm_sd_DL_shadow,
+	                 ob_itfm_sd,
+	                 ob_itfm_sd_DL_shadow,
+
+	                 closure_sd,
+	                 closure_sd_DL_shadow,
+	                 num_closure_sd,
+	                 num_closure_sd_DL_shadow,
+	                 randb_closure_sd,
+	                 randb_closure_sd_DL_shadow,
+	                 ray_P_sd,
+	                 ray_P_sd_DL_shadow,
+	                 ray_dP_sd,
+	                 ray_dP_sd_DL_shadow,
+	                 data,
+	                 per_sample_output_buffers,
+	                 rng_state,
+	                 rng_coop,
+	                 throughput_coop,
+	                 L_transparent_coop,
+	                 PathRadiance_coop,
+	                 Ray_coop,
+	                 PathState_coop,
+	                 ray_state,
+
+#define KERNEL_TEX(type, ttype, name) name,
+#include "../../kernel_textures.h"
+
+	                 start_sample, sx, sy, sw, sh, offset, stride,
+	                 rng_state_offset_x,
+	                 rng_state_offset_y,
+	                 rng_state_stride,
+	                 Queue_data,
+	                 Queue_index,
+	                 queuesize,
+	                 use_queues_flag,
+	                 work_array,
+#ifdef __WORK_STEALING__
+	                 work_pool_wgs,
+	                 num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+	                 debugdata_coop,
+#endif
+	                 parallel_samples);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
new file mode 100644
index 00000000000..6ec75013b3a
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_direct_lighting.h"
+
+__kernel void kernel_ocl_path_trace_direct_lighting(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,           /* Required for direct lighting */
+        ccl_global char *shader_DL,             /* Required for direct lighting */
+        ccl_global uint *rng_coop,              /* Required for direct lighting */
+        ccl_global PathState *PathState_coop,   /* Required for direct lighting */
+        ccl_global int *ISLamp_coop,            /* Required for direct lighting */
+        ccl_global Ray *LightRay_coop,          /* Required for direct lighting */
+        ccl_global BsdfEval *BSDFEval_coop,     /* Required for direct lighting */
+        ccl_global char *ray_state,             /* Denotes the state of each ray */
+        ccl_global int *Queue_data,             /* Queue memory */
+        ccl_global int *Queue_index,            /* Tracks the number of elements in each queue */
+        int queuesize)                          /* Size (capacity) of each queue */
+{
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	char enqueue_flag = 0;
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices,
+	 * expect all threads to execute barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		enqueue_flag = kernel_direct_lighting(globals,
+		                                      data,
+		                                      shader_data,
+		                                      shader_DL,
+		                                      rng_coop,
+		                                      PathState_coop,
+		                                      ISLamp_coop,
+		                                      LightRay_coop,
+		                                      BSDFEval_coop,
+		                                      ray_state,
+		                                      ray_index);
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+#ifdef __EMISSION__
+	/* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
+#endif
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
new file mode 100644
index 00000000000..ae5f5cd1b3b
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+
+__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Required throughout the kernel except probabilistic path termination and AO */
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_coop,             /* Required for "kernel_write_data_passes" and AO */
+        ccl_global float3 *throughput_coop,    /* Required for handling holdout material and AO */
+        ccl_global float *L_transparent_coop,  /* Required for handling holdout material */
+        PathRadiance *PathRadiance_coop,       /* Required for "kernel_write_data_passes" and indirect primitive emission */
+        ccl_global PathState *PathState_coop,  /* Required throughout the kernel and AO */
+        Intersection *Intersection_coop,       /* Required for indirect primitive emission */
+        ccl_global float3 *AOAlpha_coop,       /* Required for AO */
+        ccl_global float3 *AOBSDF_coop,        /* Required for AO */
+        ccl_global Ray *AOLightRay_coop,       /* Required for AO */
+        int sw, int sh, int sx, int sy, int stride,
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        ccl_global unsigned int *work_array,   /* Denotes the work that each ray belongs to */
+        ccl_global int *Queue_data,            /* Queue memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize,                         /* Size (capacity) of each queue */
+#ifdef __WORK_STEALING__
+        unsigned int start_sample,
+#endif
+        int parallel_samples)                  /* Number of samples to be processed in parallel */
+{
+	ccl_local unsigned int local_queue_atomics_bg;
+	ccl_local unsigned int local_queue_atomics_ao;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics_bg = 0;
+		local_queue_atomics_ao = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	char enqueue_flag = 0;
+	char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices,
+	 * expect all threads to execute barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif  /* __COMPUTE_DEVICE_GPU__ */
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		kernel_holdout_emission_blurring_pathtermination_ao(
+		        globals,
+		        data,
+		        shader_data,
+		        per_sample_output_buffers,
+		        rng_coop,
+		        throughput_coop,
+		        L_transparent_coop,
+		        PathRadiance_coop,
+		        PathState_coop,
+		        Intersection_coop,
+		        AOAlpha_coop,
+		        AOBSDF_coop,
+		        AOLightRay_coop,
+		        sw, sh, sx, sy, stride,
+		        ray_state,
+		        work_array,
+#ifdef __WORK_STEALING__
+		        start_sample,
+#endif
+		        parallel_samples,
+		        ray_index,
+		        &enqueue_flag,
+		        &enqueue_flag_AO_SHADOW_RAY_CAST);
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_UPDATE_BUFFER rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics_bg,
+	                        Queue_data,
+	                        Queue_index);
+
+#ifdef __AO__
+	/* Enqueue to-shadow-ray-cast rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SHADOW_RAY_CAST_AO_RAYS,
+	                        enqueue_flag_AO_SHADOW_RAY_CAST,
+	                        queuesize,
+	                        &local_queue_atomics_ao,
+	                        Queue_data,
+	                        Queue_index);
+#endif
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
new file mode 100644
index 00000000000..1bc7808d834
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_lamp_emission.h"
+
+__kernel void kernel_ocl_path_trace_lamp_emission(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Required for lamp emission */
+        ccl_global float3 *throughput_coop,    /* Required for lamp emission */
+        PathRadiance *PathRadiance_coop,       /* Required for lamp emission */
+        ccl_global Ray *Ray_coop,              /* Required for lamp emission */
+        ccl_global PathState *PathState_coop,  /* Required for lamp emission */
+        Intersection *Intersection_coop,       /* Required for lamp emission */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        int sw, int sh,
+        ccl_global int *Queue_data,            /* Memory for queues */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in queues */
+        int queuesize,                         /* Size (capacity) of queues */
+        ccl_global char *use_queues_flag,      /* Used to decide if this kernel should use
+                                                * queues to fetch ray index
+                                                */
+        int parallel_samples)                  /* Number of samples to be processed in parallel */
+{
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+
+	/* We will empty this queue in this kernel. */
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+	}
+	/* Fetch use_queues_flag. */
+	ccl_local char local_use_queues_flag;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_use_queues_flag = use_queues_flag[0];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index;
+	if(local_use_queues_flag) {
+		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+		ray_index = get_ray_index(thread_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          Queue_data,
+		                          queuesize,
+		                          1);
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	} else {
+		if(x < (sw * parallel_samples) && y < sh){
+			ray_index = x + y * (sw * parallel_samples);
+		} else {
+			return;
+		}
+	}
+
+	kernel_lamp_emission(globals,
+	                     data,
+	                     shader_data,
+	                     throughput_coop,
+	                     PathRadiance_coop,
+	                     Ray_coop,
+	                     PathState_coop,
+	                     Intersection_coop,
+	                     ray_state,
+	                     sw, sh,
+	                     use_queues_flag,
+	                     parallel_samples,
+	                     ray_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
new file mode 100644
index 00000000000..dcf4db40411
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
@@ -0,0 +1,115 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_next_iteration_setup.h"
+
+__kernel void kernel_ocl_path_trace_next_iteration_setup(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,         /* Required for setting up ray for next iteration */
+        ccl_global uint *rng_coop,            /* Required for setting up ray for next iteration */
+        ccl_global float3 *throughput_coop,   /* Required for setting up ray for next iteration */
+        PathRadiance *PathRadiance_coop,      /* Required for setting up ray for next iteration */
+        ccl_global Ray *Ray_coop,             /* Required for setting up ray for next iteration */
+        ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
+        ccl_global Ray *LightRay_dl_coop,     /* Required for radiance update - direct lighting */
+        ccl_global int *ISLamp_coop,          /* Required for radiance update - direct lighting */
+        ccl_global BsdfEval *BSDFEval_coop,   /* Required for radiance update - direct lighting */
+        ccl_global Ray *LightRay_ao_coop,     /* Required for radiance update - AO */
+        ccl_global float3 *AOBSDF_coop,       /* Required for radiance update - AO */
+        ccl_global float3 *AOAlpha_coop,      /* Required for radiance update - AO */
+        ccl_global char *ray_state,           /* Denotes the state of each ray */
+        ccl_global int *Queue_data,           /* Queue memory */
+        ccl_global int *Queue_index,          /* Tracks the number of elements in each queue */
+        int queuesize,                        /* Size (capacity) of each queue */
+        ccl_global char *use_queues_flag)     /* flag to decide if scene_intersect kernel should
+                                               * use queues to fetch ray index */
+{
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		/* If we are here, then it means that scene-intersect kernel
+		* has already been executed atleast once. From the next time,
+		* scene-intersect kernel may operate on queues to fetch ray index
+		*/
+		use_queues_flag[0] = 1;
+
+		/* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and
+		 * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the
+		 * previous kernel.
+		 */
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+	}
+
+	char enqueue_flag = 0;
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices,
+	 * expect all threads to execute barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		enqueue_flag = kernel_next_iteration_setup(globals,
+		                                           data,
+		                                           shader_data,
+		                                           rng_coop,
+		                                           throughput_coop,
+		                                           PathRadiance_coop,
+		                                           Ray_coop,
+		                                           PathState_coop,
+		                                           LightRay_dl_coop,
+		                                           ISLamp_coop,
+		                                           BSDFEval_coop,
+		                                           LightRay_ao_coop,
+		                                           AOBSDF_coop,
+		                                           AOAlpha_coop,
+		                                           ray_state,
+		                                           use_queues_flag,
+		                                           ray_index);
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_UPDATE_BUFFER rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
new file mode 100644
index 00000000000..3156dc255fb
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../../kernel_compat_opencl.h"
+#include "../../kernel_math.h"
+#include "../../kernel_types.h"
+#include "../../kernel_globals.h"
+#include "../../kernel_queues.h"
+
+/*
+ * The kernel "kernel_queue_enqueue" enqueues rays of
+ * different ray state into their appropriate Queues;
+ * 1. Rays that have been determined to hit the background from the
+ * "kernel_scene_intersect" kernel
+ * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+ * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * The input and output of the kernel is as follows,
+ *
+ * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                           |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                           |
+ * queuesize -------------------------------------------|                           |
+ *
+ * Note on Queues :
+ * State of queues during the first time this kernel is called :
+ * At entry,
+ * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
+ * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays.
+ *
+ * State of queue during other times this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
+ */
+__kernel void kernel_ocl_path_trace_queue_enqueue(
+        ccl_global int *Queue_data,   /* Queue memory */
+        ccl_global int *Queue_index,  /* Tracks the number of elements in each queue */
+        ccl_global char *ray_state,   /* Denotes the state of each ray */
+        int queuesize)                /* Size (capacity) of each queue */
+{
+	/* We have only 2 cases (Hit/Not-Hit) */
+	ccl_local unsigned int local_queue_atomics[2];
+
+	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+
+	if(lidx < 2 ) {
+		local_queue_atomics[lidx] = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int queue_number = -1;
+
+	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+		queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+	}
+	else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	}
+
+	unsigned int my_lqidx;
+	if(queue_number != -1) {
+		my_lqidx = get_local_queue_index(queue_number, local_queue_atomics);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(lidx == 0) {
+		local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] =
+		        get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                                    local_queue_atomics,
+		                                    Queue_index);
+		local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] =
+		        get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+		                                    local_queue_atomics,
+		                                    Queue_index);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	unsigned int my_gqidx;
+	if(queue_number != -1) {
+		my_gqidx = get_global_queue_index(queue_number,
+		                                  queuesize,
+		                                  my_lqidx,
+		                                  local_queue_atomics);
+		Queue_data[my_gqidx] = ray_index;
+	}
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
new file mode 100644
index 00000000000..e5fad7bce50
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_scene_intersect.h"
+
+__kernel void kernel_ocl_path_trace_scene_intersect(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global uint *rng_coop,
+        ccl_global Ray *Ray_coop,              /* Required for scene_intersect */
+        ccl_global PathState *PathState_coop,  /* Required for scene_intersect */
+        Intersection *Intersection_coop,       /* Required for scene_intersect */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        int sw, int sh,
+        ccl_global int *Queue_data,            /* Memory for queues */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in queues */
+        int queuesize,                         /* Size (capacity) of queues */
+        ccl_global char *use_queues_flag,      /* used to decide if this kernel should use
+                                                * queues to fetch ray index */
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples)                  /* Number of samples to be processed in parallel */
+{
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+
+	/* Fetch use_queues_flag */
+	ccl_local char local_use_queues_flag;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_use_queues_flag = use_queues_flag[0];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index;
+	if(local_use_queues_flag) {
+		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+		ray_index = get_ray_index(thread_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          Queue_data,
+		                          queuesize,
+		                          0);
+
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	} else {
+		if(x < (sw * parallel_samples) && y < sh){
+			ray_index = x + y * (sw * parallel_samples);
+		} else {
+			return;
+		}
+	}
+
+	kernel_scene_intersect(globals,
+	                       data,
+	                       rng_coop,
+	                       Ray_coop,
+	                       PathState_coop,
+	                       Intersection_coop,
+	                       ray_state,
+	                       sw, sh,
+	                       use_queues_flag,
+#ifdef __KERNEL_DEBUG__
+	                       debugdata_coop,
+#endif
+	                       parallel_samples,
+	                       ray_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
new file mode 100644
index 00000000000..b9f616e6bdf
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_shader_eval.h"
+
+__kernel void kernel_ocl_path_trace_shader_eval(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Output ShaderData structure to be filled */
+        ccl_global uint *rng_coop,             /* Required for rbsdf calculation */
+        ccl_global Ray *Ray_coop,              /* Required for setting up shader from ray */
+        ccl_global PathState *PathState_coop,  /* Required for all functions in this kernel */
+        Intersection *Intersection_coop,       /* Required for setting up shader from ray */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        ccl_global int *Queue_data,            /* queue memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize)                         /* Size (capacity) of each queue */
+{
+	/* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+
+	char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
+
+	/* Continue on with shader evaluation. */
+	kernel_shader_eval(globals,
+	                   data,
+	                   shader_data,
+	                   rng_coop,
+	                   Ray_coop,
+	                   PathState_coop,
+	                   Intersection_coop,
+	                   ray_state,
+	                   ray_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
new file mode 100644
index 00000000000..03886c0a030
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_shadow_blocked.h"
+
+__kernel void kernel_ocl_path_trace_shadow_blocked(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_shadow,        /* Required for shadow blocked */
+        ccl_global PathState *PathState_coop,  /* Required for shadow blocked */
+        ccl_global Ray *LightRay_dl_coop,      /* Required for direct lighting's shadow blocked */
+        ccl_global Ray *LightRay_ao_coop,      /* Required for AO's shadow blocked */
+        Intersection *Intersection_coop_AO,
+        Intersection *Intersection_coop_DL,
+        ccl_global char *ray_state,
+        ccl_global int *Queue_data,            /* Queue memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize,                         /* Size (capacity) of each queue */
+        int total_num_rays)
+{
+#if 0
+	/* We will make the Queue_index entries '0' in the next kernel. */
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		/* We empty this queue here */
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+	}
+#endif
+
+	int lidx = get_local_id(1) * get_local_id(0) + get_local_id(0);
+
+	ccl_local unsigned int ao_queue_length;
+	ccl_local unsigned int dl_queue_length;
+	if(lidx == 0) {
+		ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
+		dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* flag determining if the current ray is to process shadow ray for AO or DL */
+	char shadow_blocked_type = -1;
+
+	int ray_index = QUEUE_EMPTY_SLOT;
+	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	if(thread_index < ao_queue_length + dl_queue_length) {
+		if(thread_index < ao_queue_length) {
+			ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1);
+			shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO;
+		} else {
+			ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1);
+			shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL;
+		}
+	}
+
+	if(ray_index == QUEUE_EMPTY_SLOT)
+		return;
+
+	kernel_shadow_blocked(globals,
+	                      data,
+	                      shader_shadow,
+	                      PathState_coop,
+	                      LightRay_dl_coop,
+	                      LightRay_ao_coop,
+	                      Intersection_coop_AO,
+	                      Intersection_coop_DL,
+	                      ray_state,
+	                      total_num_rays,
+	                      shadow_blocked_type,
+	                      ray_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
new file mode 100644
index 00000000000..88a1ed830af
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_sum_all_radiance.h"
+
+__kernel void kernel_ocl_path_trace_sum_all_radiance(
+        ccl_constant KernelData *data,               /* To get pass_stride to offet into buffer */
+        ccl_global float *buffer,                    /* Output buffer of RenderTile */
+        ccl_global float *per_sample_output_buffer,  /* Radiance contributed by all samples */
+        int parallel_samples, int sw, int sh, int stride,
+        int buffer_offset_x,
+        int buffer_offset_y,
+        int buffer_stride,
+        int start_sample)
+{
+	kernel_sum_all_radiance(data,
+	                        buffer,
+	                        per_sample_output_buffer,
+	                        parallel_samples,
+	                        sw, sh, stride,
+	                        buffer_offset_x,
+	                        buffer_offset_y,
+	                        buffer_stride,
+	                        start_sample);
+}
diff --git a/intern/cycles/kernel/osl/SConscript b/intern/cycles/kernel/osl/SConscript
index 58b0204a1b9..74ba5e1020c 100644
--- a/intern/cycles/kernel/osl/SConscript
+++ b/intern/cycles/kernel/osl/SConscript
@@ -44,6 +44,18 @@ defs.append('CCL_NAMESPACE_BEGIN=namespace ccl {')
 defs.append('CCL_NAMESPACE_END=}')
 defs.append('WITH_OSL')
 
+if env['WITH_UNORDERED_MAP_SUPPORT']:
+    if env['UNORDERED_MAP_HEADER'] == 'unordered_map':
+        if env['UNORDERED_MAP_NAMESPACE'] == 'std':
+            defs.append('CYCLES_STD_UNORDERED_MAP')
+        elif env['UNORDERED_MAP_NAMESPACE'] == 'std::tr1':
+            defs.append('CYCLES_STD_UNORDERED_MAP_IN_TR1_NAMESPACE')
+    elif env['UNORDERED_MAP_NAMESPACE'] == 'std::tr1':
+        defs.append('CYCLES_TR1_UNORDERED_MAP')
+else:
+    print("-- Replacing unordered_map/set with map/set (warning: slower!)")
+    defs.append('CYCLES_NO_UNORDERED_MAP')
+
 if env['WITH_BF_CYCLES_DEBUG']:
     defs.append('WITH_CYCLES_DEBUG')
 
diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
index 8f9c2efd470..43929fbe928 100644
--- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
@@ -34,6 +34,7 @@
 
 #include <OSL/genclosure.h>
 
+#include "kernel_compat_cpu.h"
 #include "osl_closures.h"
 
 #include "kernel_types.h"
diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
index c5851747b54..497c4f0dc5c 100644
--- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
@@ -34,6 +34,7 @@
 
 #include <OSL/genclosure.h>
 
+#include "kernel_compat_cpu.h"
 #include "osl_closures.h"
 
 #include "kernel_types.h"
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index 84ef85e089d..88998037751 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -34,6 +34,7 @@
 
 #include <OSL/genclosure.h>
 
+#include "kernel_compat_cpu.h"
 #include "osl_bssrdf.h"
 #include "osl_closures.h"
 
diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h
index 5e833d738d8..ef67ef52fc0 100644
--- a/intern/cycles/kernel/osl/osl_closures.h
+++ b/intern/cycles/kernel/osl/osl_closures.h
@@ -147,14 +147,14 @@ public: \
 \
 	float3 eval_reflect(const float3 &omega_out, const float3 &omega_in, float& pdf) const \
 	{ \
-		pdf = 0; \
-		return make_float3(0, 0, 0); \
+		pdf = 0.0f; \
+		return make_float3(0.0f, 0.0f, 0.0f); \
 	} \
 \
 	float3 eval_transmit(const float3 &omega_out, const float3 &omega_in, float& pdf) const \
 	{ \
-		pdf = 0; \
-		return make_float3(0, 0, 0); \
+		pdf = 0.0f; \
+		return make_float3(0.0f, 0.0f, 0.0f); \
 	} \
 \
 	int sample(const float3 &Ng, \
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 1f6015d0d6b..3c1955a1e1e 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -20,6 +20,7 @@
  */
 #if defined(__GNUC__) && defined(NDEBUG)
 #  pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#  pragma GCC diagnostic ignored "-Wuninitialized"
 #endif
 
 #include <string.h>
@@ -138,12 +139,12 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
-	if (xform) {
+	if(xform) {
 		const ShaderData *sd = (const ShaderData *)xform;
 		KernelGlobals *kg = sd->osl_globals;
 		int object = sd->object;
 
-		if (object != OBJECT_NONE) {
+		if(object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
 			Transform tfm;
 
@@ -168,12 +169,12 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
-	if (xform) {
+	if(xform) {
 		const ShaderData *sd = (const ShaderData *)xform;
 		KernelGlobals *kg = sd->osl_globals;
 		int object = sd->object;
 
-		if (object != OBJECT_NONE) {
+		if(object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
 			Transform itfm;
 
@@ -198,27 +199,27 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 {
 	KernelGlobals *kg = kernel_globals;
 
-	if (from == u_ndc) {
+	if(from == u_ndc) {
 		Transform tfm = transform_transpose(transform_quick_inverse(kernel_data.cam.worldtondc));
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_raster) {
+	else if(from == u_raster) {
 		Transform tfm = transform_transpose(kernel_data.cam.rastertoworld);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_screen) {
+	else if(from == u_screen) {
 		Transform tfm = transform_transpose(kernel_data.cam.screentoworld);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_camera) {
+	else if(from == u_camera) {
 		Transform tfm = transform_transpose(kernel_data.cam.cameratoworld);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_world) {
+	else if(from == u_world) {
 		result.makeIdentity();
 		return true;
 	}
@@ -230,27 +231,27 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 {
 	KernelGlobals *kg = kernel_globals;
 
-	if (to == u_ndc) {
+	if(to == u_ndc) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtondc);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_raster) {
+	else if(to == u_raster) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtoraster);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_screen) {
+	else if(to == u_screen) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtoscreen);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_camera) {
+	else if(to == u_camera) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtocamera);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_world) {
+	else if(to == u_world) {
 		result.makeIdentity();
 		return true;
 	}
@@ -262,11 +263,11 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
-	if (xform) {
+	if(xform) {
 		const ShaderData *sd = (const ShaderData *)xform;
 		int object = sd->object;
 
-		if (object != OBJECT_NONE) {
+		if(object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
 			Transform tfm = sd->ob_tfm;
 #else
@@ -287,11 +288,11 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
-	if (xform) {
+	if(xform) {
 		const ShaderData *sd = (const ShaderData *)xform;
 		int object = sd->object;
 
-		if (object != OBJECT_NONE) {
+		if(object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
 			Transform tfm = sd->ob_itfm;
 #else
@@ -312,22 +313,22 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 {
 	KernelGlobals *kg = kernel_globals;
 
-	if (from == u_ndc) {
+	if(from == u_ndc) {
 		Transform tfm = transform_transpose(transform_quick_inverse(kernel_data.cam.worldtondc));
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_raster) {
+	else if(from == u_raster) {
 		Transform tfm = transform_transpose(kernel_data.cam.rastertoworld);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_screen) {
+	else if(from == u_screen) {
 		Transform tfm = transform_transpose(kernel_data.cam.screentoworld);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_camera) {
+	else if(from == u_camera) {
 		Transform tfm = transform_transpose(kernel_data.cam.cameratoworld);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
@@ -340,22 +341,22 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 {
 	KernelGlobals *kg = kernel_globals;
 	
-	if (to == u_ndc) {
+	if(to == u_ndc) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtondc);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_raster) {
+	else if(to == u_raster) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtoraster);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_screen) {
+	else if(to == u_screen) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtoscreen);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_camera) {
+	else if(to == u_camera) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtocamera);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
@@ -373,8 +374,8 @@ bool OSLRenderServices::get_array_attribute(OSL::ShaderGlobals *sg, bool derivat
 
 static bool set_attribute_float3(float3 f[3], TypeDesc type, bool derivatives, void *val)
 {
-	if (type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
-	    type == TypeDesc::TypeNormal || type == TypeDesc::TypeColor)
+	if(type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
+	   type == TypeDesc::TypeNormal || type == TypeDesc::TypeColor)
 	{
 		float *fval = (float *)val;
 
@@ -382,7 +383,7 @@ static bool set_attribute_float3(float3 f[3], TypeDesc type, bool derivatives, v
 		fval[1] = f[0].y;
 		fval[2] = f[0].z;
 
-		if (derivatives) {
+		if(derivatives) {
 			fval[3] = f[1].x;
 			fval[4] = f[1].y;
 			fval[5] = f[1].z;
@@ -398,7 +399,7 @@ static bool set_attribute_float3(float3 f[3], TypeDesc type, bool derivatives, v
 		float *fval = (float *)val;
 		fval[0] = average(f[0]);
 
-		if (derivatives) {
+		if(derivatives) {
 			fval[1] = average(f[1]);
 			fval[2] = average(f[2]);
 		}
@@ -422,15 +423,15 @@ static bool set_attribute_float3(float3 f, TypeDesc type, bool derivatives, void
 
 static bool set_attribute_float(float f[3], TypeDesc type, bool derivatives, void *val)
 {
-	if (type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
-	    type == TypeDesc::TypeNormal || type == TypeDesc::TypeColor)
+	if(type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
+	   type == TypeDesc::TypeNormal || type == TypeDesc::TypeColor)
 	{
 		float *fval = (float *)val;
 		fval[0] = f[0];
 		fval[1] = f[1];
 		fval[2] = f[2];
 
-		if (derivatives) {
+		if(derivatives) {
 			fval[3] = f[1];
 			fval[4] = f[1];
 			fval[5] = f[1];
@@ -446,7 +447,7 @@ static bool set_attribute_float(float f[3], TypeDesc type, bool derivatives, voi
 		float *fval = (float *)val;
 		fval[0] = f[0];
 
-		if (derivatives) {
+		if(derivatives) {
 			fval[1] = f[1];
 			fval[2] = f[2];
 		}
@@ -474,7 +475,7 @@ static bool set_attribute_int(int i, TypeDesc type, bool derivatives, void *val)
 		int *ival = (int *)val;
 		ival[0] = i;
 
-		if (derivatives) {
+		if(derivatives) {
 			ival[1] = 0;
 			ival[2] = 0;
 		}
@@ -491,7 +492,7 @@ static bool set_attribute_string(ustring str, TypeDesc type, bool derivatives, v
 		ustring *sval = (ustring *)val;
 		sval[0] = str;
 
-		if (derivatives) {
+		if(derivatives) {
 			sval[1] = OSLRenderServices::u_empty;
 			sval[2] = OSLRenderServices::u_empty;
 		}
@@ -521,7 +522,7 @@ static bool set_attribute_float3_3(float3 P[3], TypeDesc type, bool derivatives,
 
 		if(type.arraylen > 3)
 			memset(fval + 3*3, 0, sizeof(float)*3*(type.arraylen - 3));
-		if (derivatives)
+		if(derivatives)
 			memset(fval + type.arraylen*3, 0, sizeof(float)*2*3*type.arraylen);
 
 		return true;
@@ -544,15 +545,15 @@ static bool set_attribute_matrix(const Transform& tfm, TypeDesc type, void *val)
 static bool get_mesh_element_attribute(KernelGlobals *kg, const ShaderData *sd, const OSLGlobals::Attribute& attr,
                                const TypeDesc& type, bool derivatives, void *val)
 {
-	if (attr.type == TypeDesc::TypePoint || attr.type == TypeDesc::TypeVector ||
-	    attr.type == TypeDesc::TypeNormal || attr.type == TypeDesc::TypeColor)
+	if(attr.type == TypeDesc::TypePoint || attr.type == TypeDesc::TypeVector ||
+	   attr.type == TypeDesc::TypeNormal || attr.type == TypeDesc::TypeColor)
 	{
 		float3 fval[3];
 		fval[0] = primitive_attribute_float3(kg, sd, attr.elem, attr.offset,
 		                                     (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
 		return set_attribute_float3(fval, type, derivatives, val);
 	}
-	else if (attr.type == TypeDesc::TypeFloat) {
+	else if(attr.type == TypeDesc::TypeFloat) {
 		float fval[3];
 		fval[0] = primitive_attribute_float(kg, sd, attr.elem, attr.offset,
 		                                    (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
@@ -566,7 +567,7 @@ static bool get_mesh_element_attribute(KernelGlobals *kg, const ShaderData *sd,
 static bool get_mesh_attribute(KernelGlobals *kg, const ShaderData *sd, const OSLGlobals::Attribute& attr,
                                const TypeDesc& type, bool derivatives, void *val)
 {
-	if (attr.type == TypeDesc::TypeMatrix) {
+	if(attr.type == TypeDesc::TypeMatrix) {
 		Transform tfm = primitive_attribute_matrix(kg, sd, attr.offset);
 		return set_attribute_matrix(tfm, type, val);
 	}
@@ -580,7 +581,7 @@ static void get_object_attribute(const OSLGlobals::Attribute& attr, bool derivat
 	size_t datasize = attr.value.datasize();
 
 	memcpy(val, attr.value.data(), datasize);
-	if (derivatives)
+	if(derivatives)
 		memset((char *)val + datasize, 0, datasize * 2);
 }
 
@@ -590,80 +591,80 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD
 	/* todo: turn this into hash table? */
 
 	/* Object Attributes */
-	if (name == u_object_location) {
+	if(name == u_object_location) {
 		float3 f = object_location(kg, sd);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
-	else if (name == u_object_index) {
+	else if(name == u_object_index) {
 		float f = object_pass_id(kg, sd->object);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_geom_dupli_generated) {
+	else if(name == u_geom_dupli_generated) {
 		float3 f = object_dupli_generated(kg, sd->object);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
-	else if (name == u_geom_dupli_uv) {
+	else if(name == u_geom_dupli_uv) {
 		float3 f = object_dupli_uv(kg, sd->object);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
-	else if (name == u_material_index) {
+	else if(name == u_material_index) {
 		float f = shader_pass_id(kg, sd);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_object_random) {
+	else if(name == u_object_random) {
 		float f = object_random_number(kg, sd->object);
 		return set_attribute_float(f, type, derivatives, val);
 	}
 
 	/* Particle Attributes */
-	else if (name == u_particle_index) {
+	else if(name == u_particle_index) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float f = particle_index(kg, particle_id);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_particle_age) {
+	else if(name == u_particle_age) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float f = particle_age(kg, particle_id);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_particle_lifetime) {
+	else if(name == u_particle_lifetime) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float f = particle_lifetime(kg, particle_id);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_particle_location) {
+	else if(name == u_particle_location) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float3 f = particle_location(kg, particle_id);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
 #if 0	/* unsupported */
-	else if (name == u_particle_rotation) {
+	else if(name == u_particle_rotation) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float4 f = particle_rotation(kg, particle_id);
 		return set_attribute_float4(f, type, derivatives, val);
 	}
 #endif
-	else if (name == u_particle_size) {
+	else if(name == u_particle_size) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float f = particle_size(kg, particle_id);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_particle_velocity) {
+	else if(name == u_particle_velocity) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float3 f = particle_velocity(kg, particle_id);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
-	else if (name == u_particle_angular_velocity) {
+	else if(name == u_particle_angular_velocity) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float3 f = particle_angular_velocity(kg, particle_id);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
 	
 	/* Geometry Attributes */
-	else if (name == u_geom_numpolyvertices) {
+	else if(name == u_geom_numpolyvertices) {
 		return set_attribute_int(3, type, derivatives, val);
 	}
-	else if ((name == u_geom_trianglevertices || name == u_geom_polyvertices)
+	else if((name == u_geom_trianglevertices || name == u_geom_polyvertices)
 #ifdef __HAIR__
 		     && sd->type & PRIMITIVE_ALL_TRIANGLE)
 #else
@@ -689,21 +690,21 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD
 		ustring object_name = kg->osl->object_names[sd->object];
 		return set_attribute_string(object_name, type, derivatives, val);
 	}
-	else if (name == u_is_smooth) {
+	else if(name == u_is_smooth) {
 		float f = ((sd->shader & SHADER_SMOOTH_NORMAL) != 0);
 		return set_attribute_float(f, type, derivatives, val);
 	}
 #ifdef __HAIR__
 	/* Hair Attributes */
-	else if (name == u_is_curve) {
+	else if(name == u_is_curve) {
 		float f = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_curve_thickness) {
+	else if(name == u_curve_thickness) {
 		float f = curve_thickness(kg, sd);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_curve_tangent_normal) {
+	else if(name == u_curve_tangent_normal) {
 		float3 f = curve_tangent_normal(kg, sd);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
@@ -715,22 +716,22 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD
 bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData *sd, ustring name,
                                                  TypeDesc type, bool derivatives, void *val)
 {
-	if (name == u_path_ray_length) {
+	if(name == u_path_ray_length) {
 		/* Ray Length */
 		float f = sd->ray_length;
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_path_ray_depth) {
+	else if(name == u_path_ray_depth) {
 		/* Ray Depth */
 		int f = sd->ray_depth;
 		return set_attribute_int(f, type, derivatives, val);
 	}
-	else if (name == u_path_transparent_depth) {
+	else if(name == u_path_transparent_depth) {
 		/* Transparent Ray Depth */
 		int f = sd->transparent_depth;
 		return set_attribute_int(f, type, derivatives, val);
 	}
-	else if (name == u_ndc) {
+	else if(name == u_ndc) {
 		/* NDC coordinates with special exception for otho */
 		OSLThreadData *tdata = kg->osl_tdata;
 		OSL::ShaderGlobals *globals = &tdata->globals;
@@ -762,7 +763,7 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData *
 bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object_name,
                                       TypeDesc type, ustring name, void *val)
 {
-	if (sg->renderstate == NULL)
+	if(sg->renderstate == NULL)
 		return false;
 
 	ShaderData *sd = (ShaderData *)(sg->renderstate);
@@ -777,10 +778,10 @@ bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring
 	int object;
 
 	/* lookup of attribute on another object */
-	if (object_name != u_empty) {
+	if(object_name != u_empty) {
 		OSLGlobals::ObjectNameMap::iterator it = kg->osl->object_name_map.find(object_name);
 
-		if (it == kg->osl->object_name_map.end())
+		if(it == kg->osl->object_name_map.end())
 			return false;
 
 		object = it->second;
@@ -790,7 +791,7 @@ bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring
 		object = sd->object;
 		is_curve = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
 
-		if (object == OBJECT_NONE)
+		if(object == OBJECT_NONE)
 			return get_background_attribute(kg, sd, name, type, derivatives, val);
 	}
 
@@ -799,10 +800,10 @@ bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring
 	OSLGlobals::AttributeMap& attribute_map = kg->osl->attribute_map[object];
 	OSLGlobals::AttributeMap::iterator it = attribute_map.find(name);
 
-	if (it != attribute_map.end()) {
+	if(it != attribute_map.end()) {
 		const OSLGlobals::Attribute& attr = it->second;
 
-		if (attr.elem != ATTR_ELEMENT_OBJECT) {
+		if(attr.elem != ATTR_ELEMENT_OBJECT) {
 			/* triangle and vertex attributes */
 			if(get_mesh_element_attribute(kg, sd, attr, type, derivatives, val))
 				return true;
@@ -819,7 +820,7 @@ bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring
 		/* not found in attribute, check standard object info */
 		bool is_std_object_attribute = get_object_standard_attribute(kg, sd, name, type, derivatives, val);
 
-		if (is_std_object_attribute)
+		if(is_std_object_attribute)
 			return true;
 
 		return get_background_attribute(kg, sd, name, type, derivatives, val);
@@ -887,7 +888,7 @@ bool OSLRenderServices::texture(ustring filename, TextureOpt &options,
 #endif
 	bool status;
 
-	if(filename[0] == '@' && filename.find('.') == -1) {
+	if(filename[0] == '@') {
 		int slot = atoi(filename.c_str() + 1);
 		float4 rgba = kernel_tex_image_interp(slot, s, 1.0f - t);
 
@@ -939,19 +940,33 @@ bool OSLRenderServices::texture3d(ustring filename, TextureOpt &options,
 	OSL::TextureSystem *ts = osl_ts;
 	ShaderData *sd = (ShaderData *)(sg->renderstate);
 	KernelGlobals *kg = sd->osl_globals;
-	OSLThreadData *tdata = kg->osl_tdata;
-	OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info;
-
-	OIIO::TextureSystem::TextureHandle *th =  ts->get_texture_handle(filename, thread_info);
+	bool status;
+	if(filename[0] == '@') {
+		int slot = atoi(filename.c_str() + 1);
+		float4 rgba = kernel_tex_image_interp_3d(slot, P.x, P.y, P.z);
 
+		result[0] = rgba[0];
+		if(nchannels > 1)
+			result[1] = rgba[1];
+		if(nchannels > 2)
+			result[2] = rgba[2];
+		if(nchannels > 3)
+			result[3] = rgba[3];
+		status = true;
+	}
+	else {
+		OSLThreadData *tdata = kg->osl_tdata;
+		OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info;
+		OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info);
 #if OIIO_VERSION < 10500
-	bool status = ts->texture3d(th, thread_info,
-	                            options, P, dPdx, dPdy, dPdz, result);
+		status = ts->texture3d(th, thread_info,
+		                       options, P, dPdx, dPdy, dPdz, result);
 #else
-	bool status = ts->texture3d(th, thread_info,
-	                            options, P, dPdx, dPdy, dPdz,
-	                            nchannels, result);
+		status = ts->texture3d(th, thread_info,
+		                       options, P, dPdx, dPdy, dPdz,
+		                       nchannels, result);
 #endif
+	}
 
 	if(!status) {
 		if(nchannels == 3 || nchannels == 4) {
@@ -979,7 +994,7 @@ bool OSLRenderServices::environment(ustring filename, TextureOpt &options,
 	OSLThreadData *tdata = kg->osl_tdata;
 	OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info;
 
-	OIIO::TextureSystem::TextureHandle *th =  ts->get_texture_handle(filename, thread_info);
+	OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info);
 
 #if OIIO_VERSION < 10500
 	bool status = ts->environment(th, thread_info,
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index ebf72ae11f4..8cfe0cbcbd4 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -146,11 +146,11 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
 	/* OSL gives us a closure tree, we flatten it into arrays per
 	 * closure type, for evaluation, sampling, etc later on. */
 
-	if (closure->type == OSL::ClosureColor::COMPONENT) {
+	if(closure->type == OSL::ClosureColor::COMPONENT) {
 		OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure;
 		CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
 
-		if (prim) {
+		if(prim) {
 			ShaderClosure sc;
 
 #ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS
@@ -267,6 +267,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
 						if(fabsf(weight.x) > 0.0f) {
 							sc.weight = make_float3(weight.x, 0.0f, 0.0f);
 							sc.data0 = bssrdf->radius.x;
+							sc.data1 = 0.0f;
 							sd->flag |= bssrdf_setup(&sc, sc.type);
 							sd->closure[sd->num_closure++] = sc;
 						}
@@ -274,6 +275,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
 						if(fabsf(weight.y) > 0.0f) {
 							sc.weight = make_float3(0.0f, weight.y, 0.0f);
 							sc.data0 = bssrdf->radius.y;
+							sc.data1 = 0.0f;
 							sd->flag |= bssrdf_setup(&sc, sc.type);
 							sd->closure[sd->num_closure++] = sc;
 						}
@@ -281,6 +283,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
 						if(fabsf(weight.z) > 0.0f) {
 							sc.weight = make_float3(0.0f, 0.0f, weight.z);
 							sc.data0 = bssrdf->radius.z;
+							sc.data1 = 0.0f;
 							sd->flag |= bssrdf_setup(&sc, sc.type);
 							sd->closure[sd->num_closure++] = sc;
 						}
@@ -293,11 +296,11 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
 			}
 		}
 	}
-	else if (closure->type == OSL::ClosureColor::MUL) {
+	else if(closure->type == OSL::ClosureColor::MUL) {
 		OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
 		flatten_surface_closure_tree(sd, path_flag, mul->closure, TO_FLOAT3(mul->weight) * weight);
 	}
-	else if (closure->type == OSL::ClosureColor::ADD) {
+	else if(closure->type == OSL::ClosureColor::ADD) {
 		OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
 		flatten_surface_closure_tree(sd, path_flag, add->closureA, weight);
 		flatten_surface_closure_tree(sd, path_flag, add->closureB, weight);
@@ -316,11 +319,11 @@ void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, int path_flag, S
 	OSL::ShadingContext *octx = tdata->context[(int)ctx];
 	int shader = sd->shader & SHADER_MASK;
 
-	if (kg->osl->surface_state[shader])
+	if(kg->osl->surface_state[shader])
 		ss->execute(*octx, *(kg->osl->surface_state[shader]), *globals);
 
 	/* flatten closure tree */
-	if (globals->Ci)
+	if(globals->Ci)
 		flatten_surface_closure_tree(sd, path_flag, globals->Ci);
 }
 
@@ -332,23 +335,23 @@ static float3 flatten_background_closure_tree(const OSL::ClosureColor *closure)
 	 * is only one supported closure type at the moment, which has no evaluation
 	 * functions, so we just sum the weights */
 
-	if (closure->type == OSL::ClosureColor::COMPONENT) {
+	if(closure->type == OSL::ClosureColor::COMPONENT) {
 		OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure;
 		CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
 
-		if (prim && prim->category == CClosurePrimitive::Background)
+		if(prim && prim->category == CClosurePrimitive::Background)
 #ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS
 			return TO_FLOAT3(comp->w);
 #else
 			return make_float3(1.0f, 1.0f, 1.0f);
 #endif
 	}
-	else if (closure->type == OSL::ClosureColor::MUL) {
+	else if(closure->type == OSL::ClosureColor::MUL) {
 		OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
 
 		return TO_FLOAT3(mul->weight) * flatten_background_closure_tree(mul->closure);
 	}
-	else if (closure->type == OSL::ClosureColor::ADD) {
+	else if(closure->type == OSL::ClosureColor::ADD) {
 		OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
 
 		return flatten_background_closure_tree(add->closureA) +
@@ -369,11 +372,11 @@ float3 OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, int path_fl
 	OSL::ShaderGlobals *globals = &tdata->globals;
 	OSL::ShadingContext *octx = tdata->context[(int)ctx];
 
-	if (kg->osl->background_state)
+	if(kg->osl->background_state)
 		ss->execute(*octx, *(kg->osl->background_state), *globals);
 
 	/* return background color immediately */
-	if (globals->Ci)
+	if(globals->Ci)
 		return flatten_background_closure_tree(globals->Ci);
 
 	return make_float3(0.0f, 0.0f, 0.0f);
@@ -387,11 +390,11 @@ static void flatten_volume_closure_tree(ShaderData *sd,
 	/* OSL gives us a closure tree, we flatten it into arrays per
 	 * closure type, for evaluation, sampling, etc later on. */
 
-	if (closure->type == OSL::ClosureColor::COMPONENT) {
+	if(closure->type == OSL::ClosureColor::COMPONENT) {
 		OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure;
 		CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
 
-		if (prim) {
+		if(prim) {
 			ShaderClosure sc;
 
 #ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS
@@ -448,11 +451,11 @@ static void flatten_volume_closure_tree(ShaderData *sd,
 			}
 		}
 	}
-	else if (closure->type == OSL::ClosureColor::MUL) {
+	else if(closure->type == OSL::ClosureColor::MUL) {
 		OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
 		flatten_volume_closure_tree(sd, mul->closure, TO_FLOAT3(mul->weight) * weight);
 	}
-	else if (closure->type == OSL::ClosureColor::ADD) {
+	else if(closure->type == OSL::ClosureColor::ADD) {
 		OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
 		flatten_volume_closure_tree(sd, add->closureA, weight);
 		flatten_volume_closure_tree(sd, add->closureB, weight);
@@ -471,11 +474,11 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, int path_flag, Sh
 	OSL::ShadingContext *octx = tdata->context[(int)ctx];
 	int shader = sd->shader & SHADER_MASK;
 
-	if (kg->osl->volume_state[shader])
+	if(kg->osl->volume_state[shader])
 		ss->execute(*octx, *(kg->osl->volume_state[shader]), *globals);
 	
 	/* flatten closure tree */
-	if (globals->Ci)
+	if(globals->Ci)
 		flatten_volume_closure_tree(sd, globals->Ci);
 }
 
@@ -493,7 +496,7 @@ void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderConte
 	OSL::ShadingContext *octx = tdata->context[(int)ctx];
 	int shader = sd->shader & SHADER_MASK;
 
-	if (kg->osl->displacement_state[shader])
+	if(kg->osl->displacement_state[shader])
 		ss->execute(*octx, *(kg->osl->displacement_state[shader]), *globals);
 
 	/* get back position */
@@ -520,7 +523,7 @@ float3 OSLShader::bsdf_eval(const ShaderData *sd, const ShaderClosure *sc, const
 	CBSDFClosure *bsdf = (CBSDFClosure *)sc->prim;
 	float3 bsdf_eval;
 
-	if (dot(sd->Ng, omega_in) >= 0.0f)
+	if(dot(sd->Ng, omega_in) >= 0.0f)
 		bsdf_eval = bsdf->eval_reflect(sd->I, omega_in, pdf);
 	else
 		bsdf_eval = bsdf->eval_transmit(sd->I, omega_in, pdf);
@@ -548,7 +551,7 @@ int OSLShader::find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id,
 	ustring stdname(std::string("geom:") + std::string(Attribute::standard_name((AttributeStandard)id)));
 	OSLGlobals::AttributeMap::const_iterator it = attr_map.find(stdname);
 
-	if (it != attr_map.end()) {
+	if(it != attr_map.end()) {
 		const OSLGlobals::Attribute &osl_attr = it->second;
 		*elem = osl_attr.elem;
 
diff --git a/intern/cycles/kernel/shaders/node_image_texture.osl b/intern/cycles/kernel/shaders/node_image_texture.osl
index 526a87525cd..46a02cab32e 100644
--- a/intern/cycles/kernel/shaders/node_image_texture.osl
+++ b/intern/cycles/kernel/shaders/node_image_texture.osl
@@ -26,7 +26,7 @@ point map_to_tube(vector dir)
 {
 	float u, v;
 	v = (dir[2] + 1.0) * 0.5;
-	float len = sqrt(dir[0]*dir[0] + dir[1]*dir[1]);
+	float len = sqrt(dir[0] * dir[0] + dir[1] * dir[1]);
 	if (len > 0.0) {
 		u = (1.0 - (atan2(dir[0] / len, dir[1] / len) / M_PI)) * 0.5;
 	}
@@ -40,8 +40,8 @@ point map_to_sphere(vector dir)
 {
 	float len = length(dir);
 	float v, u;
-	if(len > 0.0) {
-		if(dir[0] == 0.0 && dir[1] == 0.0) {
+	if (len > 0.0) {
+		if (dir[0] == 0.0 && dir[1] == 0.0) {
 			u = 0.0;  /* Othwise domain error. */
 		}
 		else {
diff --git a/intern/cycles/kernel/shaders/node_math.osl b/intern/cycles/kernel/shaders/node_math.osl
index bbc008b4299..7eef97fd7e8 100644
--- a/intern/cycles/kernel/shaders/node_math.osl
+++ b/intern/cycles/kernel/shaders/node_math.osl
@@ -93,8 +93,8 @@ shader node_math(
 		Value = Value1 > Value2;
 	else if (type == "Modulo")
 		Value = safe_modulo(Value1, Value2);
-    else if (type == "Absolute")
-        Value = fabs(Value1);
+	else if (type == "Absolute")
+		Value = fabs(Value1);
 
 	if (Clamp)
 		Value = clamp(Value, 0.0, 1.0);
diff --git a/intern/cycles/kernel/shaders/node_musgrave_texture.osl b/intern/cycles/kernel/shaders/node_musgrave_texture.osl
index a349dc8cb9a..4f95dec910a 100644
--- a/intern/cycles/kernel/shaders/node_musgrave_texture.osl
+++ b/intern/cycles/kernel/shaders/node_musgrave_texture.osl
@@ -26,7 +26,7 @@
  * from "Texturing and Modelling: A procedural approach"
  */
 
-float noise_musgrave_fBm(point p, string basis, float H, float lacunarity, float octaves)
+float noise_musgrave_fBm(point p, float H, float lacunarity, float octaves)
 {
 	float rmd;
 	float value = 0.0;
@@ -54,7 +54,7 @@ float noise_musgrave_fBm(point p, string basis, float H, float lacunarity, float
  * octaves: number of frequencies in the fBm
  */
 
-float noise_musgrave_multi_fractal(point p, string basis, float H, float lacunarity, float octaves)
+float noise_musgrave_multi_fractal(point p, float H, float lacunarity, float octaves)
 {
 	float rmd;
 	float value = 1.0;
@@ -83,7 +83,7 @@ float noise_musgrave_multi_fractal(point p, string basis, float H, float lacunar
  * offset: raises the terrain from `sea level'
  */
 
-float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacunarity, float octaves, float offset)
+float noise_musgrave_hetero_terrain(point p, float H, float lacunarity, float octaves, float offset)
 {
 	float value, increment, rmd;
 	float pwHL = pow(lacunarity, -H);
@@ -118,8 +118,8 @@ float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacuna
  * offset: raises the terrain from `sea level'
  */
 
-float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H,
-                                          float lacunarity, float octaves, float offset, float gain)
+float noise_musgrave_hybrid_multi_fractal(point p, float H, float lacunarity,
+                                          float octaves, float offset, float gain)
 {
 	float result, signal, weight, rmd;
 	float pwHL = pow(lacunarity, -H);
@@ -156,8 +156,8 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H,
  * offset: raises the terrain from `sea level'
  */
 
-float noise_musgrave_ridged_multi_fractal(point p, string basis, float H,
-                                          float lacunarity, float octaves, float offset, float gain)
+float noise_musgrave_ridged_multi_fractal(point p, float H, float lacunarity,
+                                          float octaves, float offset, float gain)
 {
 	float result, signal, weight;
 	float pwHL = pow(lacunarity, -H);
@@ -201,7 +201,6 @@ shader node_musgrave_texture(
 	float dimension = max(Dimension, 1e-5);
 	float octaves = clamp(Detail, 0.0, 16.0);
 	float lacunarity = max(Lacunarity, 1e-5);
-	string Basis = "Perlin";
 	float intensity = 1.0;
 
 	point p = Vector;
@@ -212,15 +211,15 @@ shader node_musgrave_texture(
 	p = p * Scale;
 
 	if (Type == "Multifractal")
-		Fac = intensity * noise_musgrave_multi_fractal(p, Basis, dimension, lacunarity, octaves);
+		Fac = intensity * noise_musgrave_multi_fractal(p, dimension, lacunarity, octaves);
 	else if (Type == "fBM")
-		Fac = intensity * noise_musgrave_fBm(p, Basis, dimension, lacunarity, octaves);
+		Fac = intensity * noise_musgrave_fBm(p, dimension, lacunarity, octaves);
 	else if (Type == "Hybrid Multifractal")
-		Fac = intensity * noise_musgrave_hybrid_multi_fractal(p, Basis, dimension, lacunarity, octaves, Offset, Gain);
+		Fac = intensity * noise_musgrave_hybrid_multi_fractal(p, dimension, lacunarity, octaves, Offset, Gain);
 	else if (Type == "Ridged Multifractal")
-		Fac = intensity * noise_musgrave_ridged_multi_fractal(p, Basis, dimension, lacunarity, octaves, Offset, Gain);
+		Fac = intensity * noise_musgrave_ridged_multi_fractal(p, dimension, lacunarity, octaves, Offset, Gain);
 	else if (Type == "Hetero Terrain")
-		Fac = intensity * noise_musgrave_hetero_terrain(p, Basis, dimension, lacunarity, octaves, Offset);
+		Fac = intensity * noise_musgrave_hetero_terrain(p, dimension, lacunarity, octaves, Offset);
 	
 	Color = color(Fac, Fac, Fac);
 }
diff --git a/intern/cycles/kernel/shaders/node_noise_texture.osl b/intern/cycles/kernel/shaders/node_noise_texture.osl
index dabc0b6843f..e83e5b5b211 100644
--- a/intern/cycles/kernel/shaders/node_noise_texture.osl
+++ b/intern/cycles/kernel/shaders/node_noise_texture.osl
@@ -19,23 +19,23 @@
 
 /* Noise */
 
-float noise(point p, string basis, float distortion, float detail, float fac, color Color)
+float noise(point p, float distortion, float detail, float fac, color Color)
 {
 	point r;
 	int hard = 0;
 
 	if (distortion != 0.0) {
-		r[0] = noise_basis(p + point(13.5), basis) * distortion;
-		r[1] = noise_basis(p, basis) * distortion;
-		r[2] = noise_basis(p - point(13.5), basis) * distortion;
+		r[0] = safe_noise(p + point(13.5), "unsigned") * distortion;
+		r[1] = safe_noise(p, "unsigned") * distortion;
+		r[2] = safe_noise(p - point(13.5), "unsigned") * distortion;
 		
 		p += r;
 	}
 
-	fac = noise_turbulence(p, basis, detail, hard);
+	fac = noise_turbulence(p, detail, hard);
 	
-	Color = color(fac, noise_turbulence(point(p[1], p[0], p[2]), basis, detail, hard),
-		noise_turbulence(point(p[1], p[2], p[0]), basis, detail, hard));
+	Color = color(fac, noise_turbulence(point(p[1], p[0], p[2]), detail, hard),
+		noise_turbulence(point(p[1], p[2], p[0]), detail, hard));
 
 	return fac;
 }
@@ -55,7 +55,6 @@ shader node_noise_texture(
 	if (use_mapping)
 		p = transform(mapping, p);
 
-	string Basis = "Perlin";
-	Fac = noise(p * Scale, Basis, Distortion, Detail, Fac, Color);
+	Fac = noise(p * Scale, Distortion, Detail, Fac, Color);
 }
 
diff --git a/intern/cycles/kernel/shaders/node_texture.h b/intern/cycles/kernel/shaders/node_texture.h
index 5f9cd5afa47..fc2cfdcd55c 100644
--- a/intern/cycles/kernel/shaders/node_texture.h
+++ b/intern/cycles/kernel/shaders/node_texture.h
@@ -14,32 +14,6 @@
  * limitations under the License.
  */
 
-/* Voronoi Distances */
-
-float voronoi_distance(string distance_metric, vector d, float e)
-{
-#if 0
-	if (distance_metric == "Distance Squared")
-#endif
-		return dot(d, d);
-#if 0
-	if (distance_metric == "Actual Distance")
-		return length(d);
-	if (distance_metric == "Manhattan")
-		return fabs(d[0]) + fabs(d[1]) + fabs(d[2]);
-	if (distance_metric == "Chebychev")
-		return max(fabs(d[0]), max(fabs(d[1]), fabs(d[2])));
-	if (distance_metric == "Minkovsky 1/2")
-		return sqrt(fabs(d[0])) + sqrt(fabs(d[1])) + sqrt(fabs(d[1]));
-	if (distance_metric == "Minkovsky 4")
-		return sqrt(sqrt(dot(d * d, d * d)));
-	if (distance_metric == "Minkovsky")
-		return pow(pow(fabs(d[0]), e) + pow(fabs(d[1]), e) + pow(fabs(d[2]), e), 1.0 / e);
-	
-	return 0.0;
-#endif
-}
-
 /* Voronoi / Worley like */
 
 color cellnoise_color(point p)
@@ -51,7 +25,7 @@ color cellnoise_color(point p)
 	return color(r, g, b);
 }
 
-void voronoi(point p, string distance_metric, float e, float da[4], point pa[4])
+void voronoi(point p, float e, float da[4], point pa[4])
 {
 	/* returns distances in da and point coords in pa */
 	int xx, yy, zz, xi, yi, zi;
@@ -71,7 +45,7 @@ void voronoi(point p, string distance_metric, float e, float da[4], point pa[4])
 				point ip = point(xx, yy, zz);
 				point vp = (point)cellnoise_color(ip);
 				point pd = p - (vp + ip);
-				float d = voronoi_distance(distance_metric, pd, e);
+				float d = dot(pd, pd);
 
 				vp += point(xx, yy, zz);
 
@@ -111,46 +85,6 @@ void voronoi(point p, string distance_metric, float e, float da[4], point pa[4])
 	}
 }
 
-float voronoi_Fn(point p, int n)
-{
-	float da[4];
-	point pa[4];
-
-	voronoi(p, "Distance Squared", 0, da, pa);
-
-	return da[n];
-}
-
-float voronoi_FnFn(point p, int n1, int n2)
-{
-	float da[4];
-	point pa[4];
-
-	voronoi(p, "Distance Squared", 0, da, pa);
-
-	return da[n2] - da[n1];
-}
-
-float voronoi_F1(point p) { return voronoi_Fn(p, 0); }
-float voronoi_F2(point p) { return voronoi_Fn(p, 1); }
-float voronoi_F3(point p) { return voronoi_Fn(p, 2); }
-float voronoi_F4(point p) { return voronoi_Fn(p, 3); }
-float voronoi_F1F2(point p) { return voronoi_FnFn(p, 0, 1); }
-
-float voronoi_Cr(point p)
-{
-	/* crackle type pattern, just a scale/clamp of F2-F1 */
-	float t = 10.0 * voronoi_F1F2(p);
-	return (t > 1.0) ? 1.0 : t;
-}
-
-float voronoi_F1S(point p) { return 2.0 * voronoi_F1(p) - 1.0; }
-float voronoi_F2S(point p) { return 2.0 * voronoi_F2(p) - 1.0; }
-float voronoi_F3S(point p) { return 2.0 * voronoi_F3(p) - 1.0; }
-float voronoi_F4S(point p) { return 2.0 * voronoi_F4(p) - 1.0; }
-float voronoi_F1F2S(point p) { return 2.0 * voronoi_F1F2(p) - 1.0; }
-float voronoi_CrS(point p) { return 2.0 * voronoi_Cr(p) - 1.0; }
-
 /* Noise Bases */
 
 float safe_noise(point p, string type)
@@ -172,39 +106,9 @@ float safe_noise(point p, string type)
 	return f;
 }
 
-float noise_basis(point p, string basis)
-{
-	if (basis == "Perlin")
-		return safe_noise(p, "unsigned");
-	if (basis == "Voronoi F1")
-		return voronoi_F1S(p);
-	if (basis == "Voronoi F2")
-		return voronoi_F2S(p);
-	if (basis == "Voronoi F3")
-		return voronoi_F3S(p);
-	if (basis == "Voronoi F4")
-		return voronoi_F4S(p);
-	if (basis == "Voronoi F2-F1")
-		return voronoi_F1F2S(p);
-	if (basis == "Voronoi Crackle")
-		return voronoi_CrS(p);
-	if (basis == "Cell Noise")
-		return cellnoise(p);
-	
-	return 0.0;
-}
-
-/* Soft/Hard Noise */
-
-float noise_basis_hard(point p, string basis, int hard)
-{
-	float t = noise_basis(p, basis);
-	return (hard) ? fabs(2.0 * t - 1.0) : t;
-}
-
 /* Turbulence */
 
-float noise_turbulence(point p, string basis, float details, int hard)
+float noise_turbulence(point p, float details, int hard)
 {
 	float fscale = 1.0;
 	float amp = 1.0;
@@ -215,7 +119,7 @@ float noise_turbulence(point p, string basis, float details, int hard)
 	n = (int)octaves;
 
 	for (i = 0; i <= n; i++) {
-		float t = noise_basis(fscale * p, basis);
+		float t = safe_noise(fscale * p, "unsigned");
 
 		if (hard)
 			t = fabs(2.0 * t - 1.0);
@@ -228,7 +132,7 @@ float noise_turbulence(point p, string basis, float details, int hard)
 	float rmd = octaves - floor(octaves);
 
 	if (rmd != 0.0) {
-		float t = noise_basis(fscale * p, basis);
+		float t = safe_noise(fscale * p, "unsigned");
 
 		if (hard)
 			t = fabs(2.0 * t - 1.0);
diff --git a/intern/cycles/kernel/shaders/node_voronoi_texture.osl b/intern/cycles/kernel/shaders/node_voronoi_texture.osl
index df169599d08..29e143ae207 100644
--- a/intern/cycles/kernel/shaders/node_voronoi_texture.osl
+++ b/intern/cycles/kernel/shaders/node_voronoi_texture.osl
@@ -37,7 +37,7 @@ shader node_voronoi_texture(
 	float da[4];
 	point pa[4];
 
-	voronoi(p * Scale, "Distance Squared", 1.0, da, pa);
+	voronoi(p * Scale, 1.0, da, pa);
 
 	/* Colored output */
 	if (Coloring == "Intensity") {
diff --git a/intern/cycles/kernel/shaders/node_wave_texture.osl b/intern/cycles/kernel/shaders/node_wave_texture.osl
index a95752fc592..569f284cbac 100644
--- a/intern/cycles/kernel/shaders/node_wave_texture.osl
+++ b/intern/cycles/kernel/shaders/node_wave_texture.osl
@@ -31,7 +31,7 @@ float wave(point p, string type, float detail, float distortion, float dscale)
 	}
 
 	if (distortion != 0.0) {
-		n = n + (distortion * noise_turbulence(p * dscale, "Perlin", detail, 0));
+		n = n + (distortion * noise_turbulence(p * dscale, detail, 0));
 	}
 	return 0.5 + 0.5 * sin(n);
 }
diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h
index 6babe98717c..697a1756119 100644
--- a/intern/cycles/kernel/shaders/stdosl.h
+++ b/intern/cycles/kernel/shaders/stdosl.h
@@ -249,7 +249,21 @@ point rotate (point p, float angle, point a, point b)
 {
     vector axis = normalize (b - a);
     float cosang, sinang;
+    /* Older OSX has major issues with sincos() function,
+     * it's likely a big in OSL or LLVM. For until we've
+     * updated to new versions of this libraries we'll
+     * use a workaround to prevent possible crashes on all
+     * the platforms.
+     *
+     * Shouldn't be that bad because it's mainly used for
+     * anisotropic shader where angle is usually constant.
+     */
+#if 0
     sincos (angle, sinang, cosang);
+#else
+    sinang = sin (angle);
+    cosang = cos (angle);
+#endif
     float cosang1 = 1.0 - cosang;
     float x = axis[0], y = axis[1], z = axis[2];
     matrix M = matrix (x * x + (1.0 - x * x) * cosang,
diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h
new file mode 100644
index 00000000000..181a1054a0d
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_background_buffer_update.h
@@ -0,0 +1,255 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_background_buffer_update kernel.
+ * This is the fourth kernel in the ray tracing logic, and the third
+ * of the path iteration kernels. This kernel takes care of rays that hit
+ * the background (sceneintersect kernel), and for the rays of
+ * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in
+ * the output buffer. This kernel also takes care of rays that have been determined
+ * to-be-regenerated.
+ *
+ * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel
+ *
+ * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
+ * will be eventually set to RAY_TO_REGENERATE state in this kernel. Finally all rays of ray_state
+ * RAY_TO_REGENERATE will be regenerated and put in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_background_buffer_update --|--- PathRadiance_coop
+ * throughput_coop --------------------------------------|                                      |--- L_transparent_coop
+ * per_sample_output_buffers ----------------------------|                                      |--- per_sample_output_buffers
+ * Ray_coop ---------------------------------------------|                                      |--- ray_state
+ * PathState_coop ---------------------------------------|                                      |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * L_transparent_coop -----------------------------------|                                      |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * ray_state --------------------------------------------|                                      |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----|                                      |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                                      |--- work_array
+ * parallel_samples -------------------------------------|                                      |--- PathState_coop
+ * end_sample -------------------------------------------|                                      |--- throughput_coop
+ * kg (globals + data) ----------------------------------|                                      |--- rng_coop
+ * rng_state --------------------------------------------|                                      |--- Ray
+ * PathRadiance_coop ------------------------------------|                                      |
+ * sw ---------------------------------------------------|                                      |
+ * sh ---------------------------------------------------|                                      |
+ * sx ---------------------------------------------------|                                      |
+ * sy ---------------------------------------------------|                                      |
+ * stride -----------------------------------------------|                                      |
+ * work_array -------------------------------------------|                                      |--- work_array
+ * queuesize --------------------------------------------|                                      |
+ * start_sample -----------------------------------------|                                      |--- work_pool_wgs
+ * work_pool_wgs ----------------------------------------|                                      |
+ * num_samples ------------------------------------------|                                      |
+ *
+ * note on shader_data : shader_data argument is neither an input nor an output for this kernel. It is just filled and consumed here itself.
+ * Note on Queues :
+ * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+ *
+ * State of queues when this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty
+ */
+ccl_device char kernel_background_buffer_update(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_state,
+        ccl_global uint *rng_coop,             /* Required for buffer Update */
+        ccl_global float3 *throughput_coop,    /* Required for background hit processing */
+        PathRadiance *PathRadiance_coop,       /* Required for background hit processing and buffer Update */
+        ccl_global Ray *Ray_coop,              /* Required for background hit processing */
+        ccl_global PathState *PathState_coop,  /* Required for background hit processing */
+        ccl_global float *L_transparent_coop,  /* Required for background hit processing and buffer Update */
+        ccl_global char *ray_state,            /* Stores information on the current state of a ray */
+        int sw, int sh, int sx, int sy, int stride,
+        int rng_state_offset_x,
+        int rng_state_offset_y,
+        int rng_state_stride,
+        ccl_global unsigned int *work_array,   /* Denotes work of each ray */
+        int end_sample,
+        int start_sample,
+#ifdef __WORK_STEALING__
+        ccl_global unsigned int *work_pool_wgs,
+        unsigned int num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples,                  /* Number of samples to be processed in parallel */
+        int ray_index)
+{
+	char enqueue_flag = 0;
+
+	/* Load kernel globals structure and ShaderData strucuture */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+	ShaderData *sd = (ShaderData *)shader_data;
+
+#ifdef __KERNEL_DEBUG__
+	DebugData *debug_data = &debugdata_coop[ray_index];
+#endif
+	ccl_global PathState *state = &PathState_coop[ray_index];
+	PathRadiance *L = L = &PathRadiance_coop[ray_index];
+	ccl_global Ray *ray = &Ray_coop[ray_index];
+	ccl_global float3 *throughput = &throughput_coop[ray_index];
+	ccl_global float *L_transparent = &L_transparent_coop[ray_index];
+	ccl_global uint *rng = &rng_coop[ray_index];
+
+#ifdef __WORK_STEALING__
+	unsigned int my_work;
+	ccl_global float *initial_per_sample_output_buffers;
+	ccl_global uint *initial_rng;
+#endif
+	unsigned int sample;
+	unsigned int tile_x;
+	unsigned int tile_y;
+	unsigned int pixel_x;
+	unsigned int pixel_y;
+	unsigned int my_sample_tile;
+
+#ifdef __WORK_STEALING__
+	my_work = work_array[ray_index];
+	sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+	get_pixel_tile_position(&pixel_x, &pixel_y,
+	                        &tile_x, &tile_y,
+	                        my_work,
+	                        sw, sh, sx, sy,
+	                        parallel_samples,
+	                        ray_index);
+	my_sample_tile = 0;
+	initial_per_sample_output_buffers = per_sample_output_buffers;
+	initial_rng = rng_state;
+#else  /* __WORK_STEALING__ */
+	sample = work_array[ray_index];
+	int tile_index = ray_index / parallel_samples;
+	/* buffer and rng_state's stride is "stride". Find x and y using ray_index */
+	tile_x = tile_index % sw;
+	tile_y = tile_index / sw;
+	my_sample_tile = ray_index - (tile_index * parallel_samples);
+#endif  /* __WORK_STEALING__ */
+
+	rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
+	per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+
+	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+		/* eval background shader if nothing hit */
+		if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
+			*L_transparent = (*L_transparent) + average((*throughput));
+#ifdef __PASSES__
+			if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
+#endif
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+		}
+
+		if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND))
+			{
+#ifdef __BACKGROUND__
+				/* sample background shader */
+				float3 L_background = indirect_background(kg, state, ray, sd);
+				path_radiance_accum_background(L, (*throughput), L_background, state->bounce);
+#endif
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+			}
+	}
+
+	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+		float3 L_sum = path_radiance_clamp_and_sum(kg, L);
+		kernel_write_light_passes(kg, per_sample_output_buffers, L, sample);
+#ifdef __KERNEL_DEBUG__
+		kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample);
+#endif
+		float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
+
+		/* accumulate result in output buffer */
+		kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+		path_rng_end(kg, rng_state, *rng);
+
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+	}
+
+	if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+#ifdef __WORK_STEALING__
+		/* We have completed current work; So get next work */
+		int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
+		if(!valid_work) {
+			/* If work is invalid, this means no more work is available and the thread may exit */
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+		}
+#else  /* __WORK_STEALING__ */
+		if((sample + parallel_samples) >= end_sample) {
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+		}
+#endif  /* __WORK_STEALING__ */
+
+		if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+#ifdef __WORK_STEALING__
+			work_array[ray_index] = my_work;
+			/* Get the sample associated with the current work */
+			sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+			/* Get pixel and tile position associated with current work */
+			get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+			my_sample_tile = 0;
+
+			/* Remap rng_state according to the current work */
+			rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride);
+			/* Remap per_sample_output_buffers according to the current work */
+			per_sample_output_buffers = initial_per_sample_output_buffers
+				+ (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+#else  /* __WORK_STEALING__ */
+			work_array[ray_index] = sample + parallel_samples;
+			sample = work_array[ray_index];
+
+			/* Get ray position from ray index */
+			pixel_x = sx + ((ray_index / parallel_samples) % sw);
+			pixel_y = sy + ((ray_index / parallel_samples) / sw);
+#endif  /* __WORK_STEALING__ */
+
+			/* Initialize random numbers and ray. */
+			kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray);
+
+			if(ray->t != 0.0f) {
+				/* Initialize throughput, L_transparent, Ray, PathState;
+				 * These rays proceed with path-iteration.
+				 */
+				*throughput = make_float3(1.0f, 1.0f, 1.0f);
+				*L_transparent = 0.0f;
+				path_radiance_init(L, kernel_data.film.use_light_pass);
+				path_state_init(kg, state, rng, sample, ray);
+#ifdef __KERNEL_DEBUG__
+				debug_data_init(debug_data);
+#endif
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+				enqueue_flag = 1;
+			} else {
+				/* These rays do not participate in path-iteration. */
+				float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				/* Accumulate result in output buffer. */
+				kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+				path_rng_end(kg, rng_state, *rng);
+
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+			}
+		}
+	}
+	return enqueue_flag;
+}
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
new file mode 100644
index 00000000000..2cd98e466c1
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -0,0 +1,418 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_data_initialization kernel
+ * This kernel Initializes structures needed in path-iteration kernels.
+ * This is the first kernel in ray-tracing logic.
+ *
+ * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
+ *
+ * Its input and output are as follows,
+ *
+ * Un-initialized rng---------------|--- kernel_data_initialization ---|--- Initialized rng
+ * Un-initialized throughput -------|                                  |--- Initialized throughput
+ * Un-initialized L_transparent ----|                                  |--- Initialized L_transparent
+ * Un-initialized PathRadiance -----|                                  |--- Initialized PathRadiance
+ * Un-initialized Ray --------------|                                  |--- Initialized Ray
+ * Un-initialized PathState --------|                                  |--- Initialized PathState
+ * Un-initialized QueueData --------|                                  |--- Initialized QueueData (to QUEUE_EMPTY_SLOT)
+ * Un-initilaized QueueIndex -------|                                  |--- Initialized QueueIndex (to 0)
+ * Un-initialized use_queues_flag---|                                  |--- Initialized use_queues_flag (to false)
+ * Un-initialized ray_state --------|                                  |--- Initialized ray_state
+ * parallel_samples --------------- |                                  |--- Initialized per_sample_output_buffers
+ * rng_state -----------------------|                                  |--- Initialized work_array
+ * data ----------------------------|                                  |--- Initialized work_pool_wgs
+ * start_sample --------------------|                                  |
+ * sx ------------------------------|                                  |
+ * sy ------------------------------|                                  |
+ * sw ------------------------------|                                  |
+ * sh ------------------------------|                                  |
+ * stride --------------------------|                                  |
+ * queuesize -----------------------|                                  |
+ * num_samples ---------------------|                                  |
+ *
+ * Note on Queues :
+ * All slots in queues are initialized to queue empty slot;
+ * The number of elements in the queues is initialized to 0;
+ */
+ccl_device void kernel_data_init(
+        ccl_global char *globals,
+        ccl_global char *shader_data_sd,                  /* Arguments related to ShaderData */
+        ccl_global char *shader_data_sd_DL_shadow,        /* Arguments related to ShaderData */
+
+        ccl_global float3 *P_sd,
+        ccl_global float3 *P_sd_DL_shadow,
+
+        ccl_global float3 *N_sd,
+        ccl_global float3 *N_sd_DL_shadow,
+
+        ccl_global float3 *Ng_sd,
+        ccl_global float3 *Ng_sd_DL_shadow,
+
+        ccl_global float3 *I_sd,
+        ccl_global float3 *I_sd_DL_shadow,
+
+        ccl_global int *shader_sd,
+        ccl_global int *shader_sd_DL_shadow,
+
+        ccl_global int *flag_sd,
+        ccl_global int *flag_sd_DL_shadow,
+
+        ccl_global int *prim_sd,
+        ccl_global int *prim_sd_DL_shadow,
+
+        ccl_global int *type_sd,
+        ccl_global int *type_sd_DL_shadow,
+
+        ccl_global float *u_sd,
+        ccl_global float *u_sd_DL_shadow,
+
+        ccl_global float *v_sd,
+        ccl_global float *v_sd_DL_shadow,
+
+        ccl_global int *object_sd,
+        ccl_global int *object_sd_DL_shadow,
+
+        ccl_global float *time_sd,
+        ccl_global float *time_sd_DL_shadow,
+
+        ccl_global float *ray_length_sd,
+        ccl_global float *ray_length_sd_DL_shadow,
+
+        ccl_global int *ray_depth_sd,
+        ccl_global int *ray_depth_sd_DL_shadow,
+
+        ccl_global int *transparent_depth_sd,
+        ccl_global int *transparent_depth_sd_DL_shadow,
+
+        /* Ray differentials. */
+        ccl_global differential3 *dP_sd,
+        ccl_global differential3 *dP_sd_DL_shadow,
+
+        ccl_global differential3 *dI_sd,
+        ccl_global differential3 *dI_sd_DL_shadow,
+
+        ccl_global differential *du_sd,
+        ccl_global differential *du_sd_DL_shadow,
+
+        ccl_global differential *dv_sd,
+        ccl_global differential *dv_sd_DL_shadow,
+
+        /* Dp/Du */
+        ccl_global float3 *dPdu_sd,
+        ccl_global float3 *dPdu_sd_DL_shadow,
+
+        ccl_global float3 *dPdv_sd,
+        ccl_global float3 *dPdv_sd_DL_shadow,
+
+        /* Object motion. */
+        ccl_global Transform *ob_tfm_sd,
+        ccl_global Transform *ob_tfm_sd_DL_shadow,
+
+        ccl_global Transform *ob_itfm_sd,
+        ccl_global Transform *ob_itfm_sd_DL_shadow,
+
+        ShaderClosure *closure_sd,
+        ShaderClosure *closure_sd_DL_shadow,
+
+        ccl_global int *num_closure_sd,
+        ccl_global int *num_closure_sd_DL_shadow,
+
+        ccl_global float *randb_closure_sd,
+        ccl_global float *randb_closure_sd_DL_shadow,
+
+        ccl_global float3 *ray_P_sd,
+        ccl_global float3 *ray_P_sd_DL_shadow,
+
+        ccl_global differential3 *ray_dP_sd,
+        ccl_global differential3 *ray_dP_sd_DL_shadow,
+
+        ccl_constant KernelData *data,
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_state,
+        ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
+        ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
+        ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
+        PathRadiance *PathRadiance_coop,             /* PathRadiance array to store PathRadiance values for all rays */
+        ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
+        ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
+        ccl_global char *ray_state,                  /* Stores information on current state of a ray */
+
+#define KERNEL_TEX(type, ttype, name)                                   \
+        ccl_global type *name,
+#include "../kernel_textures.h"
+
+        int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
+        int rng_state_offset_x,
+        int rng_state_offset_y,
+        int rng_state_stride,
+        ccl_global int *Queue_data,                  /* Memory for queues */
+        ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
+        int queuesize,                               /* size (capacity) of the queue */
+        ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
+        ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
+#ifdef __WORK_STEALING__
+        ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
+        unsigned int num_samples,                    /* Total number of samples per pixel */
+#endif
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples)                        /* Number of samples to be processed in parallel */
+{
+
+	/* Load kernel globals structure */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+
+	kg->data = data;
+#define KERNEL_TEX(type, ttype, name) \
+	kg->name = name;
+#include "../kernel_textures.h"
+
+	/* Load ShaderData structure */
+	ShaderData *sd = (ShaderData *)shader_data_sd;
+	ShaderData *sd_DL_shadow = (ShaderData *)shader_data_sd_DL_shadow;
+
+	sd->P = P_sd;
+	sd_DL_shadow->P = P_sd_DL_shadow;
+
+	sd->N = N_sd;
+	sd_DL_shadow->N = N_sd_DL_shadow;
+
+	sd->Ng = Ng_sd;
+	sd_DL_shadow->Ng = Ng_sd_DL_shadow;
+
+	sd->I = I_sd;
+	sd_DL_shadow->I = I_sd_DL_shadow;
+
+	sd->shader = shader_sd;
+	sd_DL_shadow->shader = shader_sd_DL_shadow;
+
+	sd->flag = flag_sd;
+	sd_DL_shadow->flag = flag_sd_DL_shadow;
+
+	sd->prim = prim_sd;
+	sd_DL_shadow->prim = prim_sd_DL_shadow;
+
+	sd->type = type_sd;
+	sd_DL_shadow->type = type_sd_DL_shadow;
+
+	sd->u = u_sd;
+	sd_DL_shadow->u = u_sd_DL_shadow;
+
+	sd->v = v_sd;
+	sd_DL_shadow->v = v_sd_DL_shadow;
+
+	sd->object = object_sd;
+	sd_DL_shadow->object = object_sd_DL_shadow;
+
+	sd->time = time_sd;
+	sd_DL_shadow->time = time_sd_DL_shadow;
+
+	sd->ray_length = ray_length_sd;
+	sd_DL_shadow->ray_length = ray_length_sd_DL_shadow;
+
+	sd->ray_depth = ray_depth_sd;
+	sd_DL_shadow->ray_depth = ray_depth_sd_DL_shadow;
+
+	sd->transparent_depth = transparent_depth_sd;
+	sd_DL_shadow->transparent_depth = transparent_depth_sd_DL_shadow;
+
+#ifdef __RAY_DIFFERENTIALS__
+	sd->dP = dP_sd;
+	sd_DL_shadow->dP = dP_sd_DL_shadow;
+
+	sd->dI = dI_sd;
+	sd_DL_shadow->dI = dI_sd_DL_shadow;
+
+	sd->du = du_sd;
+	sd_DL_shadow->du = du_sd_DL_shadow;
+
+	sd->dv = dv_sd;
+	sd_DL_shadow->dv = dv_sd_DL_shadow;
+#ifdef __DPDU__
+	sd->dPdu = dPdu_sd;
+	sd_DL_shadow->dPdu = dPdu_sd_DL_shadow;
+
+	sd->dPdv = dPdv_sd;
+	sd_DL_shadow->dPdv = dPdv_sd_DL_shadow;
+#endif
+#endif
+
+#ifdef __OBJECT_MOTION__
+	sd->ob_tfm = ob_tfm_sd;
+	sd_DL_shadow->ob_tfm = ob_tfm_sd_DL_shadow;
+
+	sd->ob_itfm = ob_itfm_sd;
+	sd_DL_shadow->ob_itfm = ob_itfm_sd_DL_shadow;
+#endif
+
+	sd->closure = closure_sd;
+	sd_DL_shadow->closure = closure_sd_DL_shadow;
+
+	sd->num_closure = num_closure_sd;
+	sd_DL_shadow->num_closure = num_closure_sd_DL_shadow;
+
+	sd->randb_closure = randb_closure_sd;
+	sd_DL_shadow->randb_closure = randb_closure_sd_DL_shadow;
+
+	sd->ray_P = ray_P_sd;
+	sd_DL_shadow->ray_P = ray_P_sd_DL_shadow;
+
+	sd->ray_dP = ray_dP_sd;
+	sd_DL_shadow->ray_dP = ray_dP_sd_DL_shadow;
+
+	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+
+#ifdef __WORK_STEALING__
+	int lid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+	/* Initialize work_pool_wgs */
+	if(lid == 0) {
+		int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0);
+		work_pool_wgs[group_index] = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+#endif  /* __WORK_STEALING__ */
+
+	/* Initialize queue data and queue index. */
+	if(thread_index < queuesize) {
+		/* Initialize active ray queue */
+		Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		/* Initialize background and buffer update queue */
+		Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		/* Initialize shadow ray cast of AO queue */
+		Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		/* Initialize shadow ray cast of direct lighting queue */
+		Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+	}
+
+	if(thread_index == 0) {
+		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+		/* The scene-intersect kernel should not use the queues very first time.
+		 * since the queue would be empty.
+		 */
+		use_queues_flag[0] = 0;
+	}
+
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+
+	if(x < (sw * parallel_samples) && y < sh) {
+
+		int ray_index = x + y * (sw * parallel_samples);
+
+		/* This is the first assignment to ray_state;
+		 * So we dont use ASSIGN_RAY_STATE macro.
+		 */
+		ray_state[ray_index] = RAY_ACTIVE;
+
+		unsigned int my_sample;
+		unsigned int pixel_x;
+		unsigned int pixel_y;
+		unsigned int tile_x;
+		unsigned int tile_y;
+		unsigned int my_sample_tile;
+
+#ifdef __WORK_STEALING__
+		unsigned int my_work = 0;
+		/* Get work. */
+		get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
+		/* Get the sample associated with the work. */
+		my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+
+		my_sample_tile = 0;
+
+		/* Get pixel and tile position associated with the work. */
+		get_pixel_tile_position(&pixel_x, &pixel_y,
+		                        &tile_x, &tile_y,
+		                        my_work,
+		                        sw, sh, sx, sy,
+		                        parallel_samples,
+		                        ray_index);
+		work_array[ray_index] = my_work;
+#else  /* __WORK_STEALING__ */
+		unsigned int tile_index = ray_index / parallel_samples;
+		tile_x = tile_index % sw;
+		tile_y = tile_index / sw;
+		my_sample_tile = ray_index - (tile_index * parallel_samples);
+		my_sample = my_sample_tile + start_sample;
+
+		/* Initialize work array. */
+		work_array[ray_index] = my_sample ;
+
+		/* Calculate pixel position of this ray. */
+		pixel_x = sx + tile_x;
+		pixel_y = sy + tile_y;
+#endif  /* __WORK_STEALING__ */
+
+		rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
+
+		/* Initialise per_sample_output_buffers to all zeros. */
+		per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride;
+		int per_sample_output_buffers_iterator = 0;
+		for(per_sample_output_buffers_iterator = 0;
+		    per_sample_output_buffers_iterator < kernel_data.film.pass_stride;
+		    per_sample_output_buffers_iterator++)
+		{
+			per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f;
+		}
+
+		/* Initialize random numbers and ray. */
+		kernel_path_trace_setup(kg,
+		                        rng_state,
+		                        my_sample,
+		                        pixel_x, pixel_y,
+		                        &rng_coop[ray_index],
+		                        &Ray_coop[ray_index]);
+
+		if(Ray_coop[ray_index].t != 0.0f) {
+			/* Initialize throuput, L_transparent, Ray, PathState;
+			 * These rays proceed with path-iteration.
+			 */
+			throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
+			L_transparent_coop[ray_index] = 0.0f;
+			path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass);
+			path_state_init(kg,
+			                &PathState_coop[ray_index],
+			                &rng_coop[ray_index],
+			                my_sample,
+			                &Ray_coop[ray_index]);
+#ifdef __KERNEL_DEBUG__
+			debug_data_init(&debugdata_coop[ray_index]);
+#endif
+		} else {
+			/* These rays do not participate in path-iteration. */
+
+			float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			/* Accumulate result in output buffer. */
+			kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad);
+			path_rng_end(kg, rng_state, rng_coop[ray_index]);
+
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+		}
+	}
+
+	/* Mark rest of the ray-state indices as RAY_INACTIVE. */
+	if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) {
+		/* First assignment, hence we dont use ASSIGN_RAY_STATE macro */
+		ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE;
+	}
+}
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
new file mode 100644
index 00000000000..50c83d06140
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_direct_lighting kernel.
+ * This is the eighth kernel in the ray tracing logic. This is the seventh
+ * of the path iteration kernels. This kernel takes care of direct lighting
+ * logic. However, the "shadow ray cast" part of direct lighting is handled
+ * in the next kernel.
+ *
+ * This kernels determines the rays for which a shadow_blocked() function associated with direct lighting should be executed.
+ * Those rays for which a shadow_blocked() function for direct-lighting must be executed, are marked with flag RAY_SHADOW_RAY_CAST_DL and
+ * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop -----------------------------------------|--- kernel_direct_lighting --|--- BSDFEval_coop
+ * PathState_coop -----------------------------------|                             |--- ISLamp_coop
+ * shader_data --------------------------------------|                             |--- LightRay_coop
+ * ray_state ----------------------------------------|                             |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                             |
+ * kg (globals + data) ------------------------------|                             |
+ * queuesize ----------------------------------------|                             |
+ *
+ * note on shader_DL : shader_DL is neither input nor output to this kernel; shader_DL is filled and consumed in this kernel itself.
+ * Note on Queues :
+ * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
+ * only the rays of state RAY_ACTIVE; If a ray needs to execute the corresponding shadow_blocked
+ * part, after direct lighting, the ray is marked with RAY_SHADOW_RAY_CAST_DL flag.
+ *
+ * State of queues when this kernel is called :
+ * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
+ * before and after this kernel call.
+ * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this
+ * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
+ */
+ccl_device char kernel_direct_lighting(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,           /* Required for direct lighting */
+        ccl_global char *shader_DL,             /* Required for direct lighting */
+        ccl_global uint *rng_coop,              /* Required for direct lighting */
+        ccl_global PathState *PathState_coop,   /* Required for direct lighting */
+        ccl_global int *ISLamp_coop,            /* Required for direct lighting */
+        ccl_global Ray *LightRay_coop,          /* Required for direct lighting */
+        ccl_global BsdfEval *BSDFEval_coop,     /* Required for direct lighting */
+        ccl_global char *ray_state,             /* Denotes the state of each ray */
+        int ray_index)
+{
+	char enqueue_flag = 0;
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		/* Load kernel globals structure and ShaderData structure. */
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd = (ShaderData *)shader_data;
+		ShaderData *sd_DL  = (ShaderData *)shader_DL;
+
+		ccl_global PathState *state = &PathState_coop[ray_index];
+
+		/* direct lighting */
+#ifdef __EMISSION__
+		if((kernel_data.integrator.use_direct_light &&
+		    (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)))
+		{
+			/* Sample illumination from lights to find path contribution. */
+			ccl_global RNG* rng = &rng_coop[ray_index];
+			float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+			float light_u, light_v;
+			path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+			LightSample ls;
+			light_sample(kg,
+			             light_t, light_u, light_v,
+			             ccl_fetch(sd, time),
+			             ccl_fetch(sd, P),
+			             state->bounce,
+			             &ls);
+
+			Ray light_ray;
+#ifdef __OBJECT_MOTION__
+			light_ray.time = ccl_fetch(sd, time);
+#endif
+
+			BsdfEval L_light;
+			bool is_lamp;
+			if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp,
+			                   state->bounce, state->transparent_bounce, sd_DL))
+			{
+				/* Write intermediate data to global memory to access from
+				 * the next kernel.
+				 */
+				LightRay_coop[ray_index] = light_ray;
+				BSDFEval_coop[ray_index] = L_light;
+				ISLamp_coop[ray_index] = is_lamp;
+				/* Mark ray state for next shadow kernel. */
+				ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
+				enqueue_flag = 1;
+			}
+		}
+#endif  /* __EMISSION__ */
+	}
+	return enqueue_flag;
+}
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
new file mode 100644
index 00000000000..a75523a3e53
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -0,0 +1,264 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel.
+ * This is the sixth kernel in the ray tracing logic. This is the fifth
+ * of the path iteration kernels. This kernel takes care of the logic to process
+ * "material of type holdout", indirect primitive emission, bsdf blurring,
+ * probabilistic path termination and AO.
+ *
+ * This kernels determines the rays for which a shadow_blocked() function associated with AO should be executed.
+ * Those rays for which a shadow_blocked() function for AO must be executed are marked with flag RAY_SHADOW_RAY_CAST_ao and
+ * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS
+ *
+ * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * throughput_coop --------------------------------------|                                                           |--- PathState_coop
+ * PathRadiance_coop ------------------------------------|                                                           |--- throughput_coop
+ * Intersection_coop ------------------------------------|                                                           |--- L_transparent_coop
+ * PathState_coop ---------------------------------------|                                                           |--- per_sample_output_buffers
+ * L_transparent_coop -----------------------------------|                                                           |--- PathRadiance_coop
+ * shader_data ------------------------------------------|                                                           |--- ShaderData
+ * ray_state --------------------------------------------|                                                           |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------|                                                           |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                                           |--- AOAlpha_coop
+ * kg (globals + data) ----------------------------------|                                                           |--- AOBSDF_coop
+ * parallel_samples -------------------------------------|                                                           |--- AOLightRay_coop
+ * per_sample_output_buffers ----------------------------|                                                           |
+ * sw ---------------------------------------------------|                                                           |
+ * sh ---------------------------------------------------|                                                           |
+ * sx ---------------------------------------------------|                                                           |
+ * sy ---------------------------------------------------|                                                           |
+ * stride -----------------------------------------------|                                                           |
+ * work_array -------------------------------------------|                                                           |
+ * queuesize --------------------------------------------|                                                           |
+ * start_sample -----------------------------------------|                                                           |
+ *
+ * Note on Queues :
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
+ * the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER
+ * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
+ * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
+ * changed to RAY_UPDATE_BUFFER, there is no problem.
+ *
+ * State of queues when this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO
+ */
+ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Required throughout the kernel except probabilistic path termination and AO */
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_coop,             /* Required for "kernel_write_data_passes" and AO */
+        ccl_global float3 *throughput_coop,    /* Required for handling holdout material and AO */
+        ccl_global float *L_transparent_coop,  /* Required for handling holdout material */
+        PathRadiance *PathRadiance_coop,       /* Required for "kernel_write_data_passes" and indirect primitive emission */
+        ccl_global PathState *PathState_coop,  /* Required throughout the kernel and AO */
+        Intersection *Intersection_coop,       /* Required for indirect primitive emission */
+        ccl_global float3 *AOAlpha_coop,       /* Required for AO */
+        ccl_global float3 *AOBSDF_coop,        /* Required for AO */
+        ccl_global Ray *AOLightRay_coop,       /* Required for AO */
+        int sw, int sh, int sx, int sy, int stride,
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        ccl_global unsigned int *work_array,   /* Denotes the work that each ray belongs to */
+#ifdef __WORK_STEALING__
+        unsigned int start_sample,
+#endif
+        int parallel_samples,                  /* Number of samples to be processed in parallel */
+        int ray_index,
+        char *enqueue_flag,
+        char *enqueue_flag_AO_SHADOW_RAY_CAST)
+{
+	/* Load kernel globals structure and ShaderData structure */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+	ShaderData *sd = (ShaderData *)shader_data;
+
+#ifdef __WORK_STEALING__
+	unsigned int my_work;
+	unsigned int pixel_x;
+	unsigned int pixel_y;
+#endif
+	unsigned int tile_x;
+	unsigned int tile_y;
+	int my_sample_tile;
+	unsigned int sample;
+
+	ccl_global RNG *rng = 0x0;
+	ccl_global PathState *state = 0x0;
+	float3 throughput;
+
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+
+		throughput = throughput_coop[ray_index];
+		state = &PathState_coop[ray_index];
+		rng = &rng_coop[ray_index];
+#ifdef __WORK_STEALING__
+		my_work = work_array[ray_index];
+		sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+		get_pixel_tile_position(&pixel_x, &pixel_y,
+		                        &tile_x, &tile_y,
+		                        my_work,
+		                        sw, sh, sx, sy,
+		                        parallel_samples,
+		                        ray_index);
+		my_sample_tile = 0;
+#else  /* __WORK_STEALING__ */
+		sample = work_array[ray_index];
+		/* Buffer's stride is "stride"; Find x and y using ray_index. */
+		int tile_index = ray_index / parallel_samples;
+		tile_x = tile_index % sw;
+		tile_y = tile_index / sw;
+		my_sample_tile = ray_index - (tile_index * parallel_samples);
+#endif  /* __WORK_STEALING__ */
+		per_sample_output_buffers +=
+		    (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) *
+		    kernel_data.film.pass_stride;
+
+		/* holdout */
+#ifdef __HOLDOUT__
+		if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) &&
+		   (state->flag & PATH_RAY_CAMERA))
+		{
+			if(kernel_data.background.transparent) {
+				float3 holdout_weight;
+
+				if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK)
+					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
+				else
+					holdout_weight = shader_holdout_eval(kg, sd);
+
+				/* any throughput is ok, should all be identical here */
+				L_transparent_coop[ray_index] += average(holdout_weight*throughput);
+			}
+
+			if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) {
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+				*enqueue_flag = 1;
+			}
+		}
+#endif  /* __HOLDOUT__ */
+	}
+
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		PathRadiance *L = &PathRadiance_coop[ray_index];
+		/* Holdout mask objects do not write data passes. */
+		kernel_write_data_passes(kg,
+		                         per_sample_output_buffers,
+		                         L,
+		                         sd,
+		                         sample,
+		                         state,
+		                         throughput);
+		/* Blurring of bsdf after bounces, for rays that have a small likelihood
+		 * of following this particular path (diffuse, rough glossy.
+		 */
+		if(kernel_data.integrator.filter_glossy != FLT_MAX) {
+			float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
+			if(blur_pdf < 1.0f) {
+				float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
+				shader_bsdf_blur(kg, sd, blur_roughness);
+			}
+		}
+
+#ifdef __EMISSION__
+		/* emission */
+		if(ccl_fetch(sd, flag) & SD_EMISSION) {
+			/* TODO(sergey): is isect.t wrong here for transparent surfaces? */
+			float3 emission = indirect_primitive_emission(
+			        kg,
+			        sd,
+			        Intersection_coop[ray_index].t,
+			        state->flag,
+			        state->ray_pdf);
+			path_radiance_accum_emission(L, throughput, emission, state->bounce);
+		}
+#endif  /* __EMISSION__ */
+
+		/* Path termination. this is a strange place to put the termination, it's
+		 * mainly due to the mixed in MIS that we use. gives too many unneeded
+		 * shader evaluations, only need emission if we are going to terminate.
+		 */
+		float probability = path_state_terminate_probability(kg, state, throughput);
+
+		if(probability == 0.0f) {
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+			*enqueue_flag = 1;
+		}
+
+		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+			if(probability != 1.0f) {
+				float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
+				if(terminate >= probability) {
+					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+					*enqueue_flag = 1;
+				} else {
+					throughput_coop[ray_index] = throughput/probability;
+				}
+			}
+		}
+	}
+
+#ifdef __AO__
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		/* ambient occlusion */
+		if(kernel_data.integrator.use_ambient_occlusion ||
+		   (ccl_fetch(sd, flag) & SD_AO))
+		{
+			/* todo: solve correlation */
+			float bsdf_u, bsdf_v;
+			path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+
+			float ao_factor = kernel_data.background.ao_factor;
+			float3 ao_N;
+			AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
+			AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd);
+
+			float3 ao_D;
+			float ao_pdf;
+			sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
+
+			if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+				Ray _ray;
+				_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+				_ray.D = ao_D;
+				_ray.t = kernel_data.background.ao_distance;
+#ifdef __OBJECT_MOTION__
+				_ray.time = ccl_fetch(sd, time);
+#endif
+				_ray.dP = ccl_fetch(sd, dP);
+				_ray.dD = differential3_zero();
+				AOLightRay_coop[ray_index] = _ray;
+
+				ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+				*enqueue_flag_AO_SHADOW_RAY_CAST = 1;
+			}
+		}
+	}
+#endif  /* __AO__ */
+}
diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h
new file mode 100644
index 00000000000..a8e4b0a06c8
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_lamp_emission.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_lamp_emission
+ * This is the 3rd kernel in the ray-tracing logic. This is the second of the
+ * path-iteration kernels. This kernel takes care of the indirect lamp emission logic.
+ * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE
+ * and RAY_HIT_BACKGROUND.
+ * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel.
+ * The input/output of the kernel is as follows,
+ * Throughput_coop ------------------------------------|--- kernel_lamp_emission --|--- PathRadiance_coop
+ * Ray_coop -------------------------------------------|                           |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * PathState_coop -------------------------------------|                           |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * kg (globals + data) --------------------------------|                           |
+ * Intersection_coop ----------------------------------|                           |
+ * ray_state ------------------------------------------|                           |
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----|                           |
+ * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----|                           |
+ * queuesize ------------------------------------------|                           |
+ * use_queues_flag ------------------------------------|                           |
+ * sw -------------------------------------------------|                           |
+ * sh -------------------------------------------------|                           |
+ * parallel_samples -----------------------------------|                           |
+ *
+ * note : shader_data is neither input nor output. Its just filled and consumed in the same, kernel_lamp_emission, kernel.
+ */
+ccl_device void kernel_lamp_emission(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Required for lamp emission */
+        ccl_global float3 *throughput_coop,    /* Required for lamp emission */
+        PathRadiance *PathRadiance_coop,       /* Required for lamp emission */
+        ccl_global Ray *Ray_coop,              /* Required for lamp emission */
+        ccl_global PathState *PathState_coop,  /* Required for lamp emission */
+        Intersection *Intersection_coop,       /* Required for lamp emission */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        int sw, int sh,
+        ccl_global char *use_queues_flag,      /* Used to decide if this kernel should use
+                                                * queues to fetch ray index
+                                                */
+        int parallel_samples,                  /* Number of samples to be processed in parallel */
+        int ray_index)
+{
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
+	   IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND))
+	{
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd = (ShaderData *)shader_data;
+		PathRadiance *L = &PathRadiance_coop[ray_index];
+
+		float3 throughput = throughput_coop[ray_index];
+		Ray ray = Ray_coop[ray_index];
+		PathState state = PathState_coop[ray_index];
+
+#ifdef __LAMP_MIS__
+		if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
+			/* ray starting from previous non-transparent bounce */
+			Ray light_ray;
+
+			light_ray.P = ray.P - state.ray_t*ray.D;
+			state.ray_t += Intersection_coop[ray_index].t;
+			light_ray.D = ray.D;
+			light_ray.t = state.ray_t;
+			light_ray.time = ray.time;
+			light_ray.dD = ray.dD;
+			light_ray.dP = ray.dP;
+			/* intersect with lamp */
+			float3 emission;
+
+			if(indirect_lamp_emission(kg, &state, &light_ray, &emission, sd)) {
+				path_radiance_accum_emission(L, throughput, emission, state.bounce);
+			}
+		}
+#endif  /* __LAMP_MIS__ */
+
+		/* __VOLUME__ feature is disabled */
+#if 0
+#ifdef __VOLUME__
+		/* volume attenuation, emission, scatter */
+		if(state.volume_stack[0].shader != SHADER_NONE) {
+			Ray volume_ray = ray;
+			volume_ray.t = (hit)? isect.t: FLT_MAX;
+
+			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
+
+#ifdef __VOLUME_DECOUPLED__
+			int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
+			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method);
+
+			if(decoupled) {
+				/* cache steps along volume for repeated sampling */
+				VolumeSegment volume_segment;
+				ShaderData volume_sd;
+
+				shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
+				kernel_volume_decoupled_record(kg, &state,
+					&volume_ray, &volume_sd, &volume_segment, heterogeneous);
+
+				volume_segment.sampling_method = sampling_method;
+
+				/* emission */
+				if(volume_segment.closure_flag & SD_EMISSION)
+					path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+
+				/* scattering */
+				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
+
+				if(volume_segment.closure_flag & SD_SCATTER) {
+					bool all = false;
+
+					/* direct light sampling */
+					kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
+						throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
+
+					/* indirect sample. if we use distance sampling and take just
+					 * one sample for direct and indirect light, we could share
+					 * this computation, but makes code a bit complex */
+					float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
+					float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
+
+					result = kernel_volume_decoupled_scatter(kg,
+						&state, &volume_ray, &volume_sd, &throughput,
+						rphase, rscatter, &volume_segment, NULL, true);
+				}
+
+				if(result != VOLUME_PATH_SCATTERED)
+					throughput *= volume_segment.accum_transmittance;
+
+				/* free cached steps */
+				kernel_volume_decoupled_free(kg, &volume_segment);
+
+				if(result == VOLUME_PATH_SCATTERED) {
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+						continue;
+					else
+						break;
+				}
+			}
+			else
+#endif  /* __VOLUME_DECOUPLED__ */
+			{
+				/* integrate along volume segment with distance sampling */
+				ShaderData volume_sd;
+				VolumeIntegrateResult result = kernel_volume_integrate(
+					kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng, heterogeneous);
+
+#ifdef __VOLUME_SCATTER__
+				if(result == VOLUME_PATH_SCATTERED) {
+					/* direct lighting */
+					kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L);
+
+					/* indirect light bounce */
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+						continue;
+					else
+						break;
+				}
+#endif  /* __VOLUME_SCATTER__ */
+			}
+		}
+#endif  /* __VOLUME__ */
+#endif
+	}
+}
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
new file mode 100644
index 00000000000..2dbdabc5fd3
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_setup_next_iteration kernel.
+ * This is the tenth kernel in the ray tracing logic. This is the ninth
+ * of the path iteration kernels. This kernel takes care of setting up
+ * Ray for the next iteration of path-iteration and accumulating radiance
+ * corresponding to AO and direct-lighting
+ *
+ * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_next_iteration_setup -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * throughput_coop --------------------------------------|                                 |--- Queue_data (QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
+ * PathRadiance_coop ------------------------------------|                                 |--- throughput_coop
+ * PathState_coop ---------------------------------------|                                 |--- PathRadiance_coop
+ * shader_data ------------------------------------------|                                 |--- PathState_coop
+ * ray_state --------------------------------------------|                                 |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS) --------|                                 |--- Ray_coop
+ * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                 |--- use_queues_flag
+ * Ray_coop ---------------------------------------------|                                 |
+ * kg (globals + data) ----------------------------------|                                 |
+ * LightRay_dl_coop -------------------------------------|
+ * ISLamp_coop ------------------------------------------|
+ * BSDFEval_coop ----------------------------------------|
+ * LightRay_ao_coop -------------------------------------|
+ * AOBSDF_coop ------------------------------------------|
+ * AOAlpha_coop -----------------------------------------|
+ *
+ * Note on queues,
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
+ * the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFF
+ * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
+ * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
+ * changed to RAY_UPDATE_BUFF, there is no problem.
+ *
+ * State of queues when this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays
+ */
+ccl_device char kernel_next_iteration_setup(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,         /* Required for setting up ray for next iteration */
+        ccl_global uint *rng_coop,            /* Required for setting up ray for next iteration */
+        ccl_global float3 *throughput_coop,   /* Required for setting up ray for next iteration */
+        PathRadiance *PathRadiance_coop,      /* Required for setting up ray for next iteration */
+        ccl_global Ray *Ray_coop,             /* Required for setting up ray for next iteration */
+        ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
+        ccl_global Ray *LightRay_dl_coop,     /* Required for radiance update - direct lighting */
+        ccl_global int *ISLamp_coop,          /* Required for radiance update - direct lighting */
+        ccl_global BsdfEval *BSDFEval_coop,   /* Required for radiance update - direct lighting */
+        ccl_global Ray *LightRay_ao_coop,     /* Required for radiance update - AO */
+        ccl_global float3 *AOBSDF_coop,       /* Required for radiance update - AO */
+        ccl_global float3 *AOAlpha_coop,      /* Required for radiance update - AO */
+        ccl_global char *ray_state,           /* Denotes the state of each ray */
+        ccl_global char *use_queues_flag,     /* flag to decide if scene_intersect kernel should
+                                               * use queues to fetch ray index */
+        int ray_index)
+{
+	char enqueue_flag = 0;
+
+	/* Load kernel globals structure and ShaderData structure. */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+	ShaderData *sd = (ShaderData *)shader_data;
+	PathRadiance *L = 0x0;
+	ccl_global PathState *state = 0x0;
+
+	/* Path radiance update for AO/Direct_lighting's shadow blocked. */
+	if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
+	   IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
+	 {
+		state = &PathState_coop[ray_index];
+		L = &PathRadiance_coop[ray_index];
+		float3 _throughput = throughput_coop[ray_index];
+
+		if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
+			float3 shadow = LightRay_ao_coop[ray_index].P;
+			char update_path_radiance = LightRay_ao_coop[ray_index].t;
+			if(update_path_radiance) {
+				path_radiance_accum_ao(L,
+				                       _throughput,
+				                       AOAlpha_coop[ray_index],
+				                       AOBSDF_coop[ray_index],
+				                       shadow,
+				                       state->bounce);
+			}
+			REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+		}
+
+		if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
+			float3 shadow = LightRay_dl_coop[ray_index].P;
+			char update_path_radiance = LightRay_dl_coop[ray_index].t;
+			if(update_path_radiance) {
+				BsdfEval L_light = BSDFEval_coop[ray_index];
+				path_radiance_accum_light(L,
+				                          _throughput,
+				                          &L_light,
+				                          shadow,
+				                          1.0f,
+				                          state->bounce,
+				                          ISLamp_coop[ray_index]);
+			}
+			REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
+		}
+	}
+
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+
+		ccl_global float3 *throughput = &throughput_coop[ray_index];
+		ccl_global Ray *ray = &Ray_coop[ray_index];
+		ccl_global RNG* rng = &rng_coop[ray_index];
+		state = &PathState_coop[ray_index];
+		L = &PathRadiance_coop[ray_index];
+
+		/* compute direct lighting and next bounce */
+		if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) {
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+			enqueue_flag = 1;
+		}
+	}
+
+	return enqueue_flag;
+}
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
new file mode 100644
index 00000000000..73aa005a496
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_scene_intersect kernel.
+ * This is the second kernel in the ray tracing logic. This is the first
+ * of the path iteration kernels. This kernel takes care of scene_intersect function.
+ *
+ * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE.
+ * This kernel processes rays of ray state RAY_ACTIVE
+ * This kernel determines the rays that have hit the background and changes their ray state to RAY_HIT_BACKGROUND.
+ *
+ * The input and output are as follows,
+ *
+ * Ray_coop ---------------------------------------|--------- kernel_scene_intersect----------|--- PathState
+ * PathState_coop ---------------------------------|                                          |--- Intersection
+ * ray_state --------------------------------------|                                          |--- ray_state
+ * use_queues_flag --------------------------------|                                          |
+ * parallel_samples -------------------------------|                                          |
+ * QueueData(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                                          |
+ * kg (data + globals) ----------------------------|                                          |
+ * rng_coop ---------------------------------------|                                          |
+ * sw ---------------------------------------------|                                          |
+ * sh ---------------------------------------------|                                          |
+ * queuesize --------------------------------------|                                          |
+ *
+ * Note on Queues :
+ * Ideally we would want kernel_scene_intersect to work on queues.
+ * But during the very first time, the queues will be empty and hence we perform a direct mapping
+ * between ray-index and thread-index; From the next time onward, the queue will be filled and
+ * we may start operating on queues.
+ *
+ * State of queue during the first time this kernel is called :
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.before and after this kernel
+ *
+ * State of queues during other times this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will have a mix of RAY_ACTIVE, RAY_UPDATE_BUFFER and RAY_REGENERATED rays;
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays ;
+ * (The rays that are in the state RAY_UPDATE_BUFFER in both the queues are actually the same rays; These
+ * are the rays that were in RAY_ACTIVE state during the initial enqueue but on further processing
+ * , by different kernels, have turned into RAY_UPDATE_BUFFER rays. Since all kernel, even after fetching from
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS, proceed further based on ray state information, RAY_UPDATE_BUFFER rays
+ * being present in QUEUE_ACTIVE_AND_REGENERATED_RAYS does not cause any logical issues)
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS - All RAY_REGENERATED rays will have been converted to RAY_ACTIVE and
+ * Some rays in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue will move to state RAY_HIT_BACKGROUND
+ * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS - no change
+ */
+
+ccl_device void kernel_scene_intersect(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global uint *rng_coop,
+        ccl_global Ray *Ray_coop,              /* Required for scene_intersect */
+        ccl_global PathState *PathState_coop,  /* Required for scene_intersect */
+        Intersection *Intersection_coop,       /* Required for scene_intersect */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        int sw, int sh,
+        ccl_global char *use_queues_flag,      /* used to decide if this kernel should use
+                                                * queues to fetch ray index */
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples,                  /* Number of samples to be processed in parallel */
+        int ray_index)
+{
+	/* All regenerated rays become active here */
+	if(IS_STATE(ray_state, ray_index, RAY_REGENERATED))
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE);
+
+	if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE))
+		return;
+
+	/* Load kernel globals structure */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+
+#ifdef __KERNEL_DEBUG__
+	DebugData *debug_data = &debugdata_coop[ray_index];
+#endif
+	Intersection *isect = &Intersection_coop[ray_index];
+	PathState state = PathState_coop[ray_index];
+	Ray ray = Ray_coop[ray_index];
+
+	/* intersect scene */
+	uint visibility = path_state_ray_visibility(kg, &state);
+
+#ifdef __HAIR__
+	float difl = 0.0f, extmax = 0.0f;
+	uint lcg_state = 0;
+	RNG rng = rng_coop[ray_index];
+
+	if(kernel_data.bvh.have_curves) {
+		if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {
+			float3 pixdiff = ray.dD.dx + ray.dD.dy;
+			/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
+			difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
+		}
+
+		extmax = kernel_data.curve.maximum_width;
+		lcg_state = lcg_state_init(&rng, &state, 0x51633e2d);
+	}
+
+	bool hit = scene_intersect(kg, &ray, visibility, isect, &lcg_state, difl, extmax);
+#else
+	bool hit = scene_intersect(kg, &ray, visibility, isect, NULL, 0.0f, 0.0f);
+#endif
+
+#ifdef __KERNEL_DEBUG__
+	if(state.flag & PATH_RAY_CAMERA) {
+		debug_data->num_bvh_traversal_steps += isect->num_traversal_steps;
+		debug_data->num_bvh_traversed_instances += isect->num_traversed_instances;
+	}
+	debug_data->num_ray_bounces++;
+#endif
+
+	if(!hit) {
+		/* Change the state of rays that hit the background;
+		 * These rays undergo special processing in the
+		 * background_bufferUpdate kernel*/
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
+	}
+}
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
new file mode 100644
index 00000000000..e6fdc592586
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shader_eval.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_shader_eval kernel
+ * This kernel is the 5th kernel in the ray tracing logic. This is
+ * the 4rd kernel in path iteration. This kernel sets up the ShaderData
+ * structure from the values computed by the previous kernels. It also identifies
+ * the rays of state RAY_TO_REGENERATE and enqueues them in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+ *
+ * The input and output of the kernel is as follows,
+ * rng_coop -------------------------------------------|--- kernel_shader_eval --|--- shader_data
+ * Ray_coop -------------------------------------------|                         |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * PathState_coop -------------------------------------|                         |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Intersection_coop ----------------------------------|                         |
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS)-------|                         |
+ * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)---|                         |
+ * ray_state ------------------------------------------|                         |
+ * kg (globals + data) --------------------------------|                         |
+ * queuesize ------------------------------------------|                         |
+ *
+ * Note on Queues :
+ * This kernel reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
+ * only the rays of state RAY_ACTIVE;
+ * State of queues when this kernel is called,
+ * at entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ * at exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays
+ */
+ccl_device void kernel_shader_eval(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Output ShaderData structure to be filled */
+        ccl_global uint *rng_coop,             /* Required for rbsdf calculation */
+        ccl_global Ray *Ray_coop,              /* Required for setting up shader from ray */
+        ccl_global PathState *PathState_coop,  /* Required for all functions in this kernel */
+        Intersection *Intersection_coop,       /* Required for setting up shader from ray */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        int ray_index)
+{
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd = (ShaderData *)shader_data;
+		Intersection *isect = &Intersection_coop[ray_index];
+		ccl_global uint *rng = &rng_coop[ray_index];
+		ccl_global PathState *state = &PathState_coop[ray_index];
+		Ray ray = Ray_coop[ray_index];
+
+		shader_setup_from_ray(kg,
+		                      sd,
+		                      isect,
+		                      &ray,
+		                      state->bounce,
+		                      state->transparent_bounce);
+		float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF);
+		shader_eval_surface(kg, sd, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
+	}
+}
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked.h b/intern/cycles/kernel/split/kernel_shadow_blocked.h
new file mode 100644
index 00000000000..154ec53ffbb
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_shadow_blocked kernel.
+ * This is the ninth kernel in the ray tracing logic. This is the eighth
+ * of the path iteration kernels. This kernel takes care of "shadow ray cast"
+ * logic of the direct lighting and AO  part of ray tracing.
+ *
+ * The input and output are as follows,
+ *
+ * PathState_coop ----------------------------------|--- kernel_shadow_blocked --|
+ * LightRay_dl_coop --------------------------------|                            |--- LightRay_dl_coop
+ * LightRay_ao_coop --------------------------------|                            |--- LightRay_ao_coop
+ * ray_state ---------------------------------------|                            |--- ray_state
+ * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS &       |                            |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_AO_RAYS)
+              QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                            |
+ * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS&
+              QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                            |
+ * kg (globals + data) -----------------------------|                            |
+ * queuesize ---------------------------------------|                            |
+ *
+ * Note on shader_shadow : shader_shadow is neither input nor output to this kernel. shader_shadow is filled and consumed in this kernel itself.
+ * Note on queues :
+ * The kernel fetches from QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS queues. We will empty
+ * these queues this kernel.
+ * State of queues when this kernel is called :
+ * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
+ * before and after this kernel call.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS will be filled with rays marked with flags RAY_SHADOW_RAY_CAST_AO
+ * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit.
+ */
+ccl_device void kernel_shadow_blocked(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_shadow,        /* Required for shadow blocked */
+        ccl_global PathState *PathState_coop,  /* Required for shadow blocked */
+        ccl_global Ray *LightRay_dl_coop,      /* Required for direct lighting's shadow blocked */
+        ccl_global Ray *LightRay_ao_coop,      /* Required for AO's shadow blocked */
+        Intersection *Intersection_coop_AO,
+        Intersection *Intersection_coop_DL,
+        ccl_global char *ray_state,
+        int total_num_rays,
+        char shadow_blocked_type,
+        int ray_index)
+{
+	/* Flag determining if we need to update L. */
+	char update_path_radiance = 0;
+
+	if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
+	   IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
+	{
+		/* Load kernel global structure */
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd_shadow  = (ShaderData *)shader_shadow;
+
+		ccl_global PathState *state = &PathState_coop[ray_index];
+		ccl_global Ray *light_ray_dl_global = &LightRay_dl_coop[ray_index];
+		ccl_global Ray *light_ray_ao_global = &LightRay_ao_coop[ray_index];
+		Intersection *isect_ao_global = &Intersection_coop_AO[ray_index];
+		Intersection *isect_dl_global = &Intersection_coop_DL[ray_index];
+
+		ccl_global Ray *light_ray_global =
+		        shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO
+		                 ? light_ray_ao_global
+		                 : light_ray_dl_global;
+		Intersection *isect_global =
+		        RAY_SHADOW_RAY_CAST_AO ? isect_ao_global : isect_dl_global;
+
+		float3 shadow;
+		update_path_radiance = !(shadow_blocked(kg,
+		                                        state,
+		                                        light_ray_global,
+		                                        &shadow,
+		                                        sd_shadow,
+		                                        isect_global));
+
+		/* We use light_ray_global's P and t to store shadow and
+		 * update_path_radiance.
+		 */
+		light_ray_global->P = shadow;
+		light_ray_global->t = update_path_radiance;
+	}
+}
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
new file mode 100644
index 00000000000..e1c7e2cea99
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef  __KERNEL_SPLIT_H__
+#define  __KERNEL_SPLIT_H__
+
+#include "kernel_compat_opencl.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+
+#include "util_atomic.h"
+
+#include "kernel_random.h"
+#include "kernel_projection.h"
+#include "kernel_montecarlo.h"
+#include "kernel_differential.h"
+#include "kernel_camera.h"
+
+#include "geom/geom.h"
+
+#include "kernel_accumulate.h"
+#include "kernel_shader.h"
+#include "kernel_light.h"
+#include "kernel_passes.h"
+
+#ifdef __SUBSURFACE__
+#include "kernel_subsurface.h"
+#endif
+
+#ifdef __VOLUME__
+#include "kernel_volume.h"
+#endif
+
+#include "kernel_path_state.h"
+#include "kernel_shadow.h"
+#include "kernel_emission.h"
+#include "kernel_path_common.h"
+#include "kernel_path_surface.h"
+#include "kernel_path_volume.h"
+
+#ifdef __KERNEL_DEBUG__
+#include "kernel_debug.h"
+#endif
+
+#include "kernel_queues.h"
+#include "kernel_work_stealing.h"
+
+#endif  /* __KERNEL_SPLIT_H__ */
diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h
new file mode 100644
index 00000000000..a21e9b6a0b1
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_sum_all_radiance.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../kernel_compat_opencl.h"
+#include "../kernel_math.h"
+#include "../kernel_types.h"
+#include "../kernel_globals.h"
+
+/* Since we process various samples in parallel; The output radiance of different samples
+ * are stored in different locations; This kernel combines the output radiance contributed
+ * by all different samples and stores them in the RenderTile's output buffer.
+ */
+ccl_device void kernel_sum_all_radiance(
+        ccl_constant KernelData *data,               /* To get pass_stride to offet into buffer */
+        ccl_global float *buffer,                    /* Output buffer of RenderTile */
+        ccl_global float *per_sample_output_buffer,  /* Radiance contributed by all samples */
+        int parallel_samples, int sw, int sh, int stride,
+        int buffer_offset_x,
+        int buffer_offset_y,
+        int buffer_stride,
+        int start_sample)
+{
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+
+	if(x < sw && y < sh) {
+		buffer += ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * (data->film.pass_stride);
+		per_sample_output_buffer += ((x + y * stride) * parallel_samples) * (data->film.pass_stride);
+
+		int sample_stride = (data->film.pass_stride);
+
+		int sample_iterator = 0;
+		int pass_stride_iterator = 0;
+		int num_floats = data->film.pass_stride;
+
+		for(sample_iterator = 0; sample_iterator < parallel_samples; sample_iterator++) {
+			for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) {
+				*(buffer + pass_stride_iterator) =
+				        (start_sample == 0 && sample_iterator == 0)
+				                ? *(per_sample_output_buffer + pass_stride_iterator)
+				                : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator);
+			}
+			per_sample_output_buffer += sample_stride;
+		}
+	}
+}
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index d59c9b9e61c..15ac6519780 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -87,7 +87,7 @@ ccl_device_inline int stack_load_int(float *stack, uint a)
 	return __float_as_int(stack[a]);
 }
 
-ccl_device_inline float stack_load_int_default(float *stack, uint a, uint value)
+ccl_device_inline int stack_load_int_default(float *stack, uint a, uint value)
 {
 	return (a == (uint)SVM_STACK_INVALID)? (int)value: stack_load_int(stack, a);
 }
@@ -142,6 +142,8 @@ CCL_NAMESPACE_END
 #include "svm_noise.h"
 #include "svm_texture.h"
 
+#include "svm_math_util.h"
+
 #include "svm_attribute.h"
 #include "svm_gradient.h"
 #include "svm_blackbody.h"
@@ -164,7 +166,6 @@ CCL_NAMESPACE_END
 #include "svm_mapping.h"
 #include "svm_normal.h"
 #include "svm_wave.h"
-#include "svm_math_util.h"
 #include "svm_math.h"
 #include "svm_mix.h"
 #include "svm_ramp.h"
@@ -181,17 +182,20 @@ CCL_NAMESPACE_END
 
 CCL_NAMESPACE_BEGIN
 
-/* Main Interpreter Loop */
+#define NODES_GROUP(group) ((group) <= __NODES_MAX_GROUP__)
+#define NODES_FEATURE(feature) ((__NODES_FEATURES__ & (feature)) != 0)
 
+/* Main Interpreter Loop */
 ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ShaderType type, int path_flag)
 {
 	float stack[SVM_STACK_SIZE];
-	int offset = sd->shader & SHADER_MASK;
+	int offset = ccl_fetch(sd, shader) & SHADER_MASK;
 
 	while(1) {
 		uint4 node = read_node(kg, &offset);
 
 		switch(node.x) {
+#if NODES_GROUP(NODE_GROUP_LEVEL_0)
 			case NODE_SHADER_JUMP: {
 				if(type == SHADER_TYPE_SURFACE) offset = node.y;
 				else if(type == SHADER_TYPE_VOLUME) offset = node.z;
@@ -208,15 +212,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_CLOSURE_BACKGROUND:
 				svm_node_closure_background(sd, stack, node);
 				break;
-			case NODE_CLOSURE_HOLDOUT:
-				svm_node_closure_holdout(sd, stack, node);
-				break;
-			case NODE_CLOSURE_AMBIENT_OCCLUSION:
-				svm_node_closure_ambient_occlusion(sd, stack, node);
-				break;
-			case NODE_CLOSURE_VOLUME:
-				svm_node_closure_volume(kg, sd, stack, node, path_flag);
-				break;
 			case NODE_CLOSURE_SET_WEIGHT:
 				svm_node_closure_set_weight(sd, node.y, node.z, node.w);
 				break;
@@ -237,13 +232,137 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 				if(stack_load_float(stack, node.z) == 1.0f)
 					offset += node.y;
 				break;
-#ifdef __TEXTURES__
+			case NODE_GEOMETRY:
+				svm_node_geometry(kg, sd, stack, node.y, node.z);
+				break;
+			case NODE_CONVERT:
+				svm_node_convert(sd, stack, node.y, node.z, node.w);
+				break;
+			case NODE_TEX_COORD:
+				svm_node_tex_coord(kg, sd, path_flag, stack, node, &offset);
+				break;
+			case NODE_VALUE_F:
+				svm_node_value_f(kg, sd, stack, node.y, node.z);
+				break;
+			case NODE_VALUE_V:
+				svm_node_value_v(kg, sd, stack, node.y, &offset);
+				break;
+			case NODE_ATTR:
+				svm_node_attr(kg, sd, stack, node);
+				break;
+#  if NODES_FEATURE(NODE_FEATURE_BUMP)
+			case NODE_GEOMETRY_BUMP_DX:
+				svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z);
+				break;
+			case NODE_GEOMETRY_BUMP_DY:
+				svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z);
+				break;
+			case NODE_SET_DISPLACEMENT:
+				svm_node_set_displacement(sd, stack, node.y);
+				break;
+#  endif  /* NODES_FEATURE(NODE_FEATURE_BUMP) */
+#  ifdef __TEXTURES__
 			case NODE_TEX_IMAGE:
 				svm_node_tex_image(kg, sd, stack, node);
 				break;
 			case NODE_TEX_IMAGE_BOX:
 				svm_node_tex_image_box(kg, sd, stack, node);
 				break;
+			case NODE_TEX_NOISE:
+				svm_node_tex_noise(kg, sd, stack, node, &offset);
+				break;
+#  endif  /* __TEXTURES__ */
+#  ifdef __EXTRA_NODES__
+#    if NODES_FEATURE(NODE_FEATURE_BUMP)
+			case NODE_SET_BUMP:
+				svm_node_set_bump(kg, sd, stack, node);
+				break;
+			case NODE_ATTR_BUMP_DX:
+				svm_node_attr_bump_dx(kg, sd, stack, node);
+				break;
+			case NODE_ATTR_BUMP_DY:
+				svm_node_attr_bump_dy(kg, sd, stack, node);
+				break;
+			case NODE_TEX_COORD_BUMP_DX:
+				svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, &offset);
+				break;
+			case NODE_TEX_COORD_BUMP_DY:
+				svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, &offset);
+				break;
+			case NODE_CLOSURE_SET_NORMAL:
+				svm_node_set_normal(kg, sd, stack, node.y, node.z);
+				break;
+#    endif  /* NODES_FEATURE(NODE_FEATURE_BUMP) */
+			case NODE_HSV:
+				svm_node_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
+				break;
+#  endif  /* __EXTRA_NODES__ */
+#endif  /* NODES_GROUP(NODE_GROUP_LEVEL_0) */
+
+#if NODES_GROUP(NODE_GROUP_LEVEL_1)
+			case NODE_CLOSURE_HOLDOUT:
+				svm_node_closure_holdout(sd, stack, node);
+				break;
+			case NODE_CLOSURE_AMBIENT_OCCLUSION:
+				svm_node_closure_ambient_occlusion(sd, stack, node);
+				break;
+			case NODE_FRESNEL:
+				svm_node_fresnel(sd, stack, node.y, node.z, node.w);
+				break;
+			case NODE_LAYER_WEIGHT:
+				svm_node_layer_weight(sd, stack, node);
+				break;
+#  if NODES_FEATURE(NODE_FEATURE_VOLUME)
+			case NODE_CLOSURE_VOLUME:
+				svm_node_closure_volume(kg, sd, stack, node, path_flag);
+				break;
+#  endif  /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
+#  ifdef __EXTRA_NODES__
+			case NODE_MATH:
+				svm_node_math(kg, sd, stack, node.y, node.z, node.w, &offset);
+				break;
+			case NODE_VECTOR_MATH:
+				svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, &offset);
+				break;
+			case NODE_RGB_RAMP:
+				svm_node_rgb_ramp(kg, sd, stack, node, &offset);
+				break;
+			case NODE_GAMMA:
+				svm_node_gamma(sd, stack, node.y, node.z, node.w);
+				break;
+			case NODE_BRIGHTCONTRAST:
+				svm_node_brightness(sd, stack, node.y, node.z, node.w);
+				break;
+			case NODE_LIGHT_PATH:
+				svm_node_light_path(sd, stack, node.y, node.z, path_flag);
+				break;
+			case NODE_OBJECT_INFO:
+				svm_node_object_info(kg, sd, stack, node.y, node.z);
+				break;
+			case NODE_PARTICLE_INFO:
+				svm_node_particle_info(kg, sd, stack, node.y, node.z);
+				break;
+#    ifdef __HAIR__
+#      if NODES_FEATURE(NODE_FEATURE_HAIR)
+			case NODE_HAIR_INFO:
+				svm_node_hair_info(kg, sd, stack, node.y, node.z);
+				break;
+#      endif  /* NODES_FEATURE(NODE_FEATURE_HAIR) */
+#    endif  /* __HAIR__ */
+#  endif  /* __EXTRA_NODES__ */
+#endif  /* NODES_GROUP(NODE_GROUP_LEVEL_1) */
+
+#if NODES_GROUP(NODE_GROUP_LEVEL_2)
+			case NODE_MAPPING:
+				svm_node_mapping(kg, sd, stack, node.y, node.z, &offset);
+				break;
+			case NODE_MIN_MAX:
+				svm_node_min_max(kg, sd, stack, node.y, node.z, &offset);
+				break;
+			case NODE_CAMERA:
+				svm_node_camera(kg, sd, stack, node.y, node.z, node.w);
+				break;
+#  ifdef __TEXTURES__
 			case NODE_TEX_ENVIRONMENT:
 				svm_node_tex_environment(kg, sd, stack, node);
 				break;
@@ -253,9 +372,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_TEX_GRADIENT:
 				svm_node_tex_gradient(sd, stack, node);
 				break;
-			case NODE_TEX_NOISE:
-				svm_node_tex_noise(kg, sd, stack, node, &offset);
-				break;
 			case NODE_TEX_VORONOI:
 				svm_node_tex_voronoi(kg, sd, stack, node, &offset);
 				break;
@@ -274,55 +390,34 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_TEX_BRICK:
 				svm_node_tex_brick(kg, sd, stack, node, &offset);
 				break;
-#endif
-			case NODE_CAMERA:
-				svm_node_camera(kg, sd, stack, node.y, node.z, node.w);
-				break;
-			case NODE_GEOMETRY:
-				svm_node_geometry(kg, sd, stack, node.y, node.z);
-				break;
-#ifdef __EXTRA_NODES__
-			case NODE_GEOMETRY_BUMP_DX:
-				svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z);
-				break;
-			case NODE_GEOMETRY_BUMP_DY:
-				svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z);
-				break;
-			case NODE_LIGHT_PATH:
-				svm_node_light_path(sd, stack, node.y, node.z, path_flag);
-				break;
-			case NODE_OBJECT_INFO:
-				svm_node_object_info(kg, sd, stack, node.y, node.z);
-				break;
-			case NODE_PARTICLE_INFO:
-				svm_node_particle_info(kg, sd, stack, node.y, node.z);
+#  endif  /* __TEXTURES__ */
+#  ifdef __EXTRA_NODES__
+			case NODE_NORMAL:
+				svm_node_normal(kg, sd, stack, node.y, node.z, node.w, &offset);
 				break;
-#ifdef __HAIR__
-			case NODE_HAIR_INFO:
-				svm_node_hair_info(kg, sd, stack, node.y, node.z);
+			case NODE_LIGHT_FALLOFF:
+				svm_node_light_falloff(sd, stack, node);
 				break;
-#endif
+#  endif  /* __EXTRA_NODES__ */
+#endif  /* NODES_GROUP(NODE_GROUP_LEVEL_2) */
 
-#endif
-			case NODE_CONVERT:
-				svm_node_convert(sd, stack, node.y, node.z, node.w);
+#if NODES_GROUP(NODE_GROUP_LEVEL_3)
+			case NODE_RGB_CURVES:
+				svm_node_rgb_curves(kg, sd, stack, node, &offset);
 				break;
-			case NODE_VALUE_F:
-				svm_node_value_f(kg, sd, stack, node.y, node.z);
+			case NODE_VECTOR_CURVES:
+				svm_node_vector_curves(kg, sd, stack, node, &offset);
 				break;
-			case NODE_VALUE_V:
-				svm_node_value_v(kg, sd, stack, node.y, &offset);
+			case NODE_TANGENT:
+				svm_node_tangent(kg, sd, stack, node);
 				break;
-#ifdef __EXTRA_NODES__
+			case NODE_NORMAL_MAP:
+				svm_node_normal_map(kg, sd, stack, node);
+				break;
+#  ifdef __EXTRA_NODES__
 			case NODE_INVERT:
 				svm_node_invert(sd, stack, node.y, node.z, node.w);
 				break;
-			case NODE_GAMMA:
-				svm_node_gamma(sd, stack, node.y, node.z, node.w);
-				break;
-			case NODE_BRIGHTCONTRAST:
-				svm_node_brightness(sd, stack, node.y, node.z, node.w);
-				break;
 			case NODE_MIX:
 				svm_node_mix(kg, sd, stack, node.y, node.z, node.w, &offset);
 				break;
@@ -338,28 +433,9 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_COMBINE_HSV:
 				svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
 				break;
-			case NODE_HSV:
-				svm_node_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
-				break;
-#endif
-			case NODE_ATTR:
-				svm_node_attr(kg, sd, stack, node);
-				break;
-#ifdef __EXTRA_NODES__
-			case NODE_ATTR_BUMP_DX:
-				svm_node_attr_bump_dx(kg, sd, stack, node);
-				break;
-			case NODE_ATTR_BUMP_DY:
-				svm_node_attr_bump_dy(kg, sd, stack, node);
-				break;
-#endif
-			case NODE_FRESNEL:
-				svm_node_fresnel(sd, stack, node.y, node.z, node.w);
-				break;
-			case NODE_LAYER_WEIGHT:
-				svm_node_layer_weight(sd, stack, node);
+			case NODE_VECTOR_TRANSFORM:
+				svm_node_vector_transform(kg, sd, stack, node);
 				break;
-#ifdef __EXTRA_NODES__
 			case NODE_WIREFRAME:
 				svm_node_wireframe(kg, sd, stack, node);
 				break;
@@ -369,70 +445,20 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_BLACKBODY:
 				svm_node_blackbody(kg, sd, stack, node.y, node.z);
 				break;
-			case NODE_SET_DISPLACEMENT:
-				svm_node_set_displacement(sd, stack, node.y);
-				break;
-			case NODE_SET_BUMP:
-				svm_node_set_bump(kg, sd, stack, node);
-				break;
-			case NODE_MATH:
-				svm_node_math(kg, sd, stack, node.y, node.z, node.w, &offset);
-				break;
-			case NODE_VECTOR_MATH:
-				svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, &offset);
-				break;
-			case NODE_VECTOR_TRANSFORM:
-				svm_node_vector_transform(kg, sd, stack, node);
-				break;
-			case NODE_NORMAL:
-				svm_node_normal(kg, sd, stack, node.y, node.z, node.w, &offset);
-				break;
-#endif
-			case NODE_MAPPING:
-				svm_node_mapping(kg, sd, stack, node.y, node.z, &offset);
-				break;
-			case NODE_MIN_MAX:
-				svm_node_min_max(kg, sd, stack, node.y, node.z, &offset);
-				break;
-			case NODE_TEX_COORD:
-				svm_node_tex_coord(kg, sd, path_flag, stack, node, &offset);
-				break;
-#ifdef __EXTRA_NODES__
-			case NODE_TEX_COORD_BUMP_DX:
-				svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, &offset);
-				break;
-			case NODE_TEX_COORD_BUMP_DY:
-				svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, &offset);
-				break;
-			case NODE_CLOSURE_SET_NORMAL:
-				svm_node_set_normal(kg, sd, stack, node.y, node.z );
-				break;
-			case NODE_RGB_RAMP:
-				svm_node_rgb_ramp(kg, sd, stack, node, &offset);
-				break;
-			case NODE_RGB_CURVES:
-				svm_node_rgb_curves(kg, sd, stack, node, &offset);
-				break;
-			case NODE_VECTOR_CURVES:
-				svm_node_vector_curves(kg, sd, stack, node, &offset);
-				break;
-			case NODE_LIGHT_FALLOFF:
-				svm_node_light_falloff(sd, stack, node);
-				break;
-#endif
-			case NODE_TANGENT:
-				svm_node_tangent(kg, sd, stack, node);
-				break;
-			case NODE_NORMAL_MAP:
-				svm_node_normal_map(kg, sd, stack, node);
-				break;	
+#  endif  /* __EXTRA_NODES__ */
+#endif  /* NODES_GROUP(NODE_GROUP_LEVEL_3) */
 			case NODE_END:
+				return;
 			default:
+				kernel_assert(!"Unknown node type was passed to the SVM machine");
 				return;
 		}
 	}
 }
 
+#undef NODES_GROUP
+#undef NODES_FEATURE
+
 CCL_NAMESPACE_END
 
 #endif /* __SVM_H__ */
diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h
index b63978b6e1f..025ae96f59d 100644
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@@ -22,12 +22,12 @@ ccl_device void svm_node_attr_init(KernelGlobals *kg, ShaderData *sd,
 	uint4 node, NodeAttributeType *type,
 	NodeAttributeType *mesh_type, AttributeElement *elem, int *offset, uint *out_offset)
 {
-	if(sd->object != OBJECT_NONE) {
+	if(ccl_fetch(sd, object) != OBJECT_NONE) {
 		/* find attribute by unique id */
 		uint id = node.y;
-		uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
+		uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride;
 #ifdef __HAIR__
-		attr_offset = (sd->type & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
+		attr_offset = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
 #endif
 		uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 		
diff --git a/intern/cycles/kernel/svm/svm_blackbody.h b/intern/cycles/kernel/svm/svm_blackbody.h
index 1e40e868e14..b750ad87b7f 100644
--- a/intern/cycles/kernel/svm/svm_blackbody.h
+++ b/intern/cycles/kernel/svm/svm_blackbody.h
@@ -36,48 +36,12 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device void svm_node_blackbody(KernelGlobals *kg, ShaderData *sd, float *stack, uint temperature_offset, uint col_offset)
 {
-	/* Output */
-	float3 color_rgb = make_float3(0.0f, 0.0f, 0.0f);
-
 	/* Input */
 	float temperature = stack_load_float(stack, temperature_offset);
 
-	if (temperature < BB_DRAPER) {
-		/* just return very very dim red */
-		color_rgb = make_float3(1.0e-6f,0.0f,0.0f);
-	}
-	else if (temperature <= BB_MAX_TABLE_RANGE) {
-		/* This is the overall size of the table */
-		const int lookuptablesize = 956;
-		const float lookuptablenormalize = 1.0f/956.0f;
-
-		/* reconstruct a proper index for the table lookup, compared to OSL we don't look up two colors
-		just one (the OSL-lerp is also automatically done for us by "lookup_table_read") */
-		float t = powf((temperature - BB_DRAPER) * (1.0f / BB_TABLE_SPACING), (1.0f / BB_TABLE_XPOWER));
-
-		int blackbody_table_offset = kernel_data.tables.blackbody_offset;
-
-		/* Retrieve colors from the lookup table */
-		float lutval = t*lookuptablenormalize;
-		float R = lookup_table_read(kg, lutval, blackbody_table_offset, lookuptablesize);
-		lutval = (t + 319.0f*1.0f)*lookuptablenormalize;
-		float G = lookup_table_read(kg, lutval, blackbody_table_offset, lookuptablesize);
-		lutval = (t + 319.0f*2.0f)*lookuptablenormalize;
-		float B = lookup_table_read(kg, lutval, blackbody_table_offset, lookuptablesize);
-
-		R = powf(R, BB_TABLE_YPOWER);
-		G = powf(G, BB_TABLE_YPOWER);
-		B = powf(B, BB_TABLE_YPOWER);
-
-		color_rgb = make_float3(R, G, B);
-	}
-
-	/* Luminance */
-	float l = linear_rgb_to_gray(color_rgb);
-	if (l != 0.0f)
-		color_rgb /= l;
+	float3 color_rgb = svm_math_blackbody_color(temperature);
 
-	if (stack_valid(col_offset))
+	if(stack_valid(col_offset))
 		stack_store_float3(stack, col_offset, color_rgb);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_brick.h b/intern/cycles/kernel/svm/svm_brick.h
index 33a2a5c7598..fcf8f47b77e 100644
--- a/intern/cycles/kernel/svm/svm_brick.h
+++ b/intern/cycles/kernel/svm/svm_brick.h
@@ -47,7 +47,7 @@ ccl_device_noinline float2 svm_brick(float3 p, float mortar_size, float bias,
 	y = p.y - row_height*rownum;
 
 	return make_float2(
-		clamp((brick_noise((rownum << 16) + (bricknum & 0xFFFF)) + bias), 0.0f, 1.0f),
+		saturate((brick_noise((rownum << 16) + (bricknum & 0xFFFF)) + bias)),
 
 		(x < mortar_size || y < mortar_size ||
 		x > (brick_width - mortar_size) ||
diff --git a/intern/cycles/kernel/svm/svm_brightness.h b/intern/cycles/kernel/svm/svm_brightness.h
index 631bd1825ee..e4d545a00ae 100644
--- a/intern/cycles/kernel/svm/svm_brightness.h
+++ b/intern/cycles/kernel/svm/svm_brightness.h
@@ -32,7 +32,7 @@ ccl_device void svm_node_brightness(ShaderData *sd, float *stack, uint in_color,
 	color.y = max(a*color.y + b, 0.0f);
 	color.z = max(a*color.z + b, 0.0f);
 
-	if (stack_valid(out_color))
+	if(stack_valid(out_color))
 		stack_store_float3(stack, out_color, color);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h
index e03745cb331..00678a49d70 100644
--- a/intern/cycles/kernel/svm/svm_camera.h
+++ b/intern/cycles/kernel/svm/svm_camera.h
@@ -23,17 +23,17 @@ ccl_device void svm_node_camera(KernelGlobals *kg, ShaderData *sd, float *stack,
 	float3 vector;
 
 	Transform tfm = kernel_data.cam.worldtocamera;
-	vector = transform_point(&tfm, sd->P);
+	vector = transform_point(&tfm, ccl_fetch(sd, P));
 	zdepth = vector.z;
 	distance = len(vector);
 
-	if (stack_valid(out_vector))
+	if(stack_valid(out_vector))
 		stack_store_float3(stack, out_vector, normalize(vector));
 
-	if (stack_valid(out_zdepth))
+	if(stack_valid(out_zdepth))
 		stack_store_float(stack, out_zdepth, zdepth);
 
-	if (stack_valid(out_distance))
+	if(stack_valid(out_distance))
 		stack_store_float(stack, out_distance, distance);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 07ac7104e68..20a6cb8cd45 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -25,10 +25,13 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
 			sc->data0 = eta;
 			sc->data1 = 0.0f;
 			sc->data2 = 0.0f;
-			sd->flag |= bsdf_refraction_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_refraction_setup(sc);
+		}
+		else {
+			sc->data0 = 0.0f;
+			sc->data1 = 0.0f;
+			ccl_fetch(sd, flag) |= bsdf_reflection_setup(sc);
 		}
-		else
-			sd->flag |= bsdf_reflection_setup(sc);
 	}
 	else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) {
 		sc->data0 = roughness;
@@ -36,9 +39,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
 		sc->data2 = eta;
 
 		if(refract)
-			sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(sc);
 		else
-			sd->flag |= bsdf_microfacet_beckmann_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(sc);
 	}
 	else {
 		sc->data0 = roughness;
@@ -46,23 +49,23 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
 		sc->data2 = eta;
 
 		if(refract)
-			sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(sc);
 		else
-			sd->flag |= bsdf_microfacet_ggx_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(sc);
 	}
 }
 
 ccl_device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, ClosureType type, float mix_weight)
 {
-	ShaderClosure *sc = &sd->closure[sd->num_closure];
+	ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
 
-	if(sd->num_closure < MAX_CLOSURE) {
+	if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
 		sc->weight *= mix_weight;
 		sc->type = type;
 #ifdef __OSL__
 		sc->prim = NULL;
 #endif
-		sd->num_closure++;
+		ccl_fetch(sd, num_closure)++;
 		return sc;
 	}
 
@@ -71,14 +74,15 @@ ccl_device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, C
 
 ccl_device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float mix_weight)
 {
-	ShaderClosure *sc = &sd->closure[sd->num_closure];
+	ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
+
 	float3 weight = sc->weight * mix_weight;
 	float sample_weight = fabsf(average(weight));
 
-	if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) {
+	if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
 		sc->weight = weight;
 		sc->sample_weight = sample_weight;
-		sd->num_closure++;
+		ccl_fetch(sd, num_closure)++;
 #ifdef __OSL__
 		sc->prim = NULL;
 #endif
@@ -90,14 +94,15 @@ ccl_device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float
 
 ccl_device_inline ShaderClosure *svm_node_closure_get_absorption(ShaderData *sd, float mix_weight)
 {
-	ShaderClosure *sc = &sd->closure[sd->num_closure];
+	ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
+
 	float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - sc->weight) * mix_weight;
 	float sample_weight = fabsf(average(weight));
 
-	if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) {
+	if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
 		sc->weight = weight;
 		sc->sample_weight = sample_weight;
-		sd->num_closure++;
+		ccl_fetch(sd, num_closure)++;
 #ifdef __OSL__
 		sc->prim = NULL;
 #endif
@@ -121,7 +126,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 	if(mix_weight == 0.0f)
 		return;
 
-	float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): sd->N; 
+	float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): ccl_fetch(sd, N);
 
 	float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z);
 	float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w);
@@ -139,13 +144,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->data0 = 0.0f;
 					sc->data1 = 0.0f;
 					sc->data2 = 0.0f;
-					sd->flag |= bsdf_diffuse_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_diffuse_setup(sc);
 				}
 				else {
 					sc->data0 = roughness;
 					sc->data1 = 0.0f;
 					sc->data2 = 0.0f;
-					sd->flag |= bsdf_oren_nayar_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_oren_nayar_setup(sc);
 				}
 			}
 			break;
@@ -158,7 +163,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->data1 = 0.0f;
 				sc->data2 = 0.0f;
 				sc->N = N;
-				sd->flag |= bsdf_translucent_setup(sc);
+				ccl_fetch(sd, flag) |= bsdf_translucent_setup(sc);
 			}
 			break;
 		}
@@ -170,7 +175,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->data1 = 0.0f;
 				sc->data2 = 0.0f;
 				sc->N = N;
-				sd->flag |= bsdf_transparent_setup(sc);
+				ccl_fetch(sd, flag) |= bsdf_transparent_setup(sc);
 			}
 			break;
 		}
@@ -192,13 +197,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 				/* setup bsdf */
 				if(type == CLOSURE_BSDF_REFLECTION_ID)
-					sd->flag |= bsdf_reflection_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_reflection_setup(sc);
 				else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID)
-					sd->flag |= bsdf_microfacet_beckmann_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(sc);
 				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID)
-					sd->flag |= bsdf_microfacet_ggx_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(sc);
 				else
-					sd->flag |= bsdf_ashikhmin_shirley_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_setup(sc);
 			}
 
 			break;
@@ -216,7 +221,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->N = N;
 
 				float eta = fmaxf(param2, 1e-5f);
-				eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
+				eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
 
 				/* setup bsdf */
 				if(type == CLOSURE_BSDF_REFRACTION_ID) {
@@ -224,7 +229,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->data1 = 0.0f;
 					sc->data2 = 0.0f;
 
-					sd->flag |= bsdf_refraction_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_refraction_setup(sc);
 				}
 				else {
 					sc->data0 = param1;
@@ -232,9 +237,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->data2 = eta;
 
 					if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID)
-						sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(sc);
 					else
-						sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(sc);
 				}
 			}
 
@@ -251,15 +256,15 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 #endif
 			/* index of refraction */
 			float eta = fmaxf(param2, 1e-5f);
-			eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
+			eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
 
 			/* fresnel */
-			float cosNO = dot(N, sd->I);
+			float cosNO = dot(N, ccl_fetch(sd, I));
 			float fresnel = fresnel_dielectric_cos(cosNO, eta);
 			float roughness = param1;
 
 			/* reflection */
-			ShaderClosure *sc = &sd->closure[sd->num_closure];
+			ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
 			float3 weight = sc->weight;
 			float sample_weight = sc->sample_weight;
 
@@ -280,15 +285,17 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 #endif
 
 			/* refraction */
-			sc = &sd->closure[sd->num_closure];
-			sc->weight = weight;
-			sc->sample_weight = sample_weight;
+			if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
+				sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
+				sc->weight = weight;
+				sc->sample_weight = sample_weight;
 
-			sc = svm_node_closure_get_bsdf(sd, mix_weight*(1.0f - fresnel));
+				sc = svm_node_closure_get_bsdf(sd, mix_weight*(1.0f - fresnel));
 
-			if(sc) {
-				sc->N = N;
-				svm_node_glass_setup(sd, sc, type, eta, roughness, true);
+				if(sc) {
+					sc->N = N;
+					svm_node_glass_setup(sd, sc, type, eta, roughness, true);
+				}
 			}
 
 			break;
@@ -328,12 +335,12 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 				sc->data2 = 0.0f;
 
-				if (type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID)
-					sd->flag |= bsdf_microfacet_beckmann_aniso_setup(sc);
-				else if (type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID)
-					sd->flag |= bsdf_microfacet_ggx_aniso_setup(sc);
+				if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID)
+					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_aniso_setup(sc);
+				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID)
+					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_setup(sc);
 				else
-					sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_aniso_setup(sc);
 			}
 			break;
 		}
@@ -344,10 +351,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->N = N;
 
 				/* sigma */
-				sc->data0 = clamp(param1, 0.0f, 1.0f);
+				sc->data0 = saturate(param1);
 				sc->data1 = 0.0f;
 				sc->data2 = 0.0f;
-				sd->flag |= bsdf_ashikhmin_velvet_setup(sc);
+				ccl_fetch(sd, flag) |= bsdf_ashikhmin_velvet_setup(sc);
 			}
 			break;
 		}
@@ -362,10 +369,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->data1 = param2;
 				sc->data2 = 0.0f;
 				
-				if (type == CLOSURE_BSDF_DIFFUSE_TOON_ID)
-					sd->flag |= bsdf_diffuse_toon_setup(sc);
+				if(type == CLOSURE_BSDF_DIFFUSE_TOON_ID)
+					ccl_fetch(sd, flag) |= bsdf_diffuse_toon_setup(sc);
 				else
-					sd->flag |= bsdf_glossy_toon_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_glossy_toon_setup(sc);
 			}
 			break;
 		}
@@ -373,7 +380,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: {
 			
-			if(sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) {
+			if(ccl_fetch(sd, flag) & SD_BACKFACING && ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 				ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
 
 				if(sc) {
@@ -384,11 +391,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					 * spawned by transmission from the front */
 					sc->weight = make_float3(1.0f, 1.0f, 1.0f);
 					sc->N = N;
-					sd->flag |= bsdf_transparent_setup(sc);
+					sc->data0 = 0.0f;
+					sc->data1 = 0.0f;
+					ccl_fetch(sd, flag) |= bsdf_transparent_setup(sc);
 				}
 			}
 			else {
-				ShaderClosure *sc = &sd->closure[sd->num_closure];
+				ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
 				sc = svm_node_closure_get_bsdf(sd, mix_weight);
 
 				if(sc) {
@@ -397,18 +406,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->data1 = param2;
 					sc->data2 = -stack_load_float(stack, data_node.z);
 
-					if(!(sd->type & PRIMITIVE_ALL_CURVE)) {
-						sc->T = normalize(sd->dPdv);
+					if(!(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)) {
+						sc->T = normalize(ccl_fetch(sd, dPdv));
 						sc->data2 = 0.0f;
 					}
 					else
-						sc->T = normalize(sd->dPdu);
+						sc->T = normalize(ccl_fetch(sd, dPdu));
 
 					if(type == CLOSURE_BSDF_HAIR_REFLECTION_ID) {
-						sd->flag |= bsdf_hair_reflection_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_hair_reflection_setup(sc);
 					}
 					else {
-						sd->flag |= bsdf_hair_transmission_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_hair_transmission_setup(sc);
 					}
 				}
 			}
@@ -418,9 +427,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 #endif
 
 #ifdef __SUBSURFACE__
+#ifndef __SPLIT_KERNEL__
+#  define sc_next(sc) sc++
+#  else
+#  define sc_next(sc) sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure))
+#  endif
 		case CLOSURE_BSSRDF_CUBIC_ID:
 		case CLOSURE_BSSRDF_GAUSSIAN_ID: {
-			ShaderClosure *sc = &sd->closure[sd->num_closure];
+			ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
 			float3 weight = sc->weight * mix_weight;
 			float sample_weight = fabsf(average(weight));
 			
@@ -430,7 +444,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR)
 				param1 = 0.0f;
 
-			if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure+2 < MAX_CLOSURE) {
+			if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure)+2 < MAX_CLOSURE) {
 				/* radius * scale */
 				float3 radius = stack_load_float3(stack, data_node.z)*param1;
 				/* sharpness */
@@ -450,10 +464,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->prim = NULL;
 #endif
 					sc->N = N;
-					sd->flag |= bssrdf_setup(sc, (ClosureType)type);
+					ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type);
 
-					sd->num_closure++;
-					sc++;
+					ccl_fetch(sd, num_closure)++;
+					sc_next(sc);
 				}
 
 				if(fabsf(weight.y) > 0.0f) {
@@ -467,10 +481,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->prim = NULL;
 #endif
 					sc->N = N;
-					sd->flag |= bssrdf_setup(sc, (ClosureType)type);
+					ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type);
 
-					sd->num_closure++;
-					sc++;
+					ccl_fetch(sd, num_closure)++;
+					sc_next(sc);
 				}
 
 				if(fabsf(weight.z) > 0.0f) {
@@ -484,15 +498,16 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->prim = NULL;
 #endif
 					sc->N = N;
-					sd->flag |= bssrdf_setup(sc, (ClosureType)type);
+					ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type);
 
-					sd->num_closure++;
-					sc++;
+					ccl_fetch(sd, num_closure)++;
+					sc_next(sc);
 				}
 			}
 
 			break;
 		}
+#  undef sc_next
 #endif
 		default:
 			break;
@@ -520,7 +535,7 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float
 			ShaderClosure *sc = svm_node_closure_get_absorption(sd, mix_weight * density);
 
 			if(sc) {
-				sd->flag |= volume_absorption_setup(sc);
+				ccl_fetch(sd, flag) |= volume_absorption_setup(sc);
 			}
 			break;
 		}
@@ -529,7 +544,8 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float
 
 			if(sc) {
 				sc->data0 = param2; /* g */
-				sd->flag |= volume_henyey_greenstein_setup(sc);
+				sc->data1 = 0.0f;
+				ccl_fetch(sd, flag) |= volume_henyey_greenstein_setup(sc);
 			}
 			break;
 		}
@@ -554,7 +570,7 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no
 	else
 		svm_node_closure_get_non_bsdf(sd, CLOSURE_EMISSION_ID, 1.0f);
 
-	sd->flag |= SD_EMISSION;
+	ccl_fetch(sd, flag) |= SD_EMISSION;
 }
 
 ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
@@ -588,7 +604,7 @@ ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 nod
 	else
 		svm_node_closure_get_non_bsdf(sd, CLOSURE_HOLDOUT_ID, 1.0f);
 
-	sd->flag |= SD_HOLDOUT;
+	ccl_fetch(sd, flag) |= SD_HOLDOUT;
 }
 
 ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node)
@@ -606,15 +622,17 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack,
 	else
 		svm_node_closure_get_non_bsdf(sd, CLOSURE_AMBIENT_OCCLUSION_ID, 1.0f);
 
-	sd->flag |= SD_AO;
+	ccl_fetch(sd, flag) |= SD_AO;
 }
 
 /* Closure Nodes */
 
 ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight)
 {
-	if(sd->num_closure < MAX_CLOSURE)
-		sd->closure[sd->num_closure].weight = weight;
+	if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
+		sc->weight = weight;
+	}
 }
 
 ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b)
@@ -649,7 +667,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
 	decode_node_uchar4(node.y, &weight_offset, &in_weight_offset, &weight1_offset, &weight2_offset);
 
 	float weight = stack_load_float(stack, weight_offset);
-	weight = clamp(weight, 0.0f, 1.0f);
+	weight = saturate(weight);
 
 	float in_weight = (stack_valid(in_weight_offset))? stack_load_float(stack, in_weight_offset): 1.0f;
 
@@ -664,7 +682,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
 ccl_device void svm_node_set_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
 {
 	float3 normal = stack_load_float3(stack, in_direction);
-	sd->N = normal;
+	ccl_fetch(sd, N) = normal;
 	stack_store_float3(stack, out_normal, normal);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index 4a058905a93..8d4b07c9973 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -25,11 +25,11 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 	uint normal_offset, distance_offset, invert;
 	decode_node_uchar4(node.y, &normal_offset, &distance_offset, &invert, NULL);
 
-	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
+	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
 
 	/* get surface tangents from normal */
-	float3 Rx = cross(sd->dP.dy, normal_in);
-	float3 Ry = cross(normal_in, sd->dP.dx);
+	float3 Rx = cross(ccl_fetch(sd, dP).dy, normal_in);
+	float3 Ry = cross(normal_in, ccl_fetch(sd, dP).dx);
 
 	/* get bump values */
 	uint c_offset, x_offset, y_offset, strength_offset;
@@ -40,7 +40,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 	float h_y = stack_load_float(stack, y_offset);
 
 	/* compute surface gradient and determinant */
-	float det = dot(sd->dP.dx, Rx);
+	float det = dot(ccl_fetch(sd, dP).dx, Rx);
 	float3 surfgrad = (h_x - h_c)*Rx + (h_y - h_c)*Ry;
 
 	float absdet = fabsf(det);
@@ -65,7 +65,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 ccl_device void svm_node_set_displacement(ShaderData *sd, float *stack, uint fac_offset)
 {
 	float d = stack_load_float(stack, fac_offset);
-	sd->P += sd->N*d*0.1f; /* todo: get rid of this factor */
+	ccl_fetch(sd, P) += ccl_fetch(sd, N)*d*0.1f; /* todo: get rid of this factor */
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h
index 3703ec55015..23c97d80cb0 100644
--- a/intern/cycles/kernel/svm/svm_fresnel.h
+++ b/intern/cycles/kernel/svm/svm_fresnel.h
@@ -23,12 +23,12 @@ ccl_device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset,
 	uint normal_offset, out_offset;
 	decode_node_uchar4(node, &normal_offset, &out_offset, NULL, NULL);
 	float eta = (stack_valid(ior_offset))? stack_load_float(stack, ior_offset): __uint_as_float(ior_value);
-	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
+	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
 	
 	eta = fmaxf(eta, 1e-5f);
-	eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
+	eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
 
-	float f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
+	float f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);
 
 	stack_store_float(stack, out_offset, f);
 }
@@ -44,18 +44,18 @@ ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
 	decode_node_uchar4(node.w, &type, &normal_offset, &out_offset, NULL);
 
 	float blend = (stack_valid(blend_offset))? stack_load_float(stack, blend_offset): __uint_as_float(blend_value);
-	float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): sd->N;
+	float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
 
 	float f;
 
 	if(type == NODE_LAYER_WEIGHT_FRESNEL) {
 		float eta = fmaxf(1.0f - blend, 1e-5f);
-		eta = (sd->flag & SD_BACKFACING)? eta: 1.0f/eta;
+		eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? eta: 1.0f/eta;
 
-		f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
+		f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);
 	}
 	else {
-		f = fabsf(dot(sd->I, normal_in));
+		f = fabsf(dot(ccl_fetch(sd, I), normal_in));
 
 		if(blend != 0.5f) {
 			blend = clamp(blend, 0.0f, 1.0f-1e-5f);
diff --git a/intern/cycles/kernel/svm/svm_gamma.h b/intern/cycles/kernel/svm/svm_gamma.h
index 8bc59b9c673..b645ff3f0f9 100644
--- a/intern/cycles/kernel/svm/svm_gamma.h
+++ b/intern/cycles/kernel/svm/svm_gamma.h
@@ -21,14 +21,14 @@ ccl_device void svm_node_gamma(ShaderData *sd, float *stack, uint in_gamma, uint
 	float3 color = stack_load_float3(stack, in_color);
 	float gamma = stack_load_float(stack, in_gamma);
 
-	if (color.x > 0.0f)
+	if(color.x > 0.0f)
 		color.x = powf(color.x, gamma);
-	if (color.y > 0.0f)
+	if(color.y > 0.0f)
 		color.y = powf(color.y, gamma);
-	if (color.z > 0.0f)
+	if(color.z > 0.0f)
 		color.z = powf(color.z, gamma);
 
-	if (stack_valid(out_color))
+	if(stack_valid(out_color))
 		stack_store_float3(stack, out_color, color);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index efbefa77d28..bb06254c3a9 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -23,15 +23,15 @@ ccl_device void svm_node_geometry(KernelGlobals *kg, ShaderData *sd, float *stac
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = sd->P; break;
-		case NODE_GEOM_N: data = sd->N; break;
+		case NODE_GEOM_P: data = ccl_fetch(sd, P); break;
+		case NODE_GEOM_N: data = ccl_fetch(sd, N); break;
 #ifdef __DPDU__
 		case NODE_GEOM_T: data = primitive_tangent(kg, sd); break;
 #endif
-		case NODE_GEOM_I: data = sd->I; break;
-		case NODE_GEOM_Ng: data = sd->Ng; break;
+		case NODE_GEOM_I: data = ccl_fetch(sd, I); break;
+		case NODE_GEOM_Ng: data = ccl_fetch(sd, Ng); break;
 #ifdef __UV__
-		case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
+		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u), ccl_fetch(sd, v), 0.0f); break;
 #endif
 	}
 
@@ -44,8 +44,8 @@ ccl_device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, flo
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = sd->P + sd->dP.dx; break;
-		case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dx, sd->v + sd->dv.dx, 0.0f); break;
+		case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; break;
+		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dx, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dx, 0.0f); break;
 		default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
 	}
 
@@ -61,8 +61,8 @@ ccl_device void svm_node_geometry_bump_dy(KernelGlobals *kg, ShaderData *sd, flo
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = sd->P + sd->dP.dy; break;
-		case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dy, sd->v + sd->dv.dy, 0.0f); break;
+		case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; break;
+		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dy, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dy, 0.0f); break;
 		default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
 	}
 
@@ -83,9 +83,9 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s
 			stack_store_float3(stack, out_offset, object_location(kg, sd));
 			return;
 		}
-		case NODE_INFO_OB_INDEX: data = object_pass_id(kg, sd->object); break;
+		case NODE_INFO_OB_INDEX: data = object_pass_id(kg, ccl_fetch(sd, object)); break;
 		case NODE_INFO_MAT_INDEX: data = shader_pass_id(kg, sd); break;
-		case NODE_INFO_OB_RANDOM: data = object_random_number(kg, sd->object); break;
+		case NODE_INFO_OB_RANDOM: data = object_random_number(kg, ccl_fetch(sd, object)); break;
 		default: data = 0.0f; break;
 	}
 
@@ -98,44 +98,44 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg, ShaderData *sd, float
 {
 	switch(type) {
 		case NODE_INFO_PAR_INDEX: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float(stack, out_offset, particle_index(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_AGE: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float(stack, out_offset, particle_age(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_LIFETIME: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float(stack, out_offset, particle_lifetime(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_LOCATION: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float3(stack, out_offset, particle_location(kg, particle_id));
 			break;
 		}
 #if 0	/* XXX float4 currently not supported in SVM stack */
 		case NODE_INFO_PAR_ROTATION: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float4(stack, out_offset, particle_rotation(kg, particle_id));
 			break;
 		}
 #endif
 		case NODE_INFO_PAR_SIZE: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float(stack, out_offset, particle_size(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_VELOCITY: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float3(stack, out_offset, particle_velocity(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_ANGULAR_VELOCITY: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float3(stack, out_offset, particle_angular_velocity(kg, particle_id));
 			break;
 		}
@@ -153,7 +153,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *sta
 
 	switch(type) {
 		case NODE_INFO_CURVE_IS_STRAND: {
-			data = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
+			data = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) != 0;
 			stack_store_float(stack, out_offset, data);
 			break;
 		}
@@ -165,7 +165,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *sta
 			break;
 		}
 		/*case NODE_INFO_CURVE_FADE: {
-			data = sd->curve_transparency;
+			data = ccl_fetch(sd, curve_transparency);
 			stack_store_float(stack, out_offset, data);
 			break;
 		}*/
diff --git a/intern/cycles/kernel/svm/svm_gradient.h b/intern/cycles/kernel/svm/svm_gradient.h
index a5e385faddc..53d7b4f812c 100644
--- a/intern/cycles/kernel/svm/svm_gradient.h
+++ b/intern/cycles/kernel/svm/svm_gradient.h
@@ -66,7 +66,7 @@ ccl_device void svm_node_tex_gradient(ShaderData *sd, float *stack, uint4 node)
 	float3 co = stack_load_float3(stack, co_offset);
 
 	float f = svm_gradient(co, (NodeGradientType)type);
-	f = clamp(f, 0.0f, 1.0f);
+	f = saturate(f);
 
 	if(stack_valid(fac_offset))
 		stack_store_float(stack, fac_offset, f);
diff --git a/intern/cycles/kernel/svm/svm_hsv.h b/intern/cycles/kernel/svm/svm_hsv.h
index eeb4ba25e91..1f2cad60df7 100644
--- a/intern/cycles/kernel/svm/svm_hsv.h
+++ b/intern/cycles/kernel/svm/svm_hsv.h
@@ -46,12 +46,12 @@ ccl_device void svm_node_hsv(KernelGlobals *kg, ShaderData *sd, float *stack, ui
 	color.y = fac*color.y + (1.0f - fac)*in_color.y;
 	color.z = fac*color.z + (1.0f - fac)*in_color.z;
 
-	/* Clamp color to prevent negative values cauzed by oversaturation. */
+	/* Clamp color to prevent negative values caused by oversaturation. */
 	color.x = max(color.x, 0.0f);
 	color.y = max(color.y, 0.0f);
 	color.z = max(color.z, 0.0f);
 
-	if (stack_valid(out_color_offset))
+	if(stack_valid(out_color_offset))
 		stack_store_float3(stack, out_color_offset, color);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 4de69479bd9..caf0b37ba35 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -65,7 +65,7 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 
 	float4 r;
 	int ix, iy, nix, niy;
-	if (interpolation == INTERPOLATION_CLOSEST) {
+	if(interpolation == INTERPOLATION_CLOSEST) {
 		svm_image_texture_frac(x*width, &ix);
 		svm_image_texture_frac(y*height, &iy);
 
@@ -251,9 +251,9 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 		case 95: r = kernel_tex_image_interp(__tex_image_095, x, y); break;
 		case 96: r = kernel_tex_image_interp(__tex_image_096, x, y); break;
 		case 97: r = kernel_tex_image_interp(__tex_image_097, x, y); break;
-		case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break;
 
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
+		case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break;
 		case 99: r = kernel_tex_image_interp(__tex_image_099, x, y); break;
 		case 100: r = kernel_tex_image_interp(__tex_image_100, x, y); break;
 		case 101: r = kernel_tex_image_interp(__tex_image_101, x, y); break;
@@ -392,10 +392,10 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta
 ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	/* get object space normal */
-	float3 N = sd->N;
+	float3 N = ccl_fetch(sd, N);
 
-	N = sd->N;
-	if(sd->object != OBJECT_NONE)
+	N = ccl_fetch(sd, N);
+	if(ccl_fetch(sd, object) != OBJECT_NONE)
 		object_inverse_normal_transform(kg, sd, &N);
 
 	/* project from direction vector to barycentric coordinates in triangles */
@@ -433,17 +433,17 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
 		/* in case of blending, test for mixes between two textures */
 		if(N.z < (1.0f - limit)*(N.y + N.x)) {
 			weight.x = N.x/(N.x + N.y);
-			weight.x = clamp((weight.x - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f);
+			weight.x = saturate((weight.x - 0.5f*(1.0f - blend))/blend);
 			weight.y = 1.0f - weight.x;
 		}
 		else if(N.x < (1.0f - limit)*(N.y + N.z)) {
 			weight.y = N.y/(N.y + N.z);
-			weight.y = clamp((weight.y - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f);
+			weight.y = saturate((weight.y - 0.5f*(1.0f - blend))/blend);
 			weight.z = 1.0f - weight.y;
 		}
 		else if(N.y < (1.0f - limit)*(N.x + N.z)) {
 			weight.x = N.x/(N.x + N.z);
-			weight.x = clamp((weight.x - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f);
+			weight.x = saturate((weight.x - 0.5f*(1.0f - blend))/blend);
 			weight.z = 1.0f - weight.x;
 		}
 		else {
diff --git a/intern/cycles/kernel/svm/svm_invert.h b/intern/cycles/kernel/svm/svm_invert.h
index 152b49174e0..5ce858e2e5d 100644
--- a/intern/cycles/kernel/svm/svm_invert.h
+++ b/intern/cycles/kernel/svm/svm_invert.h
@@ -30,7 +30,7 @@ ccl_device void svm_node_invert(ShaderData *sd, float *stack, uint in_fac, uint
 	color.y = invert(color.y, factor);
 	color.z = invert(color.z, factor);
 
-	if (stack_valid(out_color))
+	if(stack_valid(out_color))
 		stack_store_float3(stack, out_color, color);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h
index 677d139c5d4..a235dd35224 100644
--- a/intern/cycles/kernel/svm/svm_light_path.h
+++ b/intern/cycles/kernel/svm/svm_light_path.h
@@ -31,10 +31,10 @@ ccl_device void svm_node_light_path(ShaderData *sd, float *stack, uint type, uin
 		case NODE_LP_reflection: info = (path_flag & PATH_RAY_REFLECT)? 1.0f: 0.0f; break;
 		case NODE_LP_transmission: info = (path_flag & PATH_RAY_TRANSMIT)? 1.0f: 0.0f; break;
 		case NODE_LP_volume_scatter: info = (path_flag & PATH_RAY_VOLUME_SCATTER)? 1.0f: 0.0f; break;
-		case NODE_LP_backfacing: info = (sd->flag & SD_BACKFACING)? 1.0f: 0.0f; break;
-		case NODE_LP_ray_length: info = sd->ray_length; break;
-		case NODE_LP_ray_depth: info = (float)sd->ray_depth; break;
-		case NODE_LP_ray_transparent: info = sd->transparent_depth; break;
+		case NODE_LP_backfacing: info = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f: 0.0f; break;
+		case NODE_LP_ray_length: info = ccl_fetch(sd, ray_length); break;
+		case NODE_LP_ray_depth: info = (float)ccl_fetch(sd, ray_depth); break;
+		case NODE_LP_ray_transparent: info = (float)ccl_fetch(sd, transparent_depth); break;
 	}
 
 	stack_store_float(stack, out_offset, info);
@@ -53,14 +53,14 @@ ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
 
 	switch(type) {
 		case NODE_LIGHT_FALLOFF_QUADRATIC: break;
-		case NODE_LIGHT_FALLOFF_LINEAR: strength *= sd->ray_length; break;
-		case NODE_LIGHT_FALLOFF_CONSTANT: strength *= sd->ray_length*sd->ray_length; break;
+		case NODE_LIGHT_FALLOFF_LINEAR: strength *= ccl_fetch(sd, ray_length); break;
+		case NODE_LIGHT_FALLOFF_CONSTANT: strength *= ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); break;
 	}
 
 	float smooth = stack_load_float(stack, smooth_offset);
 
 	if(smooth > 0.0f) {
-		float squared = sd->ray_length*sd->ray_length;
+		float squared = ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length);
 		strength *= squared/(smooth + squared);
 	}
 
diff --git a/intern/cycles/kernel/svm/svm_math_util.h b/intern/cycles/kernel/svm/svm_math_util.h
index 39cc14d5e8e..645cbd3fc73 100644
--- a/intern/cycles/kernel/svm/svm_math_util.h
+++ b/intern/cycles/kernel/svm/svm_math_util.h
@@ -97,12 +97,74 @@ ccl_device float svm_math(NodeMath type, float Fac1, float Fac2)
 	else if(type == NODE_MATH_ABSOLUTE)
 		Fac = fabsf(Fac1);
 	else if(type == NODE_MATH_CLAMP)
-		Fac = clamp(Fac1, 0.0f, 1.0f);
+		Fac = saturate(Fac1);
 	else
 		Fac = 0.0f;
 	
 	return Fac;
 }
 
+ccl_device float3 svm_math_blackbody_color(float t) {
+	/* Calculate color in range 800..12000 using an approximation
+	 * a/x+bx+c for R and G and ((at + b)t + c)t + d) for B
+	 * Max absolute error for RGB is (0.00095, 0.00077, 0.00057),
+	 * which is enough to get the same 8 bit/channel color.
+	 */
+
+	const float rc[6][3] = {
+		{  2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f },
+		{  3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f },
+		{  4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f },
+		{  4.66849800e+03f,  2.85655028e-05f, 1.29075375e-01f },
+		{  4.60124770e+03f,  2.89727618e-05f, 1.48001316e-01f },
+		{  3.78765709e+03f,  9.36026367e-06f, 3.98995841e-01f },
+	};
+
+	const float gc[6][3] = {
+		{ -7.50343014e+02f,  3.15679613e-04f, 4.73464526e-01f },
+		{ -1.00402363e+03f,  1.29189794e-04f, 9.08181524e-01f },
+		{ -1.22075471e+03f,  2.56245413e-05f, 1.20753416e+00f },
+		{ -1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f },
+		{ -1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f },
+		{ -5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f },
+	};
+
+	const float bc[6][4] = {
+		{ 0.0f, 0.0f, 0.0f, 0.0f }, /* zeros should be optimized by compiler */
+		{ 0.0f, 0.0f, 0.0f, 0.0f },
+		{ 0.0f, 0.0f, 0.0f, 0.0f },
+		{ -2.02524603e-11f,  1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f },
+		{ -2.22463426e-13f, -1.55078698e-08f,  3.81675160e-04f, -7.30646033e-01f },
+		{  6.72595954e-13f, -2.73059993e-08f,  4.24068546e-04f, -7.52204323e-01f },
+	};
+
+	if(t >= 12000.0f)
+		return make_float3(0.826270103f, 0.994478524f, 1.56626022f);
+
+	/* Define a macro to reduce stack usage for nvcc */
+#define MAKE_BB_RGB(i) make_float3(\
+		rc[i][0] / t + rc[i][1] * t + rc[i][2],\
+		gc[i][0] / t + gc[i][1] * t + gc[i][2],\
+		((bc[i][0] * t + bc[i][1]) * t + bc[i][2]) * t + bc[i][3])
+
+	if(t >= 6365.0f)
+		return MAKE_BB_RGB(5);
+	if(t >= 3315.0f)
+		return MAKE_BB_RGB(4);
+	if(t >= 1902.0f)
+		return MAKE_BB_RGB(3);
+	if(t >= 1449.0f)
+		return MAKE_BB_RGB(2);
+	if(t >= 1167.0f)
+		return MAKE_BB_RGB(1);
+	if(t >= 965.0f)
+		return MAKE_BB_RGB(0);
+
+#undef MAKE_BB_RGB
+
+	/* For 800 <= t < 965 color does not change in OSL implementation, so keep color the same */
+	return make_float3(4.70366907f, 0.0f, 0.0f);
+}
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/svm/svm_mix.h b/intern/cycles/kernel/svm/svm_mix.h
index b6b1966cb3b..6111214acba 100644
--- a/intern/cycles/kernel/svm/svm_mix.h
+++ b/intern/cycles/kernel/svm/svm_mix.h
@@ -254,16 +254,16 @@ ccl_device float3 svm_mix_clamp(float3 col)
 {
 	float3 outcol = col;
 
-	outcol.x = clamp(col.x, 0.0f, 1.0f);
-	outcol.y = clamp(col.y, 0.0f, 1.0f);
-	outcol.z = clamp(col.z, 0.0f, 1.0f);
+	outcol.x = saturate(col.x);
+	outcol.y = saturate(col.y);
+	outcol.z = saturate(col.z);
 
 	return outcol;
 }
 
 ccl_device float3 svm_mix(NodeMix type, float fac, float3 c1, float3 c2)
 {
-	float t = clamp(fac, 0.0f, 1.0f);
+	float t = saturate(fac);
 
 	switch(type) {
 		case NODE_MIX_BLEND: return svm_mix_blend(t, c1, c2);
diff --git a/intern/cycles/kernel/svm/svm_musgrave.h b/intern/cycles/kernel/svm/svm_musgrave.h
index 2f81ddaa74c..09eba31945e 100644
--- a/intern/cycles/kernel/svm/svm_musgrave.h
+++ b/intern/cycles/kernel/svm/svm_musgrave.h
@@ -25,7 +25,7 @@ CCL_NAMESPACE_BEGIN
  * from "Texturing and Modelling: A procedural approach"
  */
 
-ccl_device_noinline float noise_musgrave_fBm(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves)
+ccl_device_noinline float noise_musgrave_fBm(float3 p, float H, float lacunarity, float octaves)
 {
 	float rmd;
 	float value = 0.0f;
@@ -53,7 +53,7 @@ ccl_device_noinline float noise_musgrave_fBm(float3 p, NodeNoiseBasis basis, flo
  * octaves: number of frequencies in the fBm
  */
 
-ccl_device_noinline float noise_musgrave_multi_fractal(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves)
+ccl_device_noinline float noise_musgrave_multi_fractal(float3 p, float H, float lacunarity, float octaves)
 {
 	float rmd;
 	float value = 1.0f;
@@ -82,7 +82,7 @@ ccl_device_noinline float noise_musgrave_multi_fractal(float3 p, NodeNoiseBasis
  * offset: raises the terrain from `sea level'
  */
 
-ccl_device_noinline float noise_musgrave_hetero_terrain(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves, float offset)
+ccl_device_noinline float noise_musgrave_hetero_terrain(float3 p, float H, float lacunarity, float octaves, float offset)
 {
 	float value, increment, rmd;
 	float pwHL = powf(lacunarity, -H);
@@ -117,7 +117,7 @@ ccl_device_noinline float noise_musgrave_hetero_terrain(float3 p, NodeNoiseBasis
  * offset: raises the terrain from `sea level'
  */
 
-ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves, float offset, float gain)
+ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(float3 p, float H, float lacunarity, float octaves, float offset, float gain)
 {
 	float result, signal, weight, rmd;
 	float pwHL = powf(lacunarity, -H);
@@ -154,7 +154,7 @@ ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(float3 p, NodeNois
  * offset: raises the terrain from `sea level'
  */
 
-ccl_device_noinline float noise_musgrave_ridged_multi_fractal(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves, float offset, float gain)
+ccl_device_noinline float noise_musgrave_ridged_multi_fractal(float3 p, float H, float lacunarity, float octaves, float offset, float gain)
 {
 	float result, signal, weight;
 	float pwHL = powf(lacunarity, -H);
@@ -168,7 +168,7 @@ ccl_device_noinline float noise_musgrave_ridged_multi_fractal(float3 p, NodeNois
 
 	for(i = 1; i < float_to_int(octaves); i++) {
 		p *= lacunarity;
-		weight = clamp(signal * gain, 0.0f, 1.0f);
+		weight = saturate(signal * gain);
 		signal = offset - fabsf(snoise(p));
 		signal *= signal;
 		signal *= weight;
@@ -183,18 +183,16 @@ ccl_device_noinline float noise_musgrave_ridged_multi_fractal(float3 p, NodeNois
 
 ccl_device float svm_musgrave(NodeMusgraveType type, float dimension, float lacunarity, float octaves, float offset, float intensity, float gain, float3 p)
 {
-	NodeNoiseBasis basis = NODE_NOISE_PERLIN;
-
 	if(type == NODE_MUSGRAVE_MULTIFRACTAL)
-		return intensity*noise_musgrave_multi_fractal(p, basis, dimension, lacunarity, octaves);
+		return intensity*noise_musgrave_multi_fractal(p, dimension, lacunarity, octaves);
 	else if(type == NODE_MUSGRAVE_FBM)
-		return intensity*noise_musgrave_fBm(p, basis, dimension, lacunarity, octaves);
+		return intensity*noise_musgrave_fBm(p, dimension, lacunarity, octaves);
 	else if(type == NODE_MUSGRAVE_HYBRID_MULTIFRACTAL)
-		return intensity*noise_musgrave_hybrid_multi_fractal(p, basis, dimension, lacunarity, octaves, offset, gain);
+		return intensity*noise_musgrave_hybrid_multi_fractal(p, dimension, lacunarity, octaves, offset, gain);
 	else if(type == NODE_MUSGRAVE_RIDGED_MULTIFRACTAL)
-		return intensity*noise_musgrave_ridged_multi_fractal(p, basis, dimension, lacunarity, octaves, offset, gain);
+		return intensity*noise_musgrave_ridged_multi_fractal(p, dimension, lacunarity, octaves, offset, gain);
 	else if(type == NODE_MUSGRAVE_HETERO_TERRAIN)
-		return intensity*noise_musgrave_hetero_terrain(p, basis, dimension, lacunarity, octaves, offset);
+		return intensity*noise_musgrave_hetero_terrain(p, dimension, lacunarity, octaves, offset);
 	
 	return 0.0f;
 }
diff --git a/intern/cycles/kernel/svm/svm_noisetex.h b/intern/cycles/kernel/svm/svm_noisetex.h
index eccd119b74f..62ff38cf1c5 100644
--- a/intern/cycles/kernel/svm/svm_noisetex.h
+++ b/intern/cycles/kernel/svm/svm_noisetex.h
@@ -20,23 +20,22 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline void svm_noise(float3 p, float detail, float distortion, float *fac, float3 *color)
 {
-	NodeNoiseBasis basis = NODE_NOISE_PERLIN;
 	int hard = 0;
 
 	if(distortion != 0.0f) {
 		float3 r, offset = make_float3(13.5f, 13.5f, 13.5f);
 
-		r.x = noise_basis(p + offset, basis) * distortion;
-		r.y = noise_basis(p, basis) * distortion;
-		r.z = noise_basis(p - offset, basis) * distortion;
+		r.x = noise(p + offset) * distortion;
+		r.y = noise(p) * distortion;
+		r.z = noise(p - offset) * distortion;
 
 		p += r;
 	}
 
-	*fac = noise_turbulence(p, basis, detail, hard);
+	*fac = noise_turbulence(p, detail, hard);
 	*color = make_float3(*fac,
-		noise_turbulence(make_float3(p.y, p.x, p.z), basis, detail, hard),
-		noise_turbulence(make_float3(p.y, p.z, p.x), basis, detail, hard));
+		noise_turbulence(make_float3(p.y, p.x, p.z), detail, hard),
+		noise_turbulence(make_float3(p.y, p.z, p.x), detail, hard));
 }
 
 ccl_device void svm_node_tex_noise(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
diff --git a/intern/cycles/kernel/svm/svm_normal.h b/intern/cycles/kernel/svm/svm_normal.h
index 67b5321e0de..53abef71012 100644
--- a/intern/cycles/kernel/svm/svm_normal.h
+++ b/intern/cycles/kernel/svm/svm_normal.h
@@ -28,10 +28,10 @@ ccl_device void svm_node_normal(KernelGlobals *kg, ShaderData *sd, float *stack,
 	direction.z = __int_as_float(node1.z);
 	direction = normalize(direction);
 
-	if (stack_valid(out_normal_offset))
+	if(stack_valid(out_normal_offset))
 		stack_store_float3(stack, out_normal_offset, direction);
 
-	if (stack_valid(out_dot_offset))
+	if(stack_valid(out_dot_offset))
 		stack_store_float(stack, out_dot_offset, dot(direction, normalize(normal)));
 }
 
diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h
index 998f649a571..062ab013b1f 100644
--- a/intern/cycles/kernel/svm/svm_ramp.h
+++ b/intern/cycles/kernel/svm/svm_ramp.h
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device float4 rgb_ramp_lookup(KernelGlobals *kg, int offset, float f, bool interpolate)
 {
-	f = clamp(f, 0.0f, 1.0f)*(RAMP_TABLE_SIZE-1);
+	f = saturate(f)*(RAMP_TABLE_SIZE-1);
 
 	/* clamp int as well in case of NaN */
 	int i = clamp(float_to_int(f), 0, RAMP_TABLE_SIZE-1);
diff --git a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
index 68f9fea02f0..6f51b163756 100644
--- a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
+++ b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
@@ -28,7 +28,7 @@ ccl_device void svm_node_combine_hsv(KernelGlobals *kg, ShaderData *sd, float *s
 	/* Combine, and convert back to RGB */
 	float3 color = hsv_to_rgb(make_float3(hue, saturation, value));
 
-	if (stack_valid(color_out))
+	if(stack_valid(color_out))
 		stack_store_float3(stack, color_out, color);
 }
 
@@ -42,11 +42,11 @@ ccl_device void svm_node_separate_hsv(KernelGlobals *kg, ShaderData *sd, float *
 	/* Convert to HSV */
 	color = rgb_to_hsv(color);
 
-	if (stack_valid(hue_out))
+	if(stack_valid(hue_out))
 		stack_store_float(stack, hue_out, color.x);
-	if (stack_valid(saturation_out))
+	if(stack_valid(saturation_out))
 		stack_store_float(stack, saturation_out, color.y);
-	if (stack_valid(value_out))
+	if(stack_valid(value_out))
 		stack_store_float(stack, value_out, color.z);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_sepcomb_vector.h b/intern/cycles/kernel/svm/svm_sepcomb_vector.h
index 7a5a69f6dff..63570dd6942 100644
--- a/intern/cycles/kernel/svm/svm_sepcomb_vector.h
+++ b/intern/cycles/kernel/svm/svm_sepcomb_vector.h
@@ -22,7 +22,7 @@ ccl_device void svm_node_combine_vector(ShaderData *sd, float *stack, uint in_of
 {
 	float vector = stack_load_float(stack, in_offset);
 
-	if (stack_valid(out_offset))
+	if(stack_valid(out_offset))
 		stack_store_float(stack, out_offset+vector_index, vector);
 }
 
@@ -30,10 +30,10 @@ ccl_device void svm_node_separate_vector(ShaderData *sd, float *stack, uint ivec
 {
 	float3 vector = stack_load_float3(stack, ivector_offset);
 
-	if (stack_valid(out_offset)) {
-		if (vector_index == 0)
+	if(stack_valid(out_offset)) {
+		if(vector_index == 0)
 			stack_store_float(stack, out_offset, vector.x);
-		else if (vector_index == 1)
+		else if(vector_index == 1)
 			stack_store_float(stack, out_offset, vector.y);
 		else
 			stack_store_float(stack, out_offset, vector.z);
diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h
index a399acf3c0f..eebd9bee420 100644
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -31,9 +31,9 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = sd->P;
+			data = ccl_fetch(sd, P);
 			if(node.w == 0) {
-				if(sd->object != OBJECT_NONE) {
+				if(ccl_fetch(sd, object) != OBJECT_NONE) {
 					object_inverse_position_transform(kg, sd, &data);
 				}
 			}
@@ -48,48 +48,48 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = sd->N;
-			if(sd->object != OBJECT_NONE)
+			data = ccl_fetch(sd, N);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(sd->object != OBJECT_NONE)
-				data = transform_point(&tfm, sd->P);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = transform_point(&tfm, ccl_fetch(sd, P));
 			else
-				data = transform_point(&tfm, sd->P + camera_position(kg));
+				data = transform_point(&tfm, ccl_fetch(sd, P) + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, sd->ray_P);
+			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P));
 			else
-				data = camera_world_to_ndc(kg, sd, sd->P);
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P));
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(sd->object != OBJECT_NONE)
-				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
 			else
-				data = sd->I;
+				data = ccl_fetch(sd, I);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, sd->object);
+			data = object_dupli_generated(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, sd->object);
+			data = object_dupli_uv(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = sd->P;
+			data = ccl_fetch(sd, P);
 
 #ifdef __VOLUME__
-			if(sd->object != OBJECT_NONE)
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -113,9 +113,9 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = sd->P + sd->dP.dx;
+			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;
 			if(node.w == 0) {
-				if(sd->object != OBJECT_NONE) {
+				if(ccl_fetch(sd, object) != OBJECT_NONE) {
 					object_inverse_position_transform(kg, sd, &data);
 				}
 			}
@@ -130,48 +130,48 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = sd->N;
-			if(sd->object != OBJECT_NONE)
+			data = ccl_fetch(sd, N);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(sd->object != OBJECT_NONE)
-				data = transform_point(&tfm, sd->P + sd->dP.dx);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
 			else
-				data = transform_point(&tfm, sd->P + sd->dP.dx + camera_position(kg));
+				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx);
+			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dx);
 			else
-				data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx);
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(sd->object != OBJECT_NONE)
-				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
 			else
-				data = sd->I;
+				data = ccl_fetch(sd, I);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, sd->object);
+			data = object_dupli_generated(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, sd->object);
+			data = object_dupli_uv(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = sd->P + sd->dP.dx;
+			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;
 
 #ifdef __VOLUME__
-			if(sd->object != OBJECT_NONE)
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -198,9 +198,9 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = sd->P + sd->dP.dy;
+			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;
 			if(node.w == 0) {
-				if(sd->object != OBJECT_NONE) {
+				if(ccl_fetch(sd, object) != OBJECT_NONE) {
 					object_inverse_position_transform(kg, sd, &data);
 				}
 			}
@@ -215,48 +215,48 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = sd->N;
-			if(sd->object != OBJECT_NONE)
+			data = ccl_fetch(sd, N);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(sd->object != OBJECT_NONE)
-				data = transform_point(&tfm, sd->P + sd->dP.dy);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
 			else
-				data = transform_point(&tfm, sd->P + sd->dP.dy + camera_position(kg));
+				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy);
+			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dy);
 			else
-				data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy);
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(sd->object != OBJECT_NONE)
-				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
 			else
-				data = sd->I;
+				data = ccl_fetch(sd, I);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, sd->object);
+			data = object_dupli_generated(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, sd->object);
+			data = object_dupli_uv(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = sd->P + sd->dP.dy;
+			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;
 
 #ifdef __VOLUME__
-			if(sd->object != OBJECT_NONE)
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -281,7 +281,7 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 
 	if(space == NODE_NORMAL_MAP_TANGENT) {
 		/* tangent space */
-		if(sd->object == OBJECT_NONE) {
+		if(ccl_fetch(sd, object) == OBJECT_NONE) {
 			stack_store_float3(stack, normal_offset, make_float3(0.0f, 0.0f, 0.0f));
 			return;
 		}
@@ -302,11 +302,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 		float sign = primitive_attribute_float(kg, sd, attr_sign_elem, attr_sign_offset, NULL, NULL);
 		float3 normal;
 
-		if(sd->shader & SHADER_SMOOTH_NORMAL) {
+		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
 			normal = primitive_attribute_float3(kg, sd, attr_normal_elem, attr_normal_offset, NULL, NULL);
 		}
 		else {
-			normal = sd->Ng;
+			normal = ccl_fetch(sd, Ng);
 			object_inverse_normal_transform(kg, sd, &normal);
 		}
 
@@ -337,7 +337,7 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 
 	if(strength != 1.0f) {
 		strength = max(strength, 0.0f);
-		N = normalize(sd->N + (N - sd->N)*strength);
+		N = normalize(ccl_fetch(sd, N) + (N - ccl_fetch(sd, N))*strength);
 	}
 
 	stack_store_float3(stack, normal_offset, N);
@@ -367,7 +367,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
 		float3 generated;
 
 		if(attr_offset == ATTR_STD_NOT_FOUND)
-			generated = sd->P;
+			generated = ccl_fetch(sd, P);
 		else
 			generated = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL);
 
@@ -380,7 +380,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
 	}
 
 	object_normal_transform(kg, sd, &tangent);
-	tangent = cross(sd->N, normalize(cross(tangent, sd->N)));
+	tangent = cross(ccl_fetch(sd, N), normalize(cross(tangent, ccl_fetch(sd, N))));
 	stack_store_float3(stack, tangent_offset, tangent);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_texture.h b/intern/cycles/kernel/svm/svm_texture.h
index c5dc213c82d..dcb00f7dd55 100644
--- a/intern/cycles/kernel/svm/svm_texture.h
+++ b/intern/cycles/kernel/svm/svm_texture.h
@@ -16,261 +16,9 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Voronoi Distances */
-
-#if 0
-ccl_device float voronoi_distance(NodeDistanceMetric distance_metric, float3 d, float e)
-{
-#if 0
-	if(distance_metric == NODE_VORONOI_DISTANCE_SQUARED)
-#endif
-		return dot(d, d);
-#if 0
-	if(distance_metric == NODE_VORONOI_ACTUAL_DISTANCE)
-		return len(d);
-	if(distance_metric == NODE_VORONOI_MANHATTAN)
-		return fabsf(d.x) + fabsf(d.y) + fabsf(d.z);
-	if(distance_metric == NODE_VORONOI_CHEBYCHEV)
-		return fmaxf(fabsf(d.x), fmaxf(fabsf(d.y), fabsf(d.z)));
-	if(distance_metric == NODE_VORONOI_MINKOVSKY_H)
-		return sqrtf(fabsf(d.x)) + sqrtf(fabsf(d.y)) + sqrtf(fabsf(d.y));
-	if(distance_metric == NODE_VORONOI_MINKOVSKY_4)
-		return sqrtf(sqrtf(dot(d*d, d*d)));
-	if(distance_metric == NODE_VORONOI_MINKOVSKY)
-		return powf(powf(fabsf(d.x), e) + powf(fabsf(d.y), e) + powf(fabsf(d.z), e), 1.0f/e);
-	
-	return 0.0f;
-#endif
-}
-
-/* Voronoi / Worley like */
-ccl_device_inline float4 voronoi_Fn(float3 p, float e, int n1, int n2)
-{
-	float da[4];
-	float3 pa[4];
-	NodeDistanceMetric distance_metric = NODE_VORONOI_DISTANCE_SQUARED;
-
-	/* returns distances in da and point coords in pa */
-	int xx, yy, zz, xi, yi, zi;
-
-	xi = floor_to_int(p.x);
-	yi = floor_to_int(p.y);
-	zi = floor_to_int(p.z);
-
-	da[0] = 1e10f;
-	da[1] = 1e10f;
-	da[2] = 1e10f;
-	da[3] = 1e10f;
-
-	pa[0] = make_float3(0.0f, 0.0f, 0.0f);
-	pa[1] = make_float3(0.0f, 0.0f, 0.0f);
-	pa[2] = make_float3(0.0f, 0.0f, 0.0f);
-	pa[3] = make_float3(0.0f, 0.0f, 0.0f);
-
-	for(xx = xi-1; xx <= xi+1; xx++) {
-		for(yy = yi-1; yy <= yi+1; yy++) {
-			for(zz = zi-1; zz <= zi+1; zz++) {
-				float3 ip = make_float3((float)xx, (float)yy, (float)zz);
-				float3 vp = cellnoise_color(ip);
-				float3 pd = p - (vp + ip);
-				float d = voronoi_distance(distance_metric, pd, e);
-
-				vp += ip;
-
-				if(d < da[0]) {
-					da[3] = da[2];
-					da[2] = da[1];
-					da[1] = da[0];
-					da[0] = d;
-
-					pa[3] = pa[2];
-					pa[2] = pa[1];
-					pa[1] = pa[0];
-					pa[0] = vp;
-				}
-				else if(d < da[1]) {
-					da[3] = da[2];
-					da[2] = da[1];
-					da[1] = d;
-
-					pa[3] = pa[2];
-					pa[2] = pa[1];
-					pa[1] = vp;
-				}
-				else if(d < da[2]) {
-					da[3] = da[2];
-					da[2] = d;
-
-					pa[3] = pa[2];
-					pa[2] = vp;
-				}
-				else if(d < da[3]) {
-					da[3] = d;
-					pa[3] = vp;
-				}
-			}
-		}
-	}
-
-	float4 result = make_float4(pa[n1].x, pa[n1].y, pa[n1].z, da[n1]);
-
-	if(n2 != -1)
-		result = make_float4(pa[n2].x, pa[n2].y, pa[n2].z, da[n2]) - result;
-
-	return result;
-}
-#endif
-
-ccl_device float voronoi_F1_distance(float3 p)
-{
-	/* returns squared distance in da */
-	float da = 1e10f;
-
-#ifndef __KERNEL_SSE2__
-	int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z);
-
-	for (int xx = -1; xx <= 1; xx++) {
-		for (int yy = -1; yy <= 1; yy++) {
-			for (int zz = -1; zz <= 1; zz++) {
-				float3 ip = make_float3(ix + xx, iy + yy, iz + zz);
-				float3 vp = ip + cellnoise_color(ip);
-				float d = len_squared(p - vp);
-				da = min(d, da);
-			}
-		}
-	}
-#else
-	ssef vec_p = load4f(p);
-	ssei xyzi = quick_floor_sse(vec_p);
-
-	for (int xx = -1; xx <= 1; xx++) {
-		for (int yy = -1; yy <= 1; yy++) {
-			for (int zz = -1; zz <= 1; zz++) {
-				ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0));
-				ssef vp = ip + cellnoise_color(ip);
-				float d = len_squared<1, 1, 1, 0>(vec_p - vp);
-				da = min(d, da);
-			}
-		}
-	}
-#endif
-
-	return da;
-}
-
-ccl_device float3 voronoi_F1_color(float3 p)
-{
-	/* returns color of the nearest point */
-	float da = 1e10f;
-
-#ifndef __KERNEL_SSE2__
-	float3 pa;
-	int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z);
-
-	for (int xx = -1; xx <= 1; xx++) {
-		for (int yy = -1; yy <= 1; yy++) {
-			for (int zz = -1; zz <= 1; zz++) {
-				float3 ip = make_float3(ix + xx, iy + yy, iz + zz);
-				float3 vp = ip + cellnoise_color(ip);
-				float d = len_squared(p - vp);
-
-				if(d < da) {
-					da = d;
-					pa = vp;
-				}
-			}
-		}
-	}
-
-	return cellnoise_color(pa);
-#else
-	ssef pa, vec_p = load4f(p);
-	ssei xyzi = quick_floor_sse(vec_p);
-
-	for (int xx = -1; xx <= 1; xx++) {
-		for (int yy = -1; yy <= 1; yy++) {
-			for (int zz = -1; zz <= 1; zz++) {
-				ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0));
-				ssef vp = ip + cellnoise_color(ip);
-				float d = len_squared<1, 1, 1, 0>(vec_p - vp);
-
-				if(d < da) {
-					da = d;
-					pa = vp;
-				}
-			}
-		}
-	}
-
-	ssef color = cellnoise_color(pa);
-	return (float3 &)color;
-#endif
-}
-
-#if 0
-ccl_device float voronoi_F1(float3 p) { return voronoi_Fn(p, 0.0f, 0, -1).w; }
-ccl_device float voronoi_F2(float3 p) { return voronoi_Fn(p, 0.0f, 1, -1).w; }
-ccl_device float voronoi_F3(float3 p) { return voronoi_Fn(p, 0.0f, 2, -1).w; }
-ccl_device float voronoi_F4(float3 p) { return voronoi_Fn(p, 0.0f, 3, -1).w; }
-ccl_device float voronoi_F1F2(float3 p) { return voronoi_Fn(p, 0.0f, 0, 1).w; }
-
-ccl_device float voronoi_Cr(float3 p)
-{
-	/* crackle type pattern, just a scale/clamp of F2-F1 */
-	float t = 10.0f*voronoi_F1F2(p);
-	return (t > 1.0f)? 1.0f: t;
-}
-
-ccl_device float voronoi_F1S(float3 p) { return 2.0f*voronoi_F1(p) - 1.0f; }
-ccl_device float voronoi_F2S(float3 p) { return 2.0f*voronoi_F2(p) - 1.0f; }
-ccl_device float voronoi_F3S(float3 p) { return 2.0f*voronoi_F3(p) - 1.0f; }
-ccl_device float voronoi_F4S(float3 p) { return 2.0f*voronoi_F4(p) - 1.0f; }
-ccl_device float voronoi_F1F2S(float3 p) { return 2.0f*voronoi_F1F2(p) - 1.0f; }
-ccl_device float voronoi_CrS(float3 p) { return 2.0f*voronoi_Cr(p) - 1.0f; }
-#endif
-
-/* Noise Bases */
-
-ccl_device float noise_basis(float3 p, NodeNoiseBasis basis)
-{
-	/* Only Perlin enabled for now, others break CUDA compile by making kernel
-	 * too big, with compile using > 4GB, due to everything being inlined. */
-
-#if 0
-	if(basis == NODE_NOISE_PERLIN)
-#endif
-		return noise(p);
-#if 0
-	if(basis == NODE_NOISE_VORONOI_F1)
-		return voronoi_F1S(p);
-	if(basis == NODE_NOISE_VORONOI_F2)
-		return voronoi_F2S(p);
-	if(basis == NODE_NOISE_VORONOI_F3)
-		return voronoi_F3S(p);
-	if(basis == NODE_NOISE_VORONOI_F4)
-		return voronoi_F4S(p);
-	if(basis == NODE_NOISE_VORONOI_F2_F1)
-		return voronoi_F1F2S(p);
-	if(basis == NODE_NOISE_VORONOI_CRACKLE)
-		return voronoi_CrS(p);
-	if(basis == NODE_NOISE_CELL_NOISE)
-		return cellnoise(p);
-	
-	return 0.0f;
-#endif
-}
-
-/* Soft/Hard Noise */
-
-ccl_device float noise_basis_hard(float3 p, NodeNoiseBasis basis, int hard)
-{
-	float t = noise_basis(p, basis);
-	return (hard)? fabsf(2.0f*t - 1.0f): t;
-}
-
 /* Turbulence */
 
-ccl_device_noinline float noise_turbulence(float3 p, NodeNoiseBasis basis, float octaves, int hard)
+ccl_device_noinline float noise_turbulence(float3 p, float octaves, int hard)
 {
 	float fscale = 1.0f;
 	float amp = 1.0f;
@@ -281,7 +29,7 @@ ccl_device_noinline float noise_turbulence(float3 p, NodeNoiseBasis basis, float
 	n = float_to_int(octaves);
 
 	for(i = 0; i <= n; i++) {
-		float t = noise_basis(fscale*p, basis);
+		float t = noise(fscale*p);
 
 		if(hard)
 			t = fabsf(2.0f*t - 1.0f);
@@ -294,7 +42,7 @@ ccl_device_noinline float noise_turbulence(float3 p, NodeNoiseBasis basis, float
 	float rmd = octaves - floorf(octaves);
 
 	if(rmd != 0.0f) {
-		float t = noise_basis(fscale*p, basis);
+		float t = noise(fscale*p);
 
 		if(hard)
 			t = fabsf(2.0f*t - 1.0f);
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index 7130b14a426..009e91192eb 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -28,6 +28,29 @@ CCL_NAMESPACE_BEGIN
 
 /* Nodes */
 
+/* Known frequencies of used nodes, used for selective nodes compilation
+ * in the kernel. Currently only affects split OpenCL kernel.
+ *
+ * Keep as defines so it's easy to check which nodes are to be compiled
+ * from preprocessor.
+ *
+ * Lower the number of group more often the node is used.
+ */
+#define NODE_GROUP_LEVEL_0    0
+#define NODE_GROUP_LEVEL_1    1
+#define NODE_GROUP_LEVEL_2    2
+#define NODE_GROUP_LEVEL_3    3
+#define NODE_GROUP_LEVEL_MAX  NODE_GROUP_LEVEL_3
+
+#define NODE_FEATURE_VOLUME     (1 << 0)
+#define NODE_FEATURE_HAIR       (1 << 1)
+#define NODE_FEATURE_BUMP       (1 << 2)
+/* TODO(sergey): Consider using something like ((uint)(-1)).
+ * Need to ceck carefully operand types around usage of this
+ * define first.
+ */
+#define NODE_FEATURE_ALL        (NODE_FEATURE_VOLUME|NODE_FEATURE_HAIR|NODE_FEATURE_BUMP)
+
 typedef enum NodeType {
 	NODE_END = 0,
 	NODE_CLOSURE_BSDF,
@@ -256,27 +279,6 @@ typedef enum NodeConvert {
 	NODE_CONVERT_IV
 } NodeConvert;
 
-typedef enum NodeDistanceMetric {
-	NODE_VORONOI_DISTANCE_SQUARED,
-	NODE_VORONOI_ACTUAL_DISTANCE,
-	NODE_VORONOI_MANHATTAN,
-	NODE_VORONOI_CHEBYCHEV,
-	NODE_VORONOI_MINKOVSKY_H,
-	NODE_VORONOI_MINKOVSKY_4,
-	NODE_VORONOI_MINKOVSKY
-} NodeDistanceMetric;
-
-typedef enum NodeNoiseBasis {
-	NODE_NOISE_PERLIN,
-	NODE_NOISE_VORONOI_F1,
-	NODE_NOISE_VORONOI_F2,
-	NODE_NOISE_VORONOI_F3,
-	NODE_NOISE_VORONOI_F4,
-	NODE_NOISE_VORONOI_F2_F1,
-	NODE_NOISE_VORONOI_CRACKLE,
-	NODE_NOISE_CELL_NOISE
-} NodeNoiseBasis;
-
 typedef enum NodeMusgraveType {
 	NODE_MUSGRAVE_MULTIFRACTAL,
 	NODE_MUSGRAVE_FBM,
@@ -426,6 +428,7 @@ typedef enum ClosureType {
 #define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID)
 #define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID)
 #define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
+#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
 
 #define CLOSURE_WEIGHT_CUTOFF 1e-5f
 
diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h
index a16786f3ed3..4c32130d06d 100644
--- a/intern/cycles/kernel/svm/svm_vector_transform.h
+++ b/intern/cycles/kernel/svm/svm_vector_transform.h
@@ -33,7 +33,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo
 	NodeVectorTransformConvertSpace to = (NodeVectorTransformConvertSpace)ito;
 	
 	Transform tfm;
-	bool is_object = (sd->object != OBJECT_NONE);
+	bool is_object = (ccl_fetch(sd, object) != OBJECT_NONE);
 	bool is_direction = (type == NODE_VECTOR_TRANSFORM_TYPE_VECTOR || type == NODE_VECTOR_TRANSFORM_TYPE_NORMAL);
 	
 	/* From world */
@@ -45,7 +45,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo
 			else
 				in = transform_point(&tfm, in);
 		}
-		else if (to == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_OBJECT && is_object) {
+		else if(to == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_OBJECT && is_object) {
 			if(is_direction)
 				object_inverse_dir_transform(kg, sd, &in);
 			else
@@ -54,7 +54,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo
 	}
 	
 	/* From camera */
-	else if (from == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_CAMERA) {
+	else if(from == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_CAMERA) {
 		if(to == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_WORLD || to == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_OBJECT) {
 			tfm = kernel_data.cam.cameratoworld;
 			if(is_direction)
diff --git a/intern/cycles/kernel/svm/svm_voronoi.h b/intern/cycles/kernel/svm/svm_voronoi.h
index 5a2e6e97dd3..d612d7e973f 100644
--- a/intern/cycles/kernel/svm/svm_voronoi.h
+++ b/intern/cycles/kernel/svm/svm_voronoi.h
@@ -18,6 +18,92 @@ CCL_NAMESPACE_BEGIN
 
 /* Voronoi */
 
+ccl_device float voronoi_F1_distance(float3 p)
+{
+	/* returns squared distance in da */
+	float da = 1e10f;
+
+#ifndef __KERNEL_SSE2__
+	int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z);
+
+	for(int xx = -1; xx <= 1; xx++) {
+		for(int yy = -1; yy <= 1; yy++) {
+			for(int zz = -1; zz <= 1; zz++) {
+				float3 ip = make_float3(ix + xx, iy + yy, iz + zz);
+				float3 vp = ip + cellnoise_color(ip);
+				float d = len_squared(p - vp);
+				da = min(d, da);
+			}
+		}
+	}
+#else
+	ssef vec_p = load4f(p);
+	ssei xyzi = quick_floor_sse(vec_p);
+
+	for(int xx = -1; xx <= 1; xx++) {
+		for(int yy = -1; yy <= 1; yy++) {
+			for(int zz = -1; zz <= 1; zz++) {
+				ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0));
+				ssef vp = ip + cellnoise_color(ip);
+				float d = len_squared<1, 1, 1, 0>(vec_p - vp);
+				da = min(d, da);
+			}
+		}
+	}
+#endif
+
+	return da;
+}
+
+ccl_device float3 voronoi_F1_color(float3 p)
+{
+	/* returns color of the nearest point */
+	float da = 1e10f;
+
+#ifndef __KERNEL_SSE2__
+	float3 pa;
+	int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z);
+
+	for(int xx = -1; xx <= 1; xx++) {
+		for(int yy = -1; yy <= 1; yy++) {
+			for(int zz = -1; zz <= 1; zz++) {
+				float3 ip = make_float3(ix + xx, iy + yy, iz + zz);
+				float3 vp = ip + cellnoise_color(ip);
+				float d = len_squared(p - vp);
+
+				if(d < da) {
+					da = d;
+					pa = vp;
+				}
+			}
+		}
+	}
+
+	return cellnoise_color(pa);
+#else
+	ssef pa, vec_p = load4f(p);
+	ssei xyzi = quick_floor_sse(vec_p);
+
+	for(int xx = -1; xx <= 1; xx++) {
+		for(int yy = -1; yy <= 1; yy++) {
+			for(int zz = -1; zz <= 1; zz++) {
+				ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0));
+				ssef vp = ip + cellnoise_color(ip);
+				float d = len_squared<1, 1, 1, 0>(vec_p - vp);
+
+				if(d < da) {
+					da = d;
+					pa = vp;
+				}
+			}
+		}
+	}
+
+	ssef color = cellnoise_color(pa);
+	return (float3 &)color;
+#endif
+}
+
 ccl_device_noinline float4 svm_voronoi(NodeVoronoiColoring coloring, float3 p)
 {
 	if(coloring == NODE_VORONOI_INTENSITY) {
diff --git a/intern/cycles/kernel/svm/svm_wave.h b/intern/cycles/kernel/svm/svm_wave.h
index 36b59c3684c..6eaddaf301c 100644
--- a/intern/cycles/kernel/svm/svm_wave.h
+++ b/intern/cycles/kernel/svm/svm_wave.h
@@ -28,7 +28,7 @@ ccl_device_noinline float svm_wave(NodeWaveType type, float3 p, float detail, fl
 		n = len(p) * 20.0f;
 	
 	if(distortion != 0.0f)
-		n += distortion * noise_turbulence(p*dscale, NODE_NOISE_PERLIN, detail, 0);
+		n += distortion * noise_turbulence(p*dscale, detail, 0);
 
 	return 0.5f + 0.5f * sinf(n);
 }
diff --git a/intern/cycles/kernel/svm/svm_wavelength.h b/intern/cycles/kernel/svm/svm_wavelength.h
index 9e57c470c0f..57030f3979d 100644
--- a/intern/cycles/kernel/svm/svm_wavelength.h
+++ b/intern/cycles/kernel/svm/svm_wavelength.h
@@ -77,7 +77,7 @@ ccl_device void svm_node_wavelength(ShaderData *sd, float *stack, uint wavelengt
 	int i = float_to_int(ii);
 	float3 color;
 	
-	if (i < 0 || i >= 80) {
+	if(i < 0 || i >= 80) {
 		color = make_float3(0.0f, 0.0f, 0.0f);
 	}
 	else {
diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h
index 42fe3e8e429..30ccd523add 100644
--- a/intern/cycles/kernel/svm/svm_wireframe.h
+++ b/intern/cycles/kernel/svm/svm_wireframe.h
@@ -41,9 +41,9 @@ ccl_device float wireframe(KernelGlobals *kg,
                            float3 *P)
 {
 #ifdef __HAIR__
-	if (sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE)
+	if(ccl_fetch(sd, prim) != PRIM_NONE && ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)
 #else
-	if (sd->prim != PRIM_NONE)
+	if(ccl_fetch(sd, prim) != PRIM_NONE)
 #endif
 	{
 		float3 Co[3];
@@ -52,12 +52,12 @@ ccl_device float wireframe(KernelGlobals *kg,
 		/* Triangles */
 		int np = 3;
 
-		if(sd->type & PRIMITIVE_TRIANGLE)
-			triangle_vertices(kg, sd->prim, Co);
+		if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE)
+			triangle_vertices(kg, ccl_fetch(sd, prim), Co);
 		else
-			motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co);
+			motion_triangle_vertices(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), Co);
 
-		if(!(sd->flag & SD_TRANSFORM_APPLIED)) {
+		if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED)) {
 			object_position_transform(kg, sd, &Co[0]);
 			object_position_transform(kg, sd, &Co[1]);
 			object_position_transform(kg, sd, &Co[2]);
@@ -66,8 +66,8 @@ ccl_device float wireframe(KernelGlobals *kg,
 		if(pixel_size) {
 			// Project the derivatives of P to the viewing plane defined
 			// by I so we have a measure of how big is a pixel at this point
-			float pixelwidth_x = len(sd->dP.dx - dot(sd->dP.dx, sd->I) * sd->I);
-			float pixelwidth_y = len(sd->dP.dy - dot(sd->dP.dy, sd->I) * sd->I);
+			float pixelwidth_x = len(ccl_fetch(sd, dP).dx - dot(ccl_fetch(sd, dP).dx, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
+			float pixelwidth_y = len(ccl_fetch(sd, dP).dy - dot(ccl_fetch(sd, dP).dy, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
 			// Take the average of both axis' length
 			pixelwidth = (pixelwidth_x + pixelwidth_y) * 0.5f;
 		}
@@ -76,7 +76,7 @@ ccl_device float wireframe(KernelGlobals *kg,
 		// other half. And take the square for fast comparison
 		pixelwidth *= 0.5f * size;
 		pixelwidth *= pixelwidth;
-		for (int i = 0; i < np; i++) {
+		for(int i = 0; i < np; i++) {
 			int i2 = i ? i - 1 : np - 1;
 			float3 dir = *P - Co[i];
 			float3 edge = Co[i] - Co[i2];
@@ -84,7 +84,7 @@ ccl_device float wireframe(KernelGlobals *kg,
 			// At this point dot(crs, crs) / dot(edge, edge) is
 			// the square of area / length(edge) == square of the
 			// distance to the edge.
-			if (dot(crs, crs) < (dot(edge, edge) * pixelwidth))
+			if(dot(crs, crs) < (dot(edge, edge) * pixelwidth))
 				return 1.0f;
 		}
 	}
@@ -106,19 +106,30 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg,
 	int pixel_size = (int)use_pixel_size;
 
 	/* Calculate wireframe */
-	float f = wireframe(kg, sd, size, pixel_size, &sd->P);
+#ifdef __SPLIT_KERNEL__
+	/* TODO(sergey): This is because sd is actually a global space,
+	 * which makes it difficult to re-use same wireframe() function.
+	 *
+	 * With OpenCL 2.0 it's possible to avoid this change, but for until
+	 * then we'll be living with such an exception.
+	 */
+	float3 P = ccl_fetch(sd, P);
+	float f = wireframe(kg, sd, size, pixel_size, &P);
+#else
+	float f = wireframe(kg, sd, size, pixel_size, &ccl_fetch(sd, P));
+#endif
 
 	/* TODO(sergey): Think of faster way to calculate derivatives. */
 	if(bump_offset == NODE_BUMP_OFFSET_DX) {
-		float3 Px = sd->P - sd->dP.dx;
-		f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(sd->dP.dx);
+		float3 Px = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dx;
+		f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(ccl_fetch(sd, dP).dx);
 	}
-	else if (bump_offset == NODE_BUMP_OFFSET_DY) {
-		float3 Py = sd->P - sd->dP.dy;
-		f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(sd->dP.dy);
+	else if(bump_offset == NODE_BUMP_OFFSET_DY) {
+		float3 Py = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dy;
+		f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(ccl_fetch(sd, dP).dy);
 	}
 
-	if (stack_valid(out_fac))
+	if(stack_valid(out_fac))
 		stack_store_float(stack, out_fac, f);
 }
 
diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt
index 2dc6962633d..4e8a1794813 100644
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -18,7 +18,6 @@ set(SRC
 	attribute.cpp
 	background.cpp
 	bake.cpp
-	blackbody.cpp
 	buffers.cpp
 	camera.cpp
 	film.cpp
@@ -47,7 +46,6 @@ set(SRC_HEADERS
 	attribute.h
 	bake.h
 	background.h
-	blackbody.h
 	buffers.h
 	camera.h
 	film.h
diff --git a/intern/cycles/render/attribute.cpp b/intern/cycles/render/attribute.cpp
index 656420f5dbc..6e94459da55 100644
--- a/intern/cycles/render/attribute.cpp
+++ b/intern/cycles/render/attribute.cpp
@@ -52,7 +52,7 @@ void Attribute::set(ustring name_, TypeDesc type_, AttributeElement element_)
 
 void Attribute::reserve(int numverts, int numtris, int numsteps, int numcurves, int numkeys, bool resize)
 {
-	if (resize) {
+	if(resize) {
 		buffer.resize(buffer_size(numverts, numtris, numsteps, numcurves, numkeys), 0);
 	}
 	else {
diff --git a/intern/cycles/render/background.cpp b/intern/cycles/render/background.cpp
index f5e51f2e159..5fd7bd8f16f 100644
--- a/intern/cycles/render/background.cpp
+++ b/intern/cycles/render/background.cpp
@@ -93,7 +93,7 @@ void Background::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	need_update = false;
 }
 
-void Background::device_free(Device *device, DeviceScene *dscene)
+void Background::device_free(Device * /*device*/, DeviceScene * /*dscene*/)
 {
 }
 
diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp
index 1e8c1ac8bc9..4bbac0f91d1 100644
--- a/intern/cycles/render/bake.cpp
+++ b/intern/cycles/render/bake.cpp
@@ -55,6 +55,11 @@ void BakeData::set(int i, int prim, float uv[2], float dudx, float dudy, float d
 	m_dvdy[i] = dvdy;
 }
 
+void BakeData::set_null(int i)
+{
+	m_primitive[i] = -1;
+}
+
 int BakeData::object()
 {
 	return m_object;
@@ -221,7 +226,10 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
 	return true;
 }
 
-void BakeManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
+void BakeManager::device_update(Device * /*device*/,
+                                DeviceScene * /*dscene*/,
+                                Scene * /*scene*/,
+                                Progress& progress)
 {
 	if(!need_update)
 		return;
@@ -231,7 +239,7 @@ void BakeManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 	need_update = false;
 }
 
-void BakeManager::device_free(Device *device, DeviceScene *dscene)
+void BakeManager::device_free(Device * /*device*/, DeviceScene * /*dscene*/)
 {
 }
 
diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h
index 9ff10dafa0e..14d975a4b4e 100644
--- a/intern/cycles/render/bake.h
+++ b/intern/cycles/render/bake.h
@@ -31,6 +31,7 @@ public:
 	~BakeData();
 
 	void set(int i, int prim, float uv[2], float dudx, float dudy, float dvdx, float dvdy);
+	void set_null(int i);
 	int object();
 	size_t size();
 	uint4 data(int i);
diff --git a/intern/cycles/render/blackbody.cpp b/intern/cycles/render/blackbody.cpp
deleted file mode 100644
index 04e6eaf5373..00000000000
--- a/intern/cycles/render/blackbody.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Adapted from Open Shading Language with this license:
- *
- * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
- * All Rights Reserved.
- *
- * Modifications Copyright 2013, Blender Foundation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- * * Redistributions of source code must retain the above copyright
- *   notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- *   notice, this list of conditions and the following disclaimer in the
- *   documentation and/or other materials provided with the distribution.
- * * Neither the name of Sony Pictures Imageworks nor the names of its
- *   contributors may be used to endorse or promote products derived from
- *   this software without specific prior written permission.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "blackbody.h"
-#include "util_color.h"
-#include "util_math.h"
-
-#include "kernel_types.h"
-
-CCL_NAMESPACE_BEGIN
-
-vector<float> blackbody_table_build()
-{
-	/* quoted from OSLs opcolor.cpp
-	In order to speed up the blackbody computation, we have a table
-	storing the precomputed BB values for a range of temperatures.  Less
-	than BB_DRAPER always returns 0.  Greater than BB_MAX_TABLE_RANGE
-	does the full computation, we think it'll be rare to inquire higher
-	temperatures.
-
-	Since the bb function is so nonlinear, we actually space the table
-	entries nonlinearly, with the relationship between the table index i
-	and the temperature T as follows:
-	i = ((T-Draper)/spacing)^(1/xpower)
-	T = pow(i, xpower) * spacing + Draper
-	And furthermore, we store in the table the true value raised ^(1/5).
-	I tuned this a bit, and with the current values we can have all
-	blackbody results accurate to within 0.1% with a table size of 317
-	(about 5 KB of data).
-	*/
-
-	const float cie_colour_match[81][3] = {
-		{0.0014f,0.0000f,0.0065f}, {0.0022f,0.0001f,0.0105f}, {0.0042f,0.0001f,0.0201f},
-		{0.0076f,0.0002f,0.0362f}, {0.0143f,0.0004f,0.0679f}, {0.0232f,0.0006f,0.1102f},
-		{0.0435f,0.0012f,0.2074f}, {0.0776f,0.0022f,0.3713f}, {0.1344f,0.0040f,0.6456f},
-		{0.2148f,0.0073f,1.0391f}, {0.2839f,0.0116f,1.3856f}, {0.3285f,0.0168f,1.6230f},
-		{0.3483f,0.0230f,1.7471f}, {0.3481f,0.0298f,1.7826f}, {0.3362f,0.0380f,1.7721f},
-		{0.3187f,0.0480f,1.7441f}, {0.2908f,0.0600f,1.6692f}, {0.2511f,0.0739f,1.5281f},
-		{0.1954f,0.0910f,1.2876f}, {0.1421f,0.1126f,1.0419f}, {0.0956f,0.1390f,0.8130f},
-		{0.0580f,0.1693f,0.6162f}, {0.0320f,0.2080f,0.4652f}, {0.0147f,0.2586f,0.3533f},
-		{0.0049f,0.3230f,0.2720f}, {0.0024f,0.4073f,0.2123f}, {0.0093f,0.5030f,0.1582f},
-		{0.0291f,0.6082f,0.1117f}, {0.0633f,0.7100f,0.0782f}, {0.1096f,0.7932f,0.0573f},
-		{0.1655f,0.8620f,0.0422f}, {0.2257f,0.9149f,0.0298f}, {0.2904f,0.9540f,0.0203f},
-		{0.3597f,0.9803f,0.0134f}, {0.4334f,0.9950f,0.0087f}, {0.5121f,1.0000f,0.0057f},
-		{0.5945f,0.9950f,0.0039f}, {0.6784f,0.9786f,0.0027f}, {0.7621f,0.9520f,0.0021f},
-		{0.8425f,0.9154f,0.0018f}, {0.9163f,0.8700f,0.0017f}, {0.9786f,0.8163f,0.0014f},
-		{1.0263f,0.7570f,0.0011f}, {1.0567f,0.6949f,0.0010f}, {1.0622f,0.6310f,0.0008f},
-		{1.0456f,0.5668f,0.0006f}, {1.0026f,0.5030f,0.0003f}, {0.9384f,0.4412f,0.0002f},
-		{0.8544f,0.3810f,0.0002f}, {0.7514f,0.3210f,0.0001f}, {0.6424f,0.2650f,0.0000f},
-		{0.5419f,0.2170f,0.0000f}, {0.4479f,0.1750f,0.0000f}, {0.3608f,0.1382f,0.0000f},
-		{0.2835f,0.1070f,0.0000f}, {0.2187f,0.0816f,0.0000f}, {0.1649f,0.0610f,0.0000f},
-		{0.1212f,0.0446f,0.0000f}, {0.0874f,0.0320f,0.0000f}, {0.0636f,0.0232f,0.0000f},
-		{0.0468f,0.0170f,0.0000f}, {0.0329f,0.0119f,0.0000f}, {0.0227f,0.0082f,0.0000f},
-		{0.0158f,0.0057f,0.0000f}, {0.0114f,0.0041f,0.0000f}, {0.0081f,0.0029f,0.0000f},
-		{0.0058f,0.0021f,0.0000f}, {0.0041f,0.0015f,0.0000f}, {0.0029f,0.0010f,0.0000f},
-		{0.0020f,0.0007f,0.0000f}, {0.0014f,0.0005f,0.0000f}, {0.0010f,0.0004f,0.0000f},
-		{0.0007f,0.0002f,0.0000f}, {0.0005f,0.0002f,0.0000f}, {0.0003f,0.0001f,0.0000f},
-		{0.0002f,0.0001f,0.0000f}, {0.0002f,0.0001f,0.0000f}, {0.0001f,0.0000f,0.0000f},
-		{0.0001f,0.0000f,0.0000f}, {0.0001f,0.0000f,0.0000f}, {0.0000f,0.0000f,0.0000f}
-	};
-
-	const double c1 = 3.74183e-16; // 2*pi*h*c^2, W*m^2
-	const double c2 = 1.4388e-2;   // h*c/k, m*K
-								   // h is Planck's const, k is Boltzmann's
-	const float dlambda = 5.0f * 1e-9f;  // in meters
-
-	/* Blackbody table from 800 to 12k Kelvin (319 entries (317+2 offset) * 3) */
-	vector<float> blackbody_table(956);
-
-	float X, Y, Z;
-
-	/* ToDo: bring this back to what OSL does with the lastTemperature limit ? */
-	for (int i = 0;  i <= 317;  ++i) {
-		double Temperature = pow((double)i, (double)BB_TABLE_XPOWER) * (double)BB_TABLE_SPACING + (double)BB_DRAPER;
-		X = 0;
-		Y = 0;
-		Z = 0;
-
-		/* from OSL "spectrum_to_XYZ" */
-		for (int n = 0; n < 81; ++n) {
-			float lambda = 380.0f + 5.0f * n;
-			double wlm = lambda * 1e-9f;   // Wavelength in meters
-			// N.B. spec_intens returns result in W/m^2 but it's a differential,
-			// needs to be scaled by dlambda!
-			float spec_intens = float((c1 * pow(wlm, -5.0)) / (exp(c2 / (wlm * Temperature)) -1.0));
-			float Me = spec_intens * dlambda;
-
-			X += Me * cie_colour_match[n][0];
-			Y += Me * cie_colour_match[n][1];
-			Z += Me * cie_colour_match[n][2];
-		}
-		
-		/* Convert from xyz color space */
-		float3 col = xyz_to_rgb(X, Y, Z);
-
-		/* Clamp to zero if values are smaller */
-		col = max(col, make_float3(0.0f, 0.0f, 0.0f));
-
-		col.x = powf(col.x, 1.0f / BB_TABLE_YPOWER);
-		col.y = powf(col.y, 1.0f / BB_TABLE_YPOWER);
-		col.z = powf(col.z, 1.0f / BB_TABLE_YPOWER);
-
-		/* Store in table in RRRGGGBBB format */
-		blackbody_table[i] = col.x;
-		blackbody_table[i+319*1] = col.y;
-		blackbody_table[i+319*2] = col.z;	
-	}
-
-	return blackbody_table;
-}
-CCL_NAMESPACE_END
diff --git a/intern/cycles/render/blackbody.h b/intern/cycles/render/blackbody.h
deleted file mode 100644
index 6b752a227fa..00000000000
--- a/intern/cycles/render/blackbody.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __BLACKBODY_H__
-#define __BLACKBODY_H__
-
-#include "util_vector.h"
-
-CCL_NAMESPACE_BEGIN
-
-vector<float> blackbody_table_build();
-
-CCL_NAMESPACE_END
-
-#endif /* __BLACKBODY_H__ */
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index 5202bf5862c..fab3f701757 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -187,7 +187,7 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int
 			else if(type == PASS_MIST) {
 				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
 					float f = *in;
-					pixels[0] = clamp(f*scale_exposure, 0.0f, 1.0f);
+					pixels[0] = saturate(f*scale_exposure);
 				}
 			}
 #ifdef WITH_CYCLES_DEBUG
@@ -197,6 +197,12 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int
 					pixels[0] = f;
 				}
 			}
+			else if(type == PASS_RAY_BOUNCES) {
+				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
+					float f = *in;
+					pixels[0] = f;
+				}
+			}
 #endif
 			else {
 				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
@@ -298,7 +304,7 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int
 					pixels[2] = f.z*scale_exposure;
 
 					/* clamp since alpha might be > 1.0 due to russian roulette */
-					pixels[3] = clamp(f.w*scale, 0.0f, 1.0f);
+					pixels[3] = saturate(f.w*scale);
 				}
 			}
 		}
@@ -369,13 +375,9 @@ void DisplayBuffer::draw_set(int width, int height)
 void DisplayBuffer::draw(Device *device, const DeviceDrawParams& draw_params)
 {
 	if(draw_width != 0 && draw_height != 0) {
-		glPushMatrix();
-		glTranslatef(params.full_x, params.full_y, 0.0f);
 		device_memory& rgba = rgba_data();
 
-		device->draw_pixels(rgba, 0, draw_width, draw_height, 0, params.width, params.height, transparent, draw_params);
-
-		glPopMatrix();
+		device->draw_pixels(rgba, 0, draw_width, draw_height, params.full_x, params.full_y, params.width, params.height, transparent, draw_params);
 	}
 }
 
diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp
index ea9b853d454..89505221862 100644
--- a/intern/cycles/render/camera.cpp
+++ b/intern/cycles/render/camera.cpp
@@ -80,6 +80,7 @@ Camera::Camera()
 
 	need_update = true;
 	need_device_update = true;
+	need_flags_update = true;
 	previous_need_motion = -1;
 }
 
@@ -162,8 +163,8 @@ void Camera::update()
 		     transform_perspective(&rastertocamera, make_float3(0, 0, 0));
 	}
 	else {
-		dx = make_float3(0, 0, 0);
-		dy = make_float3(0, 0, 0);
+		dx = make_float3(0.0f, 0.0f, 0.0f);
+		dy = make_float3(0.0f, 0.0f, 0.0f);
 	}
 
 	dx = transform_direction(&cameratoworld, dx);
@@ -171,6 +172,7 @@ void Camera::update()
 
 	need_update = false;
 	need_device_update = true;
+	need_flags_update = true;
 }
 
 void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene)
@@ -179,7 +181,7 @@ void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 
 	update();
 
-	if (previous_need_motion != need_motion) {
+	if(previous_need_motion != need_motion) {
 		/* scene's motion model could have been changed since previous device
 		 * camera update this could happen for example in case when one render
 		 * layer has got motion pass and another not */
@@ -284,11 +286,11 @@ void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 	previous_need_motion = need_motion;
 }
 
-void Camera::device_update_volume(Device *device,
+void Camera::device_update_volume(Device * /*device*/,
                                   DeviceScene *dscene,
                                   Scene *scene)
 {
-	if(!need_device_update) {
+	if(!need_device_update && !need_flags_update) {
 		return;
 	}
 	KernelCamera *kcam = &dscene->data.cam;
@@ -304,9 +306,10 @@ void Camera::device_update_volume(Device *device,
 		}
 	}
 	need_device_update = false;
+	need_flags_update = false;
 }
 
-void Camera::device_free(Device *device, DeviceScene *dscene)
+void Camera::device_free(Device * /*device*/, DeviceScene * /*dscene*/)
 {
 	/* nothing to free, only writing to constant memory */
 }
@@ -368,7 +371,7 @@ float3 Camera::transform_raster_to_world(float raster_x, float raster_y)
 		 */
 		P += nearclip * D / Pclip.z;
 	}
-	else if (type == CAMERA_ORTHOGRAPHIC) {
+	else if(type == CAMERA_ORTHOGRAPHIC) {
 		D = make_float3(0.0f, 0.0f, 1.0f);
 		/* TODO(sergey): Aperture support? */
 		P = transform_perspective(&rastertocamera,
@@ -400,7 +403,7 @@ BoundBox Camera::viewplane_bounds_get()
 		bounds.grow(transform_raster_to_world((float)width, (float)height));
 		bounds.grow(transform_raster_to_world((float)width, 0.0f));
 		if(type == CAMERA_PERSPECTIVE) {
-			/* Center point has the most distancei in local Z axis,
+			/* Center point has the most distance in local Z axis,
 			 * use it to construct bounding box/
 			 */
 			bounds.grow(transform_raster_to_world(0.5f*width, 0.5f*height));
@@ -409,15 +412,4 @@ BoundBox Camera::viewplane_bounds_get()
 	return bounds;
 }
 
-Transform Camera::transform_from_viewplane(BoundBox2D &viewplane)
-{
-	return
-		transform_scale(1.0f / (viewplane.right - viewplane.left),
-		                1.0f / (viewplane.top - viewplane.bottom),
-		                1.0f) *
-		transform_translate(-viewplane.left,
-		                    -viewplane.bottom,
-		                    0.0f);
-}
-
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h
index e1faee3543d..3efbe904e2f 100644
--- a/intern/cycles/render/camera.h
+++ b/intern/cycles/render/camera.h
@@ -105,6 +105,7 @@ public:
 	/* update */
 	bool need_update;
 	bool need_device_update;
+	bool need_flags_update;
 	int previous_need_motion;
 
 	/* functions */
@@ -123,9 +124,12 @@ public:
 	bool motion_modified(const Camera& cam);
 	void tag_update();
 
+	/* Public utility functions. */
 	BoundBox viewplane_bounds_get();
+
+private:
+	/* Private utility functions. */
 	float3 transform_raster_to_world(float raster_x, float raster_y);
-	Transform transform_from_viewplane(BoundBox2D &viewplane);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/curves.cpp b/intern/cycles/render/curves.cpp
index 80dc6434cde..f671eb19cae 100644
--- a/intern/cycles/render/curves.cpp
+++ b/intern/cycles/render/curves.cpp
@@ -103,7 +103,10 @@ CurveSystemManager::~CurveSystemManager()
 {
 }
 
-void CurveSystemManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
+void CurveSystemManager::device_update(Device *device,
+                                       DeviceScene *dscene,
+                                       Scene * /*scene*/,
+                                       Progress& progress)
 {
 	if(!need_update)
 		return;
@@ -144,7 +147,8 @@ void CurveSystemManager::device_update(Device *device, DeviceScene *dscene, Scen
 	need_update = false;
 }
 
-void CurveSystemManager::device_free(Device *device, DeviceScene *dscene)
+void CurveSystemManager::device_free(Device * /*device*/,
+                                     DeviceScene * /*dscene*/)
 {
 
 }
@@ -174,7 +178,7 @@ bool CurveSystemManager::modified_mesh(const CurveSystemManager& CurveSystemMana
 		use_curves == CurveSystemManager.use_curves);
 }
 
-void CurveSystemManager::tag_update(Scene *scene)
+void CurveSystemManager::tag_update(Scene * /*scene*/)
 {
 	need_update = true;
 }
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index c6d12928dd4..7282b04a22e 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -151,6 +151,14 @@ void Pass::add(PassType type, vector<Pass>& passes)
 			pass.components = 1;
 			pass.exposure = false;
 			break;
+		case PASS_BVH_TRAVERSED_INSTANCES:
+			pass.components = 1;
+			pass.exposure = false;
+			break;
+		case PASS_RAY_BOUNCES:
+			pass.components = 1;
+			pass.exposure = false;
+			break;
 #endif
 	}
 
@@ -187,7 +195,7 @@ bool Pass::contains(const vector<Pass>& passes, PassType type)
 
 /* Pixel Filter */
 
-static float filter_func_box(float v, float width)
+static float filter_func_box(float /*v*/, float /*width*/)
 {
 	return 1.0f;
 }
@@ -399,6 +407,12 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 			case PASS_BVH_TRAVERSAL_STEPS:
 				kfilm->pass_bvh_traversal_steps = kfilm->pass_stride;
 				break;
+			case PASS_BVH_TRAVERSED_INSTANCES:
+				kfilm->pass_bvh_traversed_instances = kfilm->pass_stride;
+				break;
+			case PASS_RAY_BOUNCES:
+				kfilm->pass_ray_bounces = kfilm->pass_stride;
+				break;
 #endif
 
 			case PASS_NONE:
@@ -424,7 +438,9 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 	need_update = false;
 }
 
-void Film::device_free(Device *device, DeviceScene *dscene, Scene *scene)
+void Film::device_free(Device * /*device*/,
+                       DeviceScene * /*dscene*/,
+                       Scene *scene)
 {
 	if(filter_table_offset != TABLE_OFFSET_INVALID) {
 		scene->lookup_tables->remove_table(filter_table_offset);
@@ -459,7 +475,7 @@ void Film::tag_passes_update(Scene *scene, const vector<Pass>& passes_)
 	passes = passes_;
 }
 
-void Film::tag_update(Scene *scene)
+void Film::tag_update(Scene * /*scene*/)
 {
 	need_update = true;
 }
diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp
index 9896eaba89b..e0537101247 100644
--- a/intern/cycles/render/graph.cpp
+++ b/intern/cycles/render/graph.cpp
@@ -33,7 +33,7 @@ ShaderInput::ShaderInput(ShaderNode *parent_, const char *name_, ShaderSocketTyp
 	name = name_;
 	type = type_;
 	link = NULL;
-	value = make_float3(0, 0, 0);
+	value = make_float3(0.0f, 0.0f, 0.0f);
 	stack_offset = SVM_STACK_INVALID;
 	default_value = NONE;
 	usage = USE_ALL;
@@ -404,6 +404,36 @@ void ShaderGraph::remove_unneeded_nodes()
 				}
 			}
 		}
+		else if(node->special_type == SHADER_SPECIAL_TYPE_EMISSION) {
+			EmissionNode *em = static_cast<EmissionNode*>(node);
+
+			if(em->outputs[0]->links.size()) {
+				/* Black color or zero strength, remove node */
+				if((!em->inputs[0]->link && em->inputs[0]->value == make_float3(0.0f, 0.0f, 0.0f)) ||
+				   (!em->inputs[1]->link && em->inputs[1]->value.x == 0.0f)) {
+					vector<ShaderInput*> inputs = em->outputs[0]->links;
+
+					relink(em->inputs, inputs, NULL);
+					removed[em->id] = true;
+					any_node_removed = true;
+				}
+			}
+		}
+		else if(node->special_type == SHADER_SPECIAL_TYPE_BUMP) {
+			BumpNode *bump = static_cast<BumpNode*>(node);
+
+			if(bump->outputs[0]->links.size()) {
+				/* Height input not connected */
+				/* ToDo: Strength zero? */
+				if(!bump->inputs[0]->link) {
+					vector<ShaderInput*> inputs = bump->outputs[0]->links;
+
+					relink(bump->inputs, inputs, NULL);
+					removed[bump->id] = true;
+					any_node_removed = true;
+				}
+			}
+		}
 		else if(node->special_type == SHADER_SPECIAL_TYPE_MIX_CLOSURE) {
 			MixClosureNode *mix = static_cast<MixClosureNode*>(node);
 
@@ -545,7 +575,7 @@ void ShaderGraph::clean()
 		else
 			delete node;
 	}
-	
+
 	nodes = newnodes;
 }
 
@@ -727,10 +757,18 @@ void ShaderGraph::bump_from_displacement()
 	/* connect bump output to normal input nodes that aren't set yet. actually
 	 * this will only set the normal input to the geometry node that we created
 	 * and connected to all other normal inputs already. */
-	foreach(ShaderNode *node, nodes)
-		foreach(ShaderInput *input, node->inputs)
+	foreach(ShaderNode *node, nodes) {
+		/* Don't connect normal to the bump node we're coming from,
+		 * otherwise it'll be a cycle in graph.
+		 */
+		if(node == bump) {
+			continue;
+		}
+		foreach(ShaderInput *input, node->inputs) {
 			if(!input->link && input->default_value == ShaderInput::NORMAL)
 				connect(set_normal->output("Normal"), input);
+		}
+	}
 
 	/* for displacement bump, clear the normal input in case the above loop
 	 * connected the setnormal out to the bump normalin */
@@ -825,6 +863,26 @@ void ShaderGraph::transform_multi_closure(ShaderNode *node, ShaderOutput *weight
 	}
 }
 
+int ShaderGraph::get_num_closures()
+{
+	int num_closures = 0;
+	foreach(ShaderNode *node, nodes) {
+		if(node->special_type == SHADER_SPECIAL_TYPE_CLOSURE) {
+			BsdfNode *bsdf_node = static_cast<BsdfNode*>(node);
+			/* TODO(sergey): Make it more generic approach, maybe some utility
+			 * macros like CLOSURE_IS_FOO()?
+			 */
+			if(CLOSURE_IS_BSSRDF(bsdf_node->closure))
+				num_closures = num_closures + 3;
+			else if(CLOSURE_IS_GLASS(bsdf_node->closure))
+				num_closures = num_closures + 2;
+			else
+				num_closures = num_closures + 1;
+		}
+	}
+	return num_closures;
+}
+
 void ShaderGraph::dump_graph(const char *filename)
 {
 	FILE *fd = fopen(filename, "w");
@@ -836,29 +894,59 @@ void ShaderGraph::dump_graph(const char *filename)
 
 	fprintf(fd, "digraph shader_graph {\n");
 	fprintf(fd, "ranksep=1.5\n");
+	fprintf(fd, "rankdir=LR\n");
 	fprintf(fd, "splines=false\n");
 
 	foreach(ShaderNode *node, nodes) {
 		fprintf(fd, "// NODE: %p\n", node);
-		fprintf(fd,
-		        "\"%p\" [shape=record,label=\"%s\"]\n",
-		        node,
-		        node->name.c_str());
+		fprintf(fd, "\"%p\" [shape=record,label=\"{", node);
+		if(node->inputs.size()) {
+			fprintf(fd, "{");
+			foreach(ShaderInput *socket, node->inputs) {
+				if(socket != node->inputs[0]) {
+					fprintf(fd, "|");
+				}
+				fprintf(fd, "<IN_%p>%s", socket, socket->name);
+			}
+			fprintf(fd, "}|");
+		}
+		fprintf(fd, "%s", node->name.c_str());
+		if(node->bump == SHADER_BUMP_CENTER) {
+			fprintf(fd, " (bump:center)");
+		}
+		else if(node->bump == SHADER_BUMP_DX) {
+			fprintf(fd, " (bump:dx)");
+		}
+		else if(node->bump == SHADER_BUMP_DY) {
+			fprintf(fd, " (bump:dy)");
+		}
+		if(node->outputs.size()) {
+			fprintf(fd, "|{");
+			foreach(ShaderOutput *socket, node->outputs) {
+				if(socket != node->outputs[0]) {
+					fprintf(fd, "|");
+				}
+				fprintf(fd, "<OUT_%p>%s", socket, socket->name);
+			}
+			fprintf(fd, "}");
+		}
+		fprintf(fd, "}\"]");
 	}
 
 	foreach(ShaderNode *node, nodes) {
 		foreach(ShaderOutput *output, node->outputs) {
 			foreach(ShaderInput *input, output->links) {
 				fprintf(fd,
-				        "// CONNECTION: %p->%p (%s:%s)\n",
+				        "// CONNECTION: OUT_%p->IN_%p (%s:%s)\n",
 				        output,
 				        input,
 				        output->name, input->name);
 				fprintf(fd,
-				        "\"%p\":s -> \"%p\":n [label=\"%s:%s\"]\n",
+				        "\"%p\":\"OUT_%p\":e -> \"%p\":\"IN_%p\":w [label=\"\"]\n",
 				        output->parent,
+				        output,
 				        input->parent,
-				        output->name, input->name);
+				        input);
 			}
 		}
 	}
diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h
index b39b3dae324..9117fd03a95 100644
--- a/intern/cycles/render/graph.h
+++ b/intern/cycles/render/graph.h
@@ -81,6 +81,10 @@ enum ShaderNodeSpecialType {
 	SHADER_SPECIAL_TYPE_GEOMETRY,
 	SHADER_SPECIAL_TYPE_SCRIPT,
 	SHADER_SPECIAL_TYPE_BACKGROUND,
+	SHADER_SPECIAL_TYPE_IMAGE_SLOT,
+	SHADER_SPECIAL_TYPE_CLOSURE,
+	SHADER_SPECIAL_TYPE_EMISSION,
+	SHADER_SPECIAL_TYPE_BUMP,
 };
 
 /* Enum
@@ -193,7 +197,6 @@ public:
 	virtual bool has_surface_emission() { return false; }
 	virtual bool has_surface_transparent() { return false; }
 	virtual bool has_surface_bssrdf() { return false; }
-	virtual bool has_converter_blackbody() { return false; }
 	virtual bool has_bssrdf_bump() { return false; }
 	virtual bool has_spatial_varying() { return false; }
 	virtual bool has_object_dependency() { return false; }
@@ -206,6 +209,24 @@ public:
 	ShaderBump bump; /* for bump mapping utility */
 	
 	ShaderNodeSpecialType special_type;	/* special node type */
+
+	/* ** Selective nodes compilation ** */
+
+	/* TODO(sergey): More explicitly mention in the function names
+	 * that those functions are for selective compilation only?
+	 */
+
+	/* Nodes are split into several groups, group of level 0 contains
+	 * nodes which are most commonly used, further levels are extension
+	 * of previous one and includes less commonly used nodes.
+	 */
+	virtual int get_group() { return NODE_GROUP_LEVEL_0; }
+
+	/* Node feature are used to disable huge nodes inside the group,
+	 * so it's possible to disable huge nodes inside of the required
+	 * nodes group.
+	 */
+	virtual int get_feature() { return bump == SHADER_BUMP_NONE ? 0 : NODE_FEATURE_BUMP; }
 };
 
 
@@ -253,6 +274,8 @@ public:
 	void remove_unneeded_nodes();
 	void finalize(bool do_bump = false, bool do_osl = false);
 
+	int get_num_closures();
+
 	void dump_graph(const char *filename);
 
 protected:
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp
index 61a0a81d51d..c62afcd7719 100644
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -407,7 +407,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 			int scanlinesize = width*components*sizeof(uchar);
 
 			in->read_image(TypeDesc::UINT8,
-				(uchar*)pixels + (height-1)*scanlinesize,
+				(uchar*)pixels + (((size_t)height)-1)*scanlinesize,
 				AutoStride,
 				-scanlinesize,
 				AutoStride);
@@ -425,9 +425,10 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 		builtin_image_pixels_cb(img->filename, img->builtin_data, pixels);
 	}
 
+	size_t num_pixels = ((size_t)width) * height * depth;
 	if(cmyk) {
 		/* CMYK */
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+2] = (pixels[i*4+2]*pixels[i*4+3])/255;
 			pixels[i*4+1] = (pixels[i*4+1]*pixels[i*4+3])/255;
 			pixels[i*4+0] = (pixels[i*4+0]*pixels[i*4+3])/255;
@@ -436,7 +437,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 	}
 	else if(components == 2) {
 		/* grayscale + alpha */
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+3] = pixels[i*2+1];
 			pixels[i*4+2] = pixels[i*2+0];
 			pixels[i*4+1] = pixels[i*2+0];
@@ -445,7 +446,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 	}
 	else if(components == 3) {
 		/* RGB */
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+3] = 255;
 			pixels[i*4+2] = pixels[i*3+2];
 			pixels[i*4+1] = pixels[i*3+1];
@@ -454,7 +455,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 	}
 	else if(components == 1) {
 		/* grayscale */
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+3] = 255;
 			pixels[i*4+2] = pixels[i];
 			pixels[i*4+1] = pixels[i];
@@ -463,7 +464,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 	}
 
 	if(img->use_alpha == false) {
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+3] = 255;
 		}
 	}
@@ -529,7 +530,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 		vector<float> tmppixels;
 
 		if(components > 4) {
-			tmppixels.resize(width*height*components);
+			tmppixels.resize(((size_t)width)*height*components);
 			readpixels = &tmppixels[0];
 		}
 
@@ -547,7 +548,8 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 		}
 
 		if(components > 4) {
-			for(int i = width*height-1; i >= 0; i--) {
+			size_t dimensions = ((size_t)width)*height;
+			for(size_t i = dimensions-1, pixel = 0; pixel < dimensions; pixel++, i--) {
 				pixels[i*4+3] = tmppixels[i*components+3];
 				pixels[i*4+2] = tmppixels[i*components+2];
 				pixels[i*4+1] = tmppixels[i*components+1];
@@ -566,9 +568,10 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 		builtin_image_float_pixels_cb(img->filename, img->builtin_data, pixels);
 	}
 
+	size_t num_pixels = ((size_t)width) * height * depth;
 	if(cmyk) {
 		/* CMYK */
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+3] = 255;
 			pixels[i*4+2] = (pixels[i*4+2]*pixels[i*4+3])/255;
 			pixels[i*4+1] = (pixels[i*4+1]*pixels[i*4+3])/255;
@@ -577,7 +580,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 	}
 	else if(components == 2) {
 		/* grayscale + alpha */
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+3] = pixels[i*2+1];
 			pixels[i*4+2] = pixels[i*2+0];
 			pixels[i*4+1] = pixels[i*2+0];
@@ -586,7 +589,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 	}
 	else if(components == 3) {
 		/* RGB */
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+3] = 1.0f;
 			pixels[i*4+2] = pixels[i*3+2];
 			pixels[i*4+1] = pixels[i*3+1];
@@ -595,7 +598,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 	}
 	else if(components == 1) {
 		/* grayscale */
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+3] = 1.0f;
 			pixels[i*4+2] = pixels[i];
 			pixels[i*4+1] = pixels[i];
@@ -604,7 +607,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 	}
 
 	if(img->use_alpha == false) {
-		for(int i = width*height*depth-1; i >= 0; i--) {
+		for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
 			pixels[i*4+3] = 1.0f;
 		}
 	}
@@ -791,7 +794,36 @@ void ImageManager::device_update(Device *device, DeviceScene *dscene, Progress&
 	need_update = false;
 }
 
-void ImageManager::device_pack_images(Device *device, DeviceScene *dscene, Progress& progess)
+void ImageManager::device_update_slot(Device *device,
+                                      DeviceScene *dscene,
+                                      int slot,
+                                      Progress *progress)
+{
+	Image *image;
+	if(slot >= tex_image_byte_start) {
+		int byte_slot = slot - tex_image_byte_start;
+		assert(images[byte_slot] != NULL);
+		image = images[byte_slot];
+	}
+	else {
+		assert(float_images[slot] != NULL);
+		image = float_images[slot];
+	}
+	if(image->users == 0) {
+		device_free_image(device, dscene, slot);
+	}
+	else if(image->need_load) {
+		if(!osl_texture_system || float_images[slot]->builtin_data)
+			device_load_image(device,
+			                  dscene,
+			                  slot,
+			                  progress);
+	}
+}
+
+void ImageManager::device_pack_images(Device *device,
+                                      DeviceScene *dscene,
+                                      Progress& /*progess*/)
 {
 	/* for OpenCL, we pack all image textures inside a single big texture, and
 	 * will do our own interpolation in the kernel */
diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h
index 2f5dcb6efd5..70cc4935daa 100644
--- a/intern/cycles/render/image.h
+++ b/intern/cycles/render/image.h
@@ -63,6 +63,7 @@ public:
 	bool is_float_image(const string& filename, void *builtin_data, bool& is_linear);
 
 	void device_update(Device *device, DeviceScene *dscene, Progress& progress);
+	void device_update_slot(Device *device, DeviceScene *dscene, int slot, Progress *progress);
 	void device_free(Device *device, DeviceScene *dscene);
 	void device_free_builtin(Device *device, DeviceScene *dscene);
 
@@ -73,9 +74,9 @@ public:
 
 	bool need_update;
 
-	boost::function<void(const string &filename, void *data, bool &is_float, int &width, int &height, int &depth, int &channels)> builtin_image_info_cb;
-	boost::function<bool(const string &filename, void *data, unsigned char *pixels)> builtin_image_pixels_cb;
-	boost::function<bool(const string &filename, void *data, float *pixels)> builtin_image_float_pixels_cb;
+	function<void(const string &filename, void *data, bool &is_float, int &width, int &height, int &depth, int &channels)> builtin_image_info_cb;
+	function<bool(const string &filename, void *data, unsigned char *pixels)> builtin_image_pixels_cb;
+	function<bool(const string &filename, void *data, float *pixels)> builtin_image_float_pixels_cb;
 
 	struct Image {
 		string filename;
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index 17c50ca826f..465d7ea02c6 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -216,7 +216,7 @@ bool Integrator::modified(const Integrator& integrator)
 		sample_all_lights_indirect == integrator.sample_all_lights_indirect);
 }
 
-void Integrator::tag_update(Scene *scene)
+void Integrator::tag_update(Scene * /*scene*/)
 {
 	need_update = true;
 }
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index b6c7b379a7e..4e962616263 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -129,6 +129,8 @@ Light::Light()
 	shader = 0;
 	samples = 1;
 	max_bounces = 1024;
+
+	is_portal = false;
 }
 
 void Light::tag_update(Scene *scene)
@@ -136,6 +138,17 @@ void Light::tag_update(Scene *scene)
 	scene->light_manager->need_update = true;
 }
 
+bool Light::has_contribution(Scene *scene)
+{
+	if(is_portal) {
+		return false;
+	}
+	if(type == LIGHT_BACKGROUND) {
+		return true;
+	}
+	return scene->shaders[shader]->has_surface_emission;
+}
+
 /* Light Manager */
 
 LightManager::LightManager()
@@ -153,10 +166,17 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 	progress.set_status("Updating Lights", "Computing distribution");
 
 	/* count */
-	size_t num_lights = scene->lights.size();
+	size_t num_lights = 0;
 	size_t num_background_lights = 0;
 	size_t num_triangles = 0;
 
+	bool background_mis = false;
+
+	foreach(Light *light, scene->lights) {
+		if(light->has_contribution(scene))
+			num_lights++;
+	}
+
 	foreach(Object *object, scene->objects) {
 		Mesh *mesh = object->mesh;
 		bool have_emission = false;
@@ -287,22 +307,29 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 	float trianglearea = totarea;
 
 	/* point lights */
-	float lightarea = (totarea > 0.0f)? totarea/scene->lights.size(): 1.0f;
+	float lightarea = (totarea > 0.0f) ? totarea / num_lights : 1.0f;
 	bool use_lamp_mis = false;
 
-	for(int i = 0; i < scene->lights.size(); i++, offset++) {
-		Light *light = scene->lights[i];
+	int light_index = 0;
+	foreach(Light *light, scene->lights) {
+		if(!light->has_contribution(scene))
+			continue;
 
 		distribution[offset].x = totarea;
-		distribution[offset].y = __int_as_float(~(int)i);
+		distribution[offset].y = __int_as_float(~light_index);
 		distribution[offset].z = 1.0f;
 		distribution[offset].w = light->size;
 		totarea += lightarea;
 
 		if(light->size > 0.0f && light->use_mis)
 			use_lamp_mis = true;
-		if(light->type == LIGHT_BACKGROUND)
+		if(light->type == LIGHT_BACKGROUND) {
 			num_background_lights++;
+			background_mis = light->use_mis;
+		}
+
+		light_index++;
+		offset++;
 	}
 
 	/* normalize cumulative distribution functions */
@@ -364,6 +391,18 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 
 		/* CDF */
 		device->tex_alloc("__light_distribution", dscene->light_distribution);
+
+		/* Portals */
+		if(num_background_lights > 0 && light_index != scene->lights.size()) {
+			kintegrator->portal_offset = light_index;
+			kintegrator->num_portals = scene->lights.size() - light_index;
+			kintegrator->portal_pdf = background_mis? 0.5f: 1.0f;
+		}
+		else {
+			kintegrator->num_portals = 0;
+			kintegrator->portal_offset = 0;
+			kintegrator->portal_pdf = 0.0f;
+		}
 	}
 	else {
 		dscene->light_distribution.clear();
@@ -374,6 +413,10 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 		kintegrator->pdf_lights = 0.0f;
 		kintegrator->inv_pdf_lights = 0.0f;
 		kintegrator->use_lamp_mis = false;
+		kintegrator->num_portals = 0;
+		kintegrator->portal_offset = 0;
+		kintegrator->portal_pdf = 0.0f;
+
 		kfilm->pass_shadow_scale = 1.0f;
 	}
 }
@@ -389,16 +432,16 @@ static void background_cdf(int start,
 	for(int i = start; i < end; i++) {
 		float sin_theta = sinf(M_PI_F * (i + 0.5f) / res);
 		float3 env_color = (*pixels)[i * res];
-		float ave_luminamce = average(env_color);
+		float ave_luminance = average(env_color);
 
-		cond_cdf[i * cdf_count].x = ave_luminamce * sin_theta;
+		cond_cdf[i * cdf_count].x = ave_luminance * sin_theta;
 		cond_cdf[i * cdf_count].y = 0.0f;
 
 		for(int j = 1; j < res; j++) {
 			env_color = (*pixels)[i * res + j];
-			ave_luminamce = average(env_color);
+			ave_luminance = average(env_color);
 
-			cond_cdf[i * cdf_count + j].x = ave_luminamce * sin_theta;
+			cond_cdf[i * cdf_count + j].x = ave_luminance * sin_theta;
 			cond_cdf[i * cdf_count + j].y = cond_cdf[i * cdf_count + j - 1].y + cond_cdf[i * cdf_count + j - 1].x / res;
 		}
 
@@ -513,10 +556,8 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 	if(scene->lights.size() == 0)
 		return;
 
-	float4 *light_data = dscene->light_data.resize(scene->lights.size()*LIGHT_SIZE);
-
-	if(!device->info.advanced_shading) {
-		/* remove unsupported light */
+	/* remove background light? */
+	if(!(device->info.advanced_shading)) {
 		foreach(Light *light, scene->lights) {
 			if(light->type == LIGHT_BACKGROUND) {
 				scene->lights.erase(std::remove(scene->lights.begin(), scene->lights.end(), light), scene->lights.end());
@@ -525,10 +566,16 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 		}
 	}
 
-	for(size_t i = 0; i < scene->lights.size(); i++) {
-		Light *light = scene->lights[i];
+	float4 *light_data = dscene->light_data.resize(scene->lights.size()*LIGHT_SIZE);
+	int light_index = 0;
+
+	foreach(Light *light, scene->lights) {
+		if(!light->has_contribution(scene)) {
+			continue;
+		}
+
 		float3 co = light->co;
-		int shader_id = scene->shader_manager->get_shader_id(scene->lights[i]->shader);
+		int shader_id = scene->shader_manager->get_shader_id(light->shader);
 		float samples = __int_as_float(light->samples);
 		float max_bounces = __int_as_float(light->max_bounces);
 
@@ -561,11 +608,11 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 			if(light->use_mis && radius > 0.0f)
 				shader_id |= SHADER_USE_MIS;
 
-			light_data[i*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z);
-			light_data[i*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, invarea, 0.0f);
-			light_data[i*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			light_data[i*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
-			light_data[i*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z);
+			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, invarea, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
 		else if(light->type == LIGHT_DISTANT) {
 			shader_id &= ~SHADER_AREA_LIGHT;
@@ -582,11 +629,11 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 			if(light->use_mis && area > 0.0f)
 				shader_id |= SHADER_USE_MIS;
 
-			light_data[i*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), dir.x, dir.y, dir.z);
-			light_data[i*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, cosangle, invarea);
-			light_data[i*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			light_data[i*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
-			light_data[i*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), dir.x, dir.y, dir.z);
+			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, cosangle, invarea);
+			light_data[light_index*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
 		else if(light->type == LIGHT_BACKGROUND) {
 			uint visibility = scene->background->visibility;
@@ -611,11 +658,11 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 				use_light_visibility = true;
 			}
 
-			light_data[i*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), 0.0f, 0.0f, 0.0f);
-			light_data[i*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), 0.0f, 0.0f, 0.0f);
-			light_data[i*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			light_data[i*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
-			light_data[i*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
 		else if(light->type == LIGHT_AREA) {
 			float3 axisu = light->axisu*(light->sizeu*light->size);
@@ -629,11 +676,11 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 			if(light->use_mis && area > 0.0f)
 				shader_id |= SHADER_USE_MIS;
 
-			light_data[i*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z);
-			light_data[i*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), axisu.x, axisu.y, axisu.z);
-			light_data[i*LIGHT_SIZE + 2] = make_float4(invarea, axisv.x, axisv.y, axisv.z);
-			light_data[i*LIGHT_SIZE + 3] = make_float4(samples, dir.x, dir.y, dir.z);
-			light_data[i*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z);
+			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), axisu.x, axisu.y, axisu.z);
+			light_data[light_index*LIGHT_SIZE + 2] = make_float4(invarea, axisv.x, axisv.y, axisv.z);
+			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, dir.x, dir.y, dir.z);
+			light_data[light_index*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
 		else if(light->type == LIGHT_SPOT) {
 			shader_id &= ~SHADER_AREA_LIGHT;
@@ -649,19 +696,52 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 			if(light->use_mis && radius > 0.0f)
 				shader_id |= SHADER_USE_MIS;
 
-			light_data[i*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z);
-			light_data[i*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, invarea, spot_angle);
-			light_data[i*LIGHT_SIZE + 2] = make_float4(spot_smooth, dir.x, dir.y, dir.z);
-			light_data[i*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
-			light_data[i*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z);
+			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, invarea, spot_angle);
+			light_data[light_index*LIGHT_SIZE + 2] = make_float4(spot_smooth, dir.x, dir.y, dir.z);
+			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
+			light_data[light_index*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
+
+		light_index++;
+	}
+
+	/* TODO(sergey): Consider moving portals update to their own function
+	 * keeping this one more manageable.
+	 */
+	foreach(Light *light, scene->lights) {
+		if(!light->is_portal)
+			continue;
+		assert(light->type == LIGHT_AREA);
+
+		float3 co = light->co;
+		float3 axisu = light->axisu*(light->sizeu*light->size);
+		float3 axisv = light->axisv*(light->sizev*light->size);
+		float area = len(axisu)*len(axisv);
+		float invarea = (area > 0.0f) ? 1.0f / area : 1.0f;
+		float3 dir = light->dir;
+
+		dir = safe_normalize(dir);
+
+		light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z);
+		light_data[light_index*LIGHT_SIZE + 1] = make_float4(area, axisu.x, axisu.y, axisu.z);
+		light_data[light_index*LIGHT_SIZE + 2] = make_float4(invarea, axisv.x, axisv.y, axisv.z);
+		light_data[light_index*LIGHT_SIZE + 3] = make_float4(-1, dir.x, dir.y, dir.z);
+		light_data[light_index*LIGHT_SIZE + 4] = make_float4(-1, 0.0f, 0.0f, 0.0f);
+
+		light_index++;
 	}
-	
+
+	VLOG(1) << "Number of lights without contribution: "
+	        << scene->lights.size() - light_index;
+
 	device->tex_alloc("__light_data", dscene->light_data);
 }
 
 void LightManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
+	VLOG(1) << "Total " << scene->lights.size() << " lights.";
+
 	if(!need_update)
 		return;
 
@@ -699,7 +779,7 @@ void LightManager::device_free(Device *device, DeviceScene *dscene)
 	dscene->light_background_conditional_cdf.clear();
 }
 
-void LightManager::tag_update(Scene *scene)
+void LightManager::tag_update(Scene * /*scene*/)
 {
 	need_update = true;
 }
diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h
index 1f8eac6a97f..afec3628dda 100644
--- a/intern/cycles/render/light.h
+++ b/intern/cycles/render/light.h
@@ -56,11 +56,16 @@ public:
 	bool use_transmission;
 	bool use_scatter;
 
+	bool is_portal;
+
 	int shader;
 	int samples;
 	int max_bounces;
 
 	void tag_update(Scene *scene);
+
+	/* Check whether the light has contribution the the scene. */
+	bool has_contribution(Scene *scene);
 };
 
 class LightManager {
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index 9c7310d4a05..45685fe5927 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -20,9 +20,11 @@
 #include "camera.h"
 #include "curves.h"
 #include "device.h"
+#include "graph.h"
 #include "shader.h"
 #include "light.h"
 #include "mesh.h"
+#include "nodes.h"
 #include "object.h"
 #include "scene.h"
 
@@ -135,7 +137,7 @@ void Mesh::clear()
 	transform_applied = false;
 	transform_negative_scaled = false;
 	transform_normal = transform_identity();
-	geometry_synced = false;
+	geometry_flags = GEOMETRY_NONE;
 }
 
 int Mesh::split_vertex(int vertex)
@@ -210,11 +212,11 @@ void Mesh::compute_bounds()
 			bnds.grow(float4_to_float3(curve_keys[i]), curve_keys[i].w);
 
 		Attribute *attr = attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-		if (use_motion_blur && attr) {
+		if(use_motion_blur && attr) {
 			size_t steps_size = verts.size() * (motion_steps - 1);
 			float3 *vert_steps = attr->data_float3();
 	
-			for (size_t i = 0; i < steps_size; i++)
+			for(size_t i = 0; i < steps_size; i++)
 				bnds.grow(vert_steps[i]);
 		}
 
@@ -223,7 +225,7 @@ void Mesh::compute_bounds()
 			size_t steps_size = curve_keys.size() * (motion_steps - 1);
 			float3 *key_steps = curve_attr->data_float3();
 	
-			for (size_t i = 0; i < steps_size; i++)
+			for(size_t i = 0; i < steps_size; i++)
 				bnds.grow(key_steps[i]);
 		}
 
@@ -237,19 +239,19 @@ void Mesh::compute_bounds()
 			for(size_t i = 0; i < curve_keys_size; i++)
 				bnds.grow_safe(float4_to_float3(curve_keys[i]), curve_keys[i].w);
 			
-			if (use_motion_blur && attr) {
+			if(use_motion_blur && attr) {
 				size_t steps_size = verts.size() * (motion_steps - 1);
 				float3 *vert_steps = attr->data_float3();
 		
-				for (size_t i = 0; i < steps_size; i++)
+				for(size_t i = 0; i < steps_size; i++)
 					bnds.grow_safe(vert_steps[i]);
 			}
 
-			if (use_motion_blur && curve_attr) {
+			if(use_motion_blur && curve_attr) {
 				size_t steps_size = curve_keys.size() * (motion_steps - 1);
 				float3 *key_steps = curve_attr->data_float3();
 		
-				for (size_t i = 0; i < steps_size; i++)
+				for(size_t i = 0; i < steps_size; i++)
 					bnds.grow_safe(key_steps[i]);
 			}
 		}
@@ -653,6 +655,10 @@ void MeshManager::update_osl_attributes(Device *device, Scene *scene, vector<Att
 			}
 		}
 	}
+#else
+	(void)device;
+	(void)scene;
+	(void)mesh_attributes;
 #endif
 }
 
@@ -1095,6 +1101,10 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene *
 		dscene->bvh_nodes.reference((float4*)&pack.nodes[0], pack.nodes.size());
 		device->tex_alloc("__bvh_nodes", dscene->bvh_nodes);
 	}
+	if(pack.leaf_nodes.size()) {
+		dscene->bvh_leaf_nodes.reference((float4*)&pack.leaf_nodes[0], pack.leaf_nodes.size());
+		device->tex_alloc("__bvh_leaf_nodes", dscene->bvh_leaf_nodes);
+	}
 	if(pack.object_node.size()) {
 		dscene->object_node.reference((uint*)&pack.object_node[0], pack.object_node.size());
 		device->tex_alloc("__object_node", dscene->object_node);
@@ -1124,7 +1134,10 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene *
 	dscene->data.bvh.use_qbvh = scene->params.use_qbvh;
 }
 
-void MeshManager::device_update_flags(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
+void MeshManager::device_update_flags(Device * /*device*/,
+                                      DeviceScene * /*dscene*/,
+                                      Scene * scene,
+                                      Progress& /*progress*/)
 {
 	if(!need_update && !need_flags_update) {
 		return;
@@ -1141,8 +1154,59 @@ void MeshManager::device_update_flags(Device *device, DeviceScene *dscene, Scene
 	need_flags_update = false;
 }
 
+void MeshManager::device_update_displacement_images(Device *device,
+                                                    DeviceScene *dscene,
+                                                    Scene *scene,
+                                                    Progress& progress)
+{
+	progress.set_status("Updating Displacement Images");
+	TaskPool pool;
+	ImageManager *image_manager = scene->image_manager;
+	set<int> bump_images;
+	foreach(Mesh *mesh, scene->meshes) {
+		if(mesh->need_update) {
+			foreach(uint shader_index, mesh->used_shaders) {
+				Shader *shader = scene->shaders[shader_index];
+				if(shader->graph_bump == NULL) {
+					continue;
+				}
+				foreach(ShaderNode* node, shader->graph_bump->nodes) {
+					if(node->special_type != SHADER_SPECIAL_TYPE_IMAGE_SLOT) {
+						continue;
+					}
+					if(device->info.pack_images) {
+						/* If device requires packed images we need to update all
+						 * images now, even if they're not used for displacement.
+						 */
+						image_manager->device_update(device,
+						                             dscene,
+						                             progress);
+						return;
+					}
+					ImageSlotNode *image_node = static_cast<ImageSlotNode*>(node);
+					int slot = image_node->slot;
+					if(slot != -1) {
+						bump_images.insert(slot);
+					}
+				}
+			}
+		}
+	}
+	foreach(int slot, bump_images) {
+		pool.push(function_bind(&ImageManager::device_update_slot,
+		                        image_manager,
+		                        device,
+		                        dscene,
+		                        slot,
+		                        &progress));
+	}
+	pool.wait_work();
+}
+
 void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
+	VLOG(1) << "Total " << scene->meshes.size() << " meshes.";
+
 	if(!need_update)
 		return;
 
@@ -1161,6 +1225,28 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 		}
 	}
 
+	/* Update images needed for true displacement. */
+	bool need_displacement_images = false;
+	bool old_need_object_flags_update = false;
+	foreach(Mesh *mesh, scene->meshes) {
+		if(mesh->need_update &&
+		   mesh->displacement_method != Mesh::DISPLACE_BUMP)
+		{
+			need_displacement_images = true;
+			break;
+		}
+	}
+	if(need_displacement_images) {
+		VLOG(1) << "Updating images used for true displacement.";
+		device_update_displacement_images(device, dscene, scene, progress);
+		old_need_object_flags_update = scene->object_manager->need_flags_update;
+		scene->object_manager->device_update_flags(device,
+		                                           dscene,
+		                                           scene,
+		                                           progress,
+		                                           false);
+	}
+
 	/* device update */
 	device_free(device, dscene);
 
@@ -1208,7 +1294,9 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 			                        &progress,
 			                        i,
 			                        num_bvh));
-			i++;
+			if(!mesh->transform_applied) {
+				i++;
+			}
 		}
 	}
 
@@ -1233,11 +1321,22 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 	device_update_bvh(device, dscene, scene, progress);
 
 	need_update = false;
+
+	if(need_displacement_images) {
+		/* Re-tag flags for update, so they're re-evaluated
+		 * for meshes with correct bounding boxes.
+		 *
+		 * This wouldn't cause wrong results, just true
+		 * displacement might be less optimal ot calculate.
+		 */
+		scene->object_manager->need_flags_update = old_need_object_flags_update;
+	}
 }
 
 void MeshManager::device_free(Device *device, DeviceScene *dscene)
 {
 	device->tex_free(dscene->bvh_nodes);
+	device->tex_free(dscene->bvh_leaf_nodes);
 	device->tex_free(dscene->object_node);
 	device->tex_free(dscene->tri_woop);
 	device->tex_free(dscene->prim_type);
diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h
index 62e775e5bc9..76c186a3feb 100644
--- a/intern/cycles/render/mesh.h
+++ b/intern/cycles/render/mesh.h
@@ -71,8 +71,13 @@ public:
 	ustring name;
 
 	/* Mesh Data */
-	bool geometry_synced;  /* used to distinguish meshes with no verts
-	                          and meshed for which geometry is not created */
+	enum GeometryFlags {
+		GEOMETRY_NONE      = 0,
+		GEOMETRY_TRIANGLES = (1 << 0),
+		GEOMETRY_CURVES    = (1 << 1),
+	};
+	int geometry_flags;  /* used to distinguish meshes with no verts
+	                        and meshed for which geometry is not created */
 
 	vector<float3> verts;
 	vector<Triangle> triangles;
@@ -162,6 +167,7 @@ public:
 	void device_update_attributes(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
 	void device_update_bvh(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
 	void device_update_flags(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
+	void device_update_displacement_images(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
 	void device_free(Device *device, DeviceScene *dscene);
 
 	void tag_update(Scene *scene);
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index 7a39811cacd..69ae2078216 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -186,7 +186,7 @@ ShaderEnum ImageTextureNode::color_space_enum = color_space_init();
 ShaderEnum ImageTextureNode::projection_enum = image_projection_init();
 
 ImageTextureNode::ImageTextureNode()
-: TextureNode("image_texture")
+: ImageSlotTextureNode("image_texture")
 {
 	image_manager = NULL;
 	slot = -1;
@@ -227,7 +227,7 @@ void ImageTextureNode::attributes(Shader *shader, AttributeRequestSet *attribute
 #ifdef WITH_PTEX
 	/* todo: avoid loading other texture coordinates when using ptex,
 	 * and hide texture coordinate socket in the UI */
-	if (shader->has_surface && string_endswith(filename, ".ptx")) {
+	if(shader->has_surface && string_endswith(filename, ".ptx")) {
 		/* ptex */
 		attributes->add(ATTR_STD_PTEX_FACE_ID);
 		attributes->add(ATTR_STD_PTEX_UV);
@@ -380,7 +380,7 @@ ShaderEnum EnvironmentTextureNode::color_space_enum = color_space_init();
 ShaderEnum EnvironmentTextureNode::projection_enum = env_projection_init();
 
 EnvironmentTextureNode::EnvironmentTextureNode()
-: TextureNode("environment_texture")
+: ImageSlotTextureNode("environment_texture")
 {
 	image_manager = NULL;
 	slot = -1;
@@ -417,7 +417,7 @@ ShaderNode *EnvironmentTextureNode::clone() const
 void EnvironmentTextureNode::attributes(Shader *shader, AttributeRequestSet *attributes)
 {
 #ifdef WITH_PTEX
-	if (shader->has_surface && string_endswith(filename, ".ptx")) {
+	if(shader->has_surface && string_endswith(filename, ".ptx")) {
 		/* ptex */
 		attributes->add(ATTR_STD_PTEX_FACE_ID);
 		attributes->add(ATTR_STD_PTEX_UV);
@@ -632,7 +632,7 @@ static void sky_texture_precompute_new(SunSky *sunsky, float3 dir, float turbidi
 	sky_state = arhosek_xyz_skymodelstate_alloc_init(turbidity, ground_albedo, solarElevation);
 
 	/* Copy values from sky_state to SunSky */
-	for (int i = 0; i < 9; ++i) {
+	for(int i = 0; i < 9; ++i) {
 		sunsky->config_x[i] = (float)sky_state->configs[0][i];
 		sunsky->config_y[i] = (float)sky_state->configs[1][i];
 		sunsky->config_z[i] = (float)sky_state->configs[2][i];
@@ -1507,11 +1507,11 @@ ProxyNode::ProxyNode(ShaderSocketType type_)
 	add_output("Output", type);
 }
 
-void ProxyNode::compile(SVMCompiler& compiler)
+void ProxyNode::compile(SVMCompiler& /*compiler*/)
 {
 }
 
-void ProxyNode::compile(OSLCompiler& compiler)
+void ProxyNode::compile(OSLCompiler& /*compiler*/)
 {
 }
 
@@ -1520,6 +1520,8 @@ void ProxyNode::compile(OSLCompiler& compiler)
 BsdfNode::BsdfNode(bool scattering_)
 : ShaderNode("bsdf"), scattering(scattering_)
 {
+	special_type = SHADER_SPECIAL_TYPE_CLOSURE;
+
 	add_input("Color", SHADER_SOCKET_COLOR, make_float3(0.8f, 0.8f, 0.8f));
 	add_input("Normal", SHADER_SOCKET_NORMAL, ShaderInput::NORMAL);
 	add_input("SurfaceMixWeight", SHADER_SOCKET_FLOAT, 0.0f, ShaderInput::USE_SVM);
@@ -1587,7 +1589,7 @@ void BsdfNode::compile(SVMCompiler& compiler)
 	compile(compiler, NULL, NULL);
 }
 
-void BsdfNode::compile(OSLCompiler& compiler)
+void BsdfNode::compile(OSLCompiler& /*compiler*/)
 {
 	assert(0);
 }
@@ -1609,6 +1611,7 @@ ShaderEnum AnisotropicBsdfNode::distribution_enum = aniso_distribution_init();
 
 AnisotropicBsdfNode::AnisotropicBsdfNode()
 {
+	closure = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID;
 	distribution = ustring("GGX");
 
 	add_input("Tangent", SHADER_SOCKET_VECTOR, ShaderInput::TANGENT);
@@ -1661,6 +1664,7 @@ ShaderEnum GlossyBsdfNode::distribution_enum = glossy_distribution_init();
 
 GlossyBsdfNode::GlossyBsdfNode()
 {
+	closure = CLOSURE_BSDF_MICROFACET_GGX_ID;
 	distribution = ustring("GGX");
 
 	add_input("Roughness", SHADER_SOCKET_FLOAT, 0.2f);
@@ -1699,6 +1703,7 @@ ShaderEnum GlassBsdfNode::distribution_enum = glass_distribution_init();
 
 GlassBsdfNode::GlassBsdfNode()
 {
+	closure = CLOSURE_BSDF_SHARP_GLASS_ID;
 	distribution = ustring("Sharp");
 
 	add_input("Roughness", SHADER_SOCKET_FLOAT, 0.0f);
@@ -1738,6 +1743,7 @@ ShaderEnum RefractionBsdfNode::distribution_enum = refraction_distribution_init(
 
 RefractionBsdfNode::RefractionBsdfNode()
 {
+	closure = CLOSURE_BSDF_REFRACTION_ID;
 	distribution = ustring("Sharp");
 
 	add_input("Roughness", SHADER_SOCKET_FLOAT, 0.0f);
@@ -1776,6 +1782,7 @@ ShaderEnum ToonBsdfNode::component_enum = toon_component_init();
 
 ToonBsdfNode::ToonBsdfNode()
 {
+	closure = CLOSURE_BSDF_DIFFUSE_TOON_ID;
 	component = ustring("Diffuse");
 
 	add_input("Size", SHADER_SOCKET_FLOAT, 0.5f);
@@ -1916,6 +1923,8 @@ bool SubsurfaceScatteringNode::has_bssrdf_bump()
 EmissionNode::EmissionNode()
 : ShaderNode("emission")
 {
+	special_type = SHADER_SPECIAL_TYPE_EMISSION;
+
 	add_input("Color", SHADER_SOCKET_COLOR, make_float3(0.8f, 0.8f, 0.8f));
 	add_input("Strength", SHADER_SOCKET_FLOAT, 10.0f);
 	add_input("SurfaceMixWeight", SHADER_SOCKET_FLOAT, 0.0f, ShaderInput::USE_SVM);
@@ -2078,7 +2087,7 @@ void VolumeNode::compile(SVMCompiler& compiler)
 	compile(compiler, NULL, NULL);
 }
 
-void VolumeNode::compile(OSLCompiler& compiler)
+void VolumeNode::compile(OSLCompiler& /*compiler*/)
 {
 	assert(0);
 }
@@ -2135,6 +2144,7 @@ ShaderEnum HairBsdfNode::component_enum = hair_component_init();
 
 HairBsdfNode::HairBsdfNode()
 {
+	closure = CLOSURE_BSDF_HAIR_REFLECTION_ID;
 	component = ustring("Reflection");
 
 	add_input("Offset", SHADER_SOCKET_FLOAT);
@@ -2248,10 +2258,15 @@ void GeometryNode::compile(SVMCompiler& compiler)
 	out = output("Pointiness");
 	if(!out->links.empty()) {
 		compiler.stack_assign(out);
-		compiler.add_node(attr_node,
-		                  ATTR_STD_POINTINESS,
-		                  out->stack_offset,
-		                  NODE_ATTR_FLOAT);
+		if(compiler.output_type() != SHADER_TYPE_VOLUME) {
+			compiler.add_node(attr_node,
+			                  ATTR_STD_POINTINESS,
+			                  out->stack_offset,
+			                  NODE_ATTR_FLOAT);
+		}
+		else {
+			compiler.add_node(NODE_VALUE_F, __float_as_int(0.0f), out->stack_offset);
+		}
 	}
 }
 
@@ -2442,7 +2457,7 @@ void UVMapNode::attributes(Shader *shader, AttributeRequestSet *attributes)
 	if(shader->has_surface) {
 		if(!from_dupli) {
 			if(!output("UV")->links.empty()) {
-				if (attribute != "")
+				if(attribute != "")
 					attributes->add(attribute);
 				else
 					attributes->add(ATTR_STD_UV);
@@ -2475,7 +2490,7 @@ void UVMapNode::compile(SVMCompiler& compiler)
 			compiler.add_node(texco_node, NODE_TEXCO_DUPLI_UV, out->stack_offset);
 		}
 		else {
-			if (attribute != "")
+			if(attribute != "")
 				attr = compiler.attribute(attribute);
 			else
 				attr = compiler.attribute(ATTR_STD_UV);
@@ -2916,7 +2931,7 @@ AddClosureNode::AddClosureNode()
 	add_output("Closure",  SHADER_SOCKET_CLOSURE);
 }
 
-void AddClosureNode::compile(SVMCompiler& compiler)
+void AddClosureNode::compile(SVMCompiler& /*compiler*/)
 {
 	/* handled in the SVM compiler */
 }
@@ -2939,7 +2954,7 @@ MixClosureNode::MixClosureNode()
 	add_output("Closure",  SHADER_SOCKET_CLOSURE);
 }
 
-void MixClosureNode::compile(SVMCompiler& compiler)
+void MixClosureNode::compile(SVMCompiler& /*compiler*/)
 {
 	/* handled in the SVM compiler */
 }
@@ -2977,7 +2992,7 @@ void MixClosureWeightNode::compile(SVMCompiler& compiler)
 			weight1_out->stack_offset, weight2_out->stack_offset));
 }
 
-void MixClosureWeightNode::compile(OSLCompiler& compiler)
+void MixClosureWeightNode::compile(OSLCompiler& /*compiler*/)
 {
 	assert(0);
 }
@@ -3647,9 +3662,17 @@ void BlackbodyNode::compile(SVMCompiler& compiler)
 	ShaderInput *temperature_in = input("Temperature");
 	ShaderOutput *color_out = output("Color");
 
-	compiler.stack_assign(temperature_in);
 	compiler.stack_assign(color_out);
-	compiler.add_node(NODE_BLACKBODY, temperature_in->stack_offset, color_out->stack_offset);
+
+	if(temperature_in->link == NULL) {
+		float3 color = svm_math_blackbody_color(temperature_in->value.x);
+		compiler.add_node(NODE_VALUE_V, color_out->stack_offset);
+		compiler.add_node(NODE_VALUE_V, color);
+	}
+	else {
+		compiler.stack_assign(temperature_in);
+		compiler.add_node(NODE_BLACKBODY, temperature_in->stack_offset, color_out->stack_offset);
+	}
 }
 
 void BlackbodyNode::compile(OSLCompiler& compiler)
@@ -3747,7 +3770,7 @@ void MathNode::compile(SVMCompiler& compiler)
 		                                 value1_in->value.x,
 		                                 value2_in->value.x);
 		if(use_clamp) {
-			optimized_value = clamp(optimized_value, 0.0f, 1.0f);
+			optimized_value = saturate(optimized_value);
 		}
 		compiler.add_node(NODE_VALUE_F,
 		                  __float_as_int(optimized_value),
@@ -3911,6 +3934,8 @@ BumpNode::BumpNode()
 {
 	invert = false;
 
+	special_type = SHADER_SPECIAL_TYPE_BUMP;
+
 	/* this input is used by the user, but after graph transform it is no longer
 	 * used and moved to sampler center/x/y instead */
 	add_input("Height", SHADER_SOCKET_FLOAT);
@@ -3987,7 +4012,7 @@ void RGBCurvesNode::compile(OSLCompiler& compiler)
 {
 	float ramp[RAMP_TABLE_SIZE][3];
 
-	for (int i = 0; i < RAMP_TABLE_SIZE; ++i) {
+	for(int i = 0; i < RAMP_TABLE_SIZE; ++i) {
 		ramp[i][0] = curves[i].x;
 		ramp[i][1] = curves[i].y;
 		ramp[i][2] = curves[i].z;
@@ -4025,7 +4050,7 @@ void VectorCurvesNode::compile(OSLCompiler& compiler)
 {
 	float ramp[RAMP_TABLE_SIZE][3];
 
-	for (int i = 0; i < RAMP_TABLE_SIZE; ++i) {
+	for(int i = 0; i < RAMP_TABLE_SIZE; ++i) {
 		ramp[i][0] = curves[i].x;
 		ramp[i][1] = curves[i].y;
 		ramp[i][2] = curves[i].z;
@@ -4075,7 +4100,7 @@ void RGBRampNode::compile(OSLCompiler& compiler)
 	float ramp_color[RAMP_TABLE_SIZE][3];
 	float ramp_alpha[RAMP_TABLE_SIZE];
 
-	for (int i = 0; i < RAMP_TABLE_SIZE; ++i) {
+	for(int i = 0; i < RAMP_TABLE_SIZE; ++i) {
 		ramp_color[i][0] = ramp[i].x;
 		ramp_color[i][1] = ramp[i].y;
 		ramp_color[i][2] = ramp[i].z;
@@ -4122,7 +4147,7 @@ OSLScriptNode::OSLScriptNode()
 	special_type = SHADER_SPECIAL_TYPE_SCRIPT;
 }
 
-void OSLScriptNode::compile(SVMCompiler& compiler)
+void OSLScriptNode::compile(SVMCompiler& /*compiler*/)
 {
 	/* doesn't work for SVM, obviously ... */
 }
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index 0ec0fce512f..7ec20f0879b 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -55,13 +55,28 @@ public:
 
 /* Nodes */
 
+/* Any node which uses image manager's slot should be a subclass of this one. */
+class ImageSlotNode : public ShaderNode {
+public:
+	ImageSlotNode(const char *name_) : ShaderNode(name_) {
+		special_type = SHADER_SPECIAL_TYPE_IMAGE_SLOT;
+	}
+	int slot;
+};
+
 class TextureNode : public ShaderNode {
 public:
 	TextureNode(const char *name_) : ShaderNode(name_) {}
 	TextureMapping tex_mapping;
 };
 
-class ImageTextureNode : public TextureNode {
+class ImageSlotTextureNode : public ImageSlotNode {
+public:
+	ImageSlotTextureNode(const char *name_) : ImageSlotNode(name_) {}
+	TextureMapping tex_mapping;
+};
+
+class ImageTextureNode : public ImageSlotTextureNode {
 public:
 	SHADER_NODE_NO_CLONE_CLASS(ImageTextureNode)
 	~ImageTextureNode();
@@ -69,7 +84,6 @@ public:
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
 
 	ImageManager *image_manager;
-	int slot;
 	int is_float;
 	bool is_linear;
 	bool use_alpha;
@@ -85,15 +99,15 @@ public:
 	static ShaderEnum projection_enum;
 };
 
-class EnvironmentTextureNode : public TextureNode {
+class EnvironmentTextureNode : public ImageSlotTextureNode {
 public:
 	SHADER_NODE_NO_CLONE_CLASS(EnvironmentTextureNode)
 	~EnvironmentTextureNode();
 	ShaderNode *clone() const;
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
 
 	ImageManager *image_manager;
-	int slot;
 	int is_float;
 	bool is_linear;
 	bool use_alpha;
@@ -111,6 +125,8 @@ class SkyTextureNode : public TextureNode {
 public:
 	SHADER_NODE_CLASS(SkyTextureNode)
 
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
+
 	float3 sun_direction;
 	float turbidity;
 	float ground_albedo;
@@ -128,6 +144,8 @@ class GradientTextureNode : public TextureNode {
 public:
 	SHADER_NODE_CLASS(GradientTextureNode)
 
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
+
 	ustring type;
 	static ShaderEnum type_enum;
 };
@@ -141,6 +159,8 @@ class VoronoiTextureNode : public TextureNode {
 public:
 	SHADER_NODE_CLASS(VoronoiTextureNode)
 
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
+
 	ustring coloring;
 
 	static ShaderEnum coloring_enum;
@@ -150,6 +170,8 @@ class MusgraveTextureNode : public TextureNode {
 public:
 	SHADER_NODE_CLASS(MusgraveTextureNode)
 
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
+
 	ustring type;
 
 	static ShaderEnum type_enum;
@@ -159,6 +181,8 @@ class WaveTextureNode : public TextureNode {
 public:
 	SHADER_NODE_CLASS(WaveTextureNode)
 
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
+
 	ustring type;
 	static ShaderEnum type_enum;
 };
@@ -167,12 +191,16 @@ class MagicTextureNode : public TextureNode {
 public:
 	SHADER_NODE_CLASS(MagicTextureNode)
 
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
+
 	int depth;
 };
 
 class CheckerTextureNode : public TextureNode {
 public:
 	SHADER_NODE_CLASS(CheckerTextureNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
 };
 
 class BrickTextureNode : public TextureNode {
@@ -181,11 +209,14 @@ public:
 	
 	float offset, squash;
 	int offset_frequency, squash_frequency;
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
 };
 
 class MappingNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(MappingNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
 
 	TextureMapping tex_mapping;
 };
@@ -308,6 +339,7 @@ public:
 class HoldoutNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(HoldoutNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class AmbientOcclusionNode : public ShaderNode {
@@ -315,6 +347,7 @@ public:
 	SHADER_NODE_CLASS(AmbientOcclusionNode)
 
 	bool has_spatial_varying() { return true; }
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class VolumeNode : public ShaderNode {
@@ -322,6 +355,7 @@ public:
 	SHADER_NODE_CLASS(VolumeNode)
 
 	void compile(SVMCompiler& compiler, ShaderInput *param1, ShaderInput *param2);
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 
 	ClosureType closure;
 };
@@ -369,6 +403,7 @@ public:
 	SHADER_NODE_CLASS(UVMapNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
 	bool has_spatial_varying() { return true; }
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 
 	ustring attribute;
 	bool from_dupli;
@@ -377,6 +412,7 @@ public:
 class LightPathNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(LightPathNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class LightFalloffNode : public ShaderNode {
@@ -388,12 +424,14 @@ public:
 class ObjectInfoNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(ObjectInfoNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class ParticleInfoNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(ParticleInfoNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class HairInfoNode : public ShaderNode {
@@ -402,6 +440,10 @@ public:
 
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
 	bool has_spatial_varying() { return true; }
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
+	virtual int get_feature() {
+		return ShaderNode::get_feature() | NODE_FEATURE_HAIR;
+	}
 };
 
 class ValueNode : public ShaderNode {
@@ -436,12 +478,16 @@ public:
 class InvertNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(InvertNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 };
 
 class MixNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(MixNode)
 
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
+
 	bool use_clamp;
 
 	ustring type;
@@ -451,41 +497,55 @@ public:
 class CombineRGBNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(CombineRGBNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 };
 
 class CombineHSVNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(CombineHSVNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 };
 
 class CombineXYZNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(CombineXYZNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 };
 
 class GammaNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(GammaNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class BrightContrastNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(BrightContrastNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class SeparateRGBNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(SeparateRGBNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 };
 
 class SeparateHSVNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(SeparateHSVNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 };
 
 class SeparateXYZNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(SeparateXYZNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 };
 
 class HSVNode : public ShaderNode {
@@ -512,18 +572,21 @@ class FresnelNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(FresnelNode)
 	bool has_spatial_varying() { return true; }
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class LayerWeightNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(LayerWeightNode)
 	bool has_spatial_varying() { return true; }
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class WireframeNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(WireframeNode)
 	bool has_spatial_varying() { return true; }
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 	
 	bool use_pixel_size;
 };
@@ -531,18 +594,21 @@ public:
 class WavelengthNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(WavelengthNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 };
 
 class BlackbodyNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(BlackbodyNode)
-	
-	bool has_converter_blackbody() { return true; }
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 };
 
 class MathNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(MathNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 
 	bool use_clamp;
 
@@ -553,6 +619,7 @@ public:
 class NormalNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(NormalNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
 
 	float3 direction;
 };
@@ -560,6 +627,7 @@ public:
 class VectorMathNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(VectorMathNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 
 	ustring type;
 	static ShaderEnum type_enum;
@@ -569,6 +637,8 @@ class VectorTransformNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(VectorTransformNode)
 
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
+
 	ustring type;
 	ustring convert_from;
 	ustring convert_to;
@@ -581,6 +651,9 @@ class BumpNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(BumpNode)
 	bool has_spatial_varying() { return true; }
+	virtual int get_feature() {
+		return NODE_FEATURE_BUMP;
+	}
 
 	bool invert;
 };
@@ -588,12 +661,18 @@ public:
 class RGBCurvesNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(RGBCurvesNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
+
 	float4 curves[RAMP_TABLE_SIZE];
 };
 
 class VectorCurvesNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(VectorCurvesNode)
+
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
+
 	float4 curves[RAMP_TABLE_SIZE];
 };
 
@@ -602,6 +681,7 @@ public:
 	SHADER_NODE_CLASS(RGBRampNode)
 	float4 ramp[RAMP_TABLE_SIZE];
 	bool interpolate;
+	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 };
 
 class SetNormalNode : public ShaderNode {
@@ -630,6 +710,7 @@ public:
 	SHADER_NODE_CLASS(NormalMapNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
 	bool has_spatial_varying() { return true; }
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 
 	ustring space;
 	static ShaderEnum space_enum;
@@ -642,6 +723,7 @@ public:
 	SHADER_NODE_CLASS(TangentNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
 	bool has_spatial_varying() { return true; }
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 
 	ustring direction_type;
 	static ShaderEnum direction_type_enum;
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index 1225125b57e..4a57ac4dff1 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "camera.h"
 #include "device.h"
 #include "light.h"
 #include "mesh.h"
@@ -23,6 +24,7 @@
 #include "scene.h"
 
 #include "util_foreach.h"
+#include "util_logging.h"
 #include "util_map.h"
 #include "util_progress.h"
 #include "util_vector.h"
@@ -104,11 +106,11 @@ void Object::apply_transform(bool apply_to_motion)
 		if(apply_to_motion) {
 			Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
 
-			if (attr) {
+			if(attr) {
 				size_t steps_size = mesh->verts.size() * (mesh->motion_steps - 1);
 				float3 *vert_steps = attr->data_float3();
 
-				for (size_t i = 0; i < steps_size; i++)
+				for(size_t i = 0; i < steps_size; i++)
 					vert_steps[i] = transform_point(&tfm, vert_steps[i]);
 			}
 
@@ -119,7 +121,7 @@ void Object::apply_transform(bool apply_to_motion)
 				size_t steps_size = mesh->verts.size() * (mesh->motion_steps - 1);
 				float3 *normal_steps = attr_N->data_float3();
 
-				for (size_t i = 0; i < steps_size; i++)
+				for(size_t i = 0; i < steps_size; i++)
 					normal_steps[i] = normalize(transform_direction(&ntfm, normal_steps[i]));
 			}
 		}
@@ -146,12 +148,12 @@ void Object::apply_transform(bool apply_to_motion)
 		if(apply_to_motion) {
 			Attribute *curve_attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
 
-			if (curve_attr) {
+			if(curve_attr) {
 				/* apply transform to motion curve keys */
 				size_t steps_size = mesh->curve_keys.size() * (mesh->motion_steps - 1);
 				float4 *key_steps = curve_attr->data_float4();
 
-				for (size_t i = 0; i < steps_size; i++) {
+				for(size_t i = 0; i < steps_size; i++) {
 					float3 co = transform_point(&tfm, float4_to_float3(key_steps[i]));
 					float radius = key_steps[i].w * scalar;
 
@@ -191,6 +193,7 @@ void Object::tag_update(Scene *scene)
 		}
 	}
 
+	scene->camera->need_flags_update = true;
 	scene->curve_system_manager->need_update = true;
 	scene->mesh_manager->need_update = true;
 	scene->object_manager->need_update = true;
@@ -377,6 +380,8 @@ void ObjectManager::device_update_transforms(Device *device, DeviceScene *dscene
 
 void ObjectManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
+	VLOG(1) << "Total " << scene->objects.size() << " objects.";
+
 	if(!need_update)
 		return;
 	
@@ -402,8 +407,11 @@ void ObjectManager::device_update(Device *device, DeviceScene *dscene, Scene *sc
 	}
 }
 
-void ObjectManager::device_update_flags(Device *device, DeviceScene *dscene,
-                                        Scene *scene, Progress& progress)
+void ObjectManager::device_update_flags(Device *device,
+                                        DeviceScene *dscene,
+                                        Scene *scene,
+                                        Progress& /*progress*/,
+                                        bool bounds_valid)
 {
 	if(!need_update && !need_flags_update)
 		return;
@@ -418,9 +426,13 @@ void ObjectManager::device_update_flags(Device *device, DeviceScene *dscene,
 	uint *object_flag = dscene->object_flag.get_data();
 
 	vector<Object *> volume_objects;
+	bool has_volume_objects = false;
 	foreach(Object *object, scene->objects) {
 		if(object->mesh->has_volume) {
-			volume_objects.push_back(object);
+			if(bounds_valid) {
+				volume_objects.push_back(object);
+			}
+			has_volume_objects = true;
 		}
 	}
 
@@ -433,15 +445,23 @@ void ObjectManager::device_update_flags(Device *device, DeviceScene *dscene,
 			object_flag[object_index] &= ~SD_OBJECT_HAS_VOLUME;
 		}
 
-		foreach(Object *volume_object, volume_objects) {
-			if(object == volume_object) {
-				continue;
-			}
-			if(object->bounds.intersects(volume_object->bounds)) {
-				object_flag[object_index] |= SD_OBJECT_INTERSECTS_VOLUME;
-				break;
+		if(bounds_valid) {
+			foreach(Object *volume_object, volume_objects) {
+				if(object == volume_object) {
+					continue;
+				}
+				if(object->bounds.intersects(volume_object->bounds)) {
+					object_flag[object_index] |= SD_OBJECT_INTERSECTS_VOLUME;
+					break;
+				}
 			}
 		}
+		else if(has_volume_objects) {
+			/* Not really valid, but can't make more reliable in the case
+			 * of bounds not being up to date.
+			 */
+			object_flag[object_index] |= SD_OBJECT_INTERSECTS_VOLUME;
+		}
 		++object_index;
 	}
 
@@ -474,6 +494,7 @@ void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, u
 	bool apply_to_motion = need_motion != Scene::MOTION_PASS;
 #else
 	bool motion_blur = false;
+	bool apply_to_motion = false;
 #endif
 	int i = 0;
 	bool have_instancing = false;
diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h
index acc08a0e204..379d1748cdd 100644
--- a/intern/cycles/render/object.h
+++ b/intern/cycles/render/object.h
@@ -77,7 +77,11 @@ public:
 
 	void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
 	void device_update_transforms(Device *device, DeviceScene *dscene, Scene *scene, uint *object_flag, Progress& progress);
-	void device_update_flags(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
+	void device_update_flags(Device *device,
+	                         DeviceScene *dscene,
+	                         Scene *scene,
+	                         Progress& progress,
+	                         bool bounds_valid = true);
 	void device_free(Device *device, DeviceScene *dscene);
 
 	void tag_update(Scene *scene);
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index 9d6f412d9ce..a02f91ad2cf 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -67,7 +67,7 @@ OSLShaderManager::~OSLShaderManager()
 	texture_system_free();
 }
 
-void OSLShaderManager::reset(Scene *scene)
+void OSLShaderManager::reset(Scene * /*scene*/)
 {
 	shading_system_free();
 	shading_system_init();
@@ -75,6 +75,8 @@ void OSLShaderManager::reset(Scene *scene)
 
 void OSLShaderManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
+	VLOG(1) << "Total " << scene->shaders.size() << " shaders.";
+
 	if(!need_update)
 		return;
 
@@ -211,9 +213,9 @@ void OSLShaderManager::shading_system_init()
 
 			"__unused__",
 			"__unused__",
-			"diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */
-			"glossy_ancestor",  /* PATH_RAY_GLOSSY_ANCESTOR */
-			"bssrdf_ancestor",  /* PATH_RAY_BSSRDF_ANCESTOR */
+			"diffuse_ancestor",	/* PATH_RAY_DIFFUSE_ANCESTOR */
+			"__unused__",
+			"__unused__",
 			"__unused__",		/* PATH_RAY_SINGLE_PASS_DONE */
 			"volume_scatter",	/* PATH_RAY_VOLUME_SCATTER */
 		};
@@ -414,7 +416,7 @@ string OSLCompiler::compatible_name(ShaderNode *node, ShaderInput *input)
 	
 	/* if output exists with the same name, add "In" suffix */
 	foreach(ShaderOutput *output, node->outputs) {
-		if (strcmp(input->name, output->name)==0) {
+		if(strcmp(input->name, output->name)==0) {
 			sname += "In";
 			break;
 		}
@@ -434,7 +436,7 @@ string OSLCompiler::compatible_name(ShaderNode *node, ShaderOutput *output)
 	
 	/* if input exists with the same name, add "Out" suffix */
 	foreach(ShaderInput *input, node->inputs) {
-		if (strcmp(input->name, output->name)==0) {
+		if(strcmp(input->name, output->name)==0) {
 			sname += "Out";
 			break;
 		}
@@ -860,75 +862,75 @@ void OSLCompiler::compile(OSLGlobals *og, Shader *shader)
 
 #else
 
-void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath)
+void OSLCompiler::add(ShaderNode * /*node*/, const char * /*name*/, bool /*isfilepath*/)
 {
 }
 
-void OSLCompiler::parameter(const char *name, float f)
+void OSLCompiler::parameter(const char * /*name*/, float /*f*/)
 {
 }
 
-void OSLCompiler::parameter_color(const char *name, float3 f)
+void OSLCompiler::parameter_color(const char * /*name*/, float3 /*f*/)
 {
 }
 
-void OSLCompiler::parameter_vector(const char *name, float3 f)
+void OSLCompiler::parameter_vector(const char * /*name*/, float3 /*f*/)
 {
 }
 
-void OSLCompiler::parameter_point(const char *name, float3 f)
+void OSLCompiler::parameter_point(const char * /*name*/, float3 /*f*/)
 {
 }
 
-void OSLCompiler::parameter_normal(const char *name, float3 f)
+void OSLCompiler::parameter_normal(const char * /*name*/, float3 /*f*/)
 {
 }
 
-void OSLCompiler::parameter(const char *name, int f)
+void OSLCompiler::parameter(const char * /*name*/, int /*f*/)
 {
 }
 
-void OSLCompiler::parameter(const char *name, const char *s)
+void OSLCompiler::parameter(const char * /*name*/, const char * /*s*/)
 {
 }
 
-void OSLCompiler::parameter(const char *name, ustring s)
+void OSLCompiler::parameter(const char * /*name*/, ustring /*s*/)
 {
 }
 
-void OSLCompiler::parameter(const char *name, const Transform& tfm)
+void OSLCompiler::parameter(const char * /*name*/, const Transform& /*tfm*/)
 {
 }
 
-void OSLCompiler::parameter_array(const char *name, const float f[], int arraylen)
+void OSLCompiler::parameter_array(const char * /*name*/, const float /*f*/[], int /*arraylen*/)
 {
 }
 
-void OSLCompiler::parameter_color_array(const char *name, const float f[][3], int arraylen)
+void OSLCompiler::parameter_color_array(const char * /*name*/, const float /*f*/[][3], int /*arraylen*/)
 {
 }
 
-void OSLCompiler::parameter_vector_array(const char *name, const float f[][3], int arraylen)
+void OSLCompiler::parameter_vector_array(const char * /*name*/, const float /*f*/[][3], int /*arraylen*/)
 {
 }
 
-void OSLCompiler::parameter_normal_array(const char *name, const float f[][3], int arraylen)
+void OSLCompiler::parameter_normal_array(const char * /*name*/, const float /*f*/[][3], int /*arraylen*/)
 {
 }
 
-void OSLCompiler::parameter_point_array(const char *name, const float f[][3], int arraylen)
+void OSLCompiler::parameter_point_array(const char * /*name*/, const float /*f*/[][3], int /*arraylen*/)
 {
 }
 
-void OSLCompiler::parameter_array(const char *name, const int f[], int arraylen)
+void OSLCompiler::parameter_array(const char * /*name*/, const int /*f*/[], int /*arraylen*/)
 {
 }
 
-void OSLCompiler::parameter_array(const char *name, const char * const s[], int arraylen)
+void OSLCompiler::parameter_array(const char * /*name*/, const char * const /*s*/[], int /*arraylen*/)
 {
 }
 
-void OSLCompiler::parameter_array(const char *name, const Transform tfm[], int arraylen)
+void OSLCompiler::parameter_array(const char * /*name*/, const Transform /*tfm*/[], int /*arraylen*/)
 {
 }
 
diff --git a/intern/cycles/render/particles.cpp b/intern/cycles/render/particles.cpp
index f2f154cdab4..8f9e8c6d639 100644
--- a/intern/cycles/render/particles.cpp
+++ b/intern/cycles/render/particles.cpp
@@ -19,6 +19,7 @@
 #include "scene.h"
 
 #include "util_foreach.h"
+#include "util_logging.h"
 #include "util_map.h"
 #include "util_progress.h"
 #include "util_vector.h"
@@ -92,6 +93,9 @@ void ParticleSystemManager::device_update_particles(Device *device, DeviceScene
 
 void ParticleSystemManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
+	VLOG(1) << "Total " << scene->particle_systems.size()
+	        << " particle systems.";
+
 	if(!need_update)
 		return;
 	
@@ -111,7 +115,7 @@ void ParticleSystemManager::device_free(Device *device, DeviceScene *dscene)
 	dscene->particles.clear();
 }
 
-void ParticleSystemManager::tag_update(Scene *scene)
+void ParticleSystemManager::tag_update(Scene * /*scene*/)
 {
 	need_update = true;
 }
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index 524574f096d..19d715d834b 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -160,11 +160,6 @@ void Scene::device_update(Device *device_, Progress& progress)
 
 	if(progress.get_cancel() || device->have_error()) return;
 
-	progress.set_status("Updating Images");
-	image_manager->device_update(device, &dscene, progress);
-
-	if(progress.get_cancel() || device->have_error()) return;
-
 	progress.set_status("Updating Background");
 	background->device_update(device, &dscene, this);
 
@@ -185,13 +180,18 @@ void Scene::device_update(Device *device_, Progress& progress)
 
 	if(progress.get_cancel() || device->have_error()) return;
 
+	progress.set_status("Updating Meshes");
+	mesh_manager->device_update(device, &dscene, this, progress);
+
+	if(progress.get_cancel() || device->have_error()) return;
+
 	progress.set_status("Updating Objects Flags");
 	object_manager->device_update_flags(device, &dscene, this, progress);
 
 	if(progress.get_cancel() || device->have_error()) return;
 
-	progress.set_status("Updating Meshes");
-	mesh_manager->device_update(device, &dscene, this, progress);
+	progress.set_status("Updating Images");
+	image_manager->device_update(device, &dscene, progress);
 
 	if(progress.get_cancel() || device->have_error()) return;
 
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index 53c3a95903c..851e5ac0b72 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -62,6 +62,7 @@ class DeviceScene {
 public:
 	/* BVH */
 	device_vector<float4> bvh_nodes;
+	device_vector<float4> bvh_leaf_nodes;
 	device_vector<uint> object_node;
 	device_vector<float4> tri_woop;
 	device_vector<uint> prim_type;
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 99826aa4349..57c1500628a 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -20,7 +20,10 @@
 #include "buffers.h"
 #include "camera.h"
 #include "device.h"
+#include "graph.h"
 #include "integrator.h"
+#include "mesh.h"
+#include "object.h"
 #include "scene.h"
 #include "session.h"
 #include "bake.h"
@@ -77,6 +80,9 @@ Session::Session(const SessionParams& params_)
 	gpu_need_tonemap = false;
 	pause = false;
 	kernels_loaded = false;
+
+	/* TODO(sergey): Check if it's indeed optimal value for the split kernel. */
+	max_closure_global = 1;
 }
 
 Session::~Session()
@@ -405,6 +411,11 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile)
 		if(tile_buffers.size() == 0)
 			tile_buffers.resize(tile_manager.state.num_tiles, NULL);
 
+		/* In certain circumstances number of tiles in the tile manager could
+		 * be changed. This is not supported by the progressive refine feature.
+		 */
+		assert(tile_buffers.size() == tile_manager.state.num_tiles);
+
 		tilebuffers = tile_buffers[tile.index];
 		if(tilebuffers == NULL) {
 			tilebuffers = new RenderBuffers(tile_device);
@@ -593,6 +604,42 @@ void Session::run_cpu()
 		update_progressive_refine(true);
 }
 
+DeviceRequestedFeatures Session::get_requested_device_features()
+{
+	/* TODO(sergey): Consider moving this to the Scene level. */
+	DeviceRequestedFeatures requested_features;
+	requested_features.experimental = params.experimental;
+	if(!params.background) {
+		requested_features.max_closure = 64;
+		requested_features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
+		requested_features.nodes_features = NODE_FEATURE_ALL;
+	}
+	else {
+		requested_features.max_closure = get_max_closure_count();
+		scene->shader_manager->get_requested_features(
+		        scene,
+		        requested_features.max_nodes_group,
+		        requested_features.nodes_features);
+	}
+
+	/* This features are not being tweaked as often as shaders,
+	 * so could be done selective magic for the viewport as well.
+	 */
+	requested_features.use_hair = false;
+	requested_features.use_object_motion = false;
+	requested_features.use_camera_motion = scene->camera->use_motion;
+	foreach(Object *object, scene->objects) {
+		Mesh *mesh = object->mesh;
+		if(mesh->curves.size() > 0) {
+			requested_features.use_hair = true;
+		}
+		requested_features.use_object_motion |= object->use_motion | mesh->use_motion_blur;
+		requested_features.use_camera_motion |= mesh->use_motion_blur;
+	}
+
+	return requested_features;
+}
+
 void Session::load_kernels()
 {
 	thread_scoped_lock scene_lock(scene->mutex);
@@ -600,7 +647,7 @@ void Session::load_kernels()
 	if(!kernels_loaded) {
 		progress.set_status("Loading render kernels (may take a few minutes the first time)");
 
-		if(!device->load_kernels(params.experimental)) {
+		if(!device->load_kernels(get_requested_device_features())) {
 			string message = device->error_message();
 			if(message.empty())
 				message = "Failed loading render kernel, see console for errors";
@@ -778,13 +825,21 @@ void Session::update_status_time(bool show_pause, bool show_done)
 	string status, substatus;
 
 	if(!params.progressive) {
-		bool is_gpu = params.device.type == DEVICE_CUDA || params.device.type == DEVICE_OPENCL;
-		bool is_multidevice = params.device.multi_devices.size() > 1;
-		bool is_cpu = params.device.type == DEVICE_CPU;
+		const int progress_sample = progress.get_sample(), num_samples = tile_manager.num_samples;
+		const bool is_gpu = params.device.type == DEVICE_CUDA || params.device.type == DEVICE_OPENCL;
+		const bool is_multidevice = params.device.multi_devices.size() > 1;
+		const bool is_cpu = params.device.type == DEVICE_CPU;
+		const bool is_last_tile = (num_samples * num_tiles - progress_sample) < num_samples;
 
 		substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles);
 
-		if((is_gpu && !is_multidevice) || (is_cpu && num_tiles == 1)) {
+		if((is_gpu && !is_multidevice && !device->info.use_split_kernel) ||
+		   (is_cpu && (num_tiles == 1 || is_last_tile)))
+		{
+			/* When using split-kernel (OpenCL) each thread in a tile will be working on a different
+			 * sample. Can't display sample number when device uses split-kernel
+			 */
+
 			/* when rendering on GPU multithreading happens within single tile, as in
 			 * tiles are handling sequentially and in this case we could display
 			 * currently rendering sample number
@@ -792,17 +847,21 @@ void Session::update_status_time(bool show_pause, bool show_done)
 			 * also display the info on CPU, when using 1 tile only
 			 */
 
-			int sample = progress.get_sample(), num_samples = tile_manager.num_samples;
-
+			int status_sample = progress_sample;
 			if(tile > 1) {
 				/* sample counter is global for all tiles, subtract samples
 				 * from already finished tiles to get sample counter for
 				 * current tile only
 				 */
-				sample -= (tile - 1) * num_samples;
+				if(is_cpu && is_last_tile && num_tiles > 1) {
+					status_sample = num_samples - (num_samples * num_tiles - progress_sample);
+				}
+				else {
+					status_sample -= (tile - 1) * num_samples;
+				}
 			}
 
-			substatus += string_printf(", Sample %d/%d", sample, num_samples);
+			substatus += string_printf(", Sample %d/%d", status_sample, num_samples);
 		}
 	}
 	else if(tile_manager.num_samples == USHRT_MAX)
@@ -852,6 +911,7 @@ void Session::path_trace()
 	task.update_progress_sample = function_bind(&Session::update_progress_sample, this);
 	task.need_finish_queue = params.progressive_refine;
 	task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH;
+	task.requested_tile_size = params.tile_size;
 
 	device->task_add(task);
 }
@@ -889,9 +949,9 @@ bool Session::update_progressive_refine(bool cancel)
 
 	double current_time = time_dt();
 
-	if (current_time - last_update_time < params.progressive_update_timeout) {
+	if(current_time - last_update_time < params.progressive_update_timeout) {
 		/* if last sample was processed, we need to write buffers anyway  */
-		if (!write)
+		if(!write && sample != 1)
 			return false;
 	}
 
@@ -901,10 +961,14 @@ bool Session::update_progressive_refine(bool cancel)
 			rtile.buffers = buffers;
 			rtile.sample = sample;
 
-			if(write)
-				write_render_tile_cb(rtile);
-			else
-				update_render_tile_cb(rtile);
+			if(write) {
+				if(write_render_tile_cb)
+					write_render_tile_cb(rtile);
+			}
+			else {
+				if(update_render_tile_cb)
+					update_render_tile_cb(rtile);
+			}
 		}
 	}
 
@@ -927,4 +991,15 @@ void Session::device_free()
 	 */
 }
 
+int Session::get_max_closure_count()
+{
+	int max_closures = 0;
+	for(int i = 0; i < scene->shaders.size(); i++) {
+		int num_closures = scene->shaders[i]->graph->get_num_closures();
+		max_closures = max(max_closures, num_closures);
+	}
+	max_closure_global = max(max_closure_global, max_closures);
+	return max_closure_global;
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index c77652d3722..c669bccd34b 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -32,6 +32,7 @@ CCL_NAMESPACE_BEGIN
 class BufferParams;
 class Device;
 class DeviceScene;
+class DeviceRequestedFeatures;
 class DisplayBuffer;
 class Progress;
 class RenderBuffers;
@@ -125,8 +126,8 @@ public:
 	TileManager tile_manager;
 	Stats stats;
 
-	boost::function<void(RenderTile&)> write_render_tile_cb;
-	boost::function<void(RenderTile&)> update_render_tile_cb;
+	function<void(RenderTile&)> write_render_tile_cb;
+	function<void(RenderTile&)> update_render_tile_cb;
 
 	Session(const SessionParams& params);
 	~Session();
@@ -204,6 +205,16 @@ protected:
 	bool update_progressive_refine(bool cancel);
 
 	vector<RenderBuffers *> tile_buffers;
+
+	DeviceRequestedFeatures get_requested_device_features();
+
+	/* ** Split kernel routines ** */
+
+	/* Maximumnumber of closure during session lifetime. */
+	int max_closure_global;
+
+	/* Get maximum number of closures to be used in kernel. */
+	int get_max_closure_count();
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index 5899c562f72..31be2a3d3f4 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -15,7 +15,7 @@
  */
 
 #include "background.h"
-#include "blackbody.h"
+#include "camera.h"
 #include "device.h"
 #include "graph.h"
 #include "light.h"
@@ -32,6 +32,8 @@
 
 CCL_NAMESPACE_BEGIN
 
+vector<float> ShaderManager::beckmann_table;
+
 /* Beckmann sampling precomputed table, see bsdf_microfacet.h */
 
 /* 2D slope distribution (alpha = 1.0) */
@@ -146,7 +148,6 @@ Shader::Shader()
 	has_surface_transparent = false;
 	has_surface_emission = false;
 	has_surface_bssrdf = false;
-	has_converter_blackbody = false;
 	has_volume = false;
 	has_displacement = false;
 	has_bssrdf_bump = false;
@@ -240,7 +241,6 @@ void Shader::tag_used(Scene *scene)
 ShaderManager::ShaderManager()
 {
 	need_update = true;
-	blackbody_table_offset = TABLE_OFFSET_INVALID;
 	beckmann_table_offset = TABLE_OFFSET_INVALID;
 }
 
@@ -252,6 +252,8 @@ ShaderManager *ShaderManager::create(Scene *scene, int shadingsystem)
 {
 	ShaderManager *manager;
 
+	(void)shadingsystem;  /* Ignored when built without OSL. */
+
 #ifdef WITH_OSL
 	if(shadingsystem == SHADINGSYSTEM_OSL)
 		manager = new OSLShaderManager();
@@ -321,7 +323,10 @@ void ShaderManager::device_update_shaders_used(Scene *scene)
 		scene->shaders[light->shader]->used = true;
 }
 
-void ShaderManager::device_update_common(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
+void ShaderManager::device_update_common(Device *device,
+                                         DeviceScene *dscene,
+                                         Scene *scene,
+                                         Progress& /*progress*/)
 {
 	device->tex_free(dscene->shader_flag);
 	dscene->shader_flag.clear();
@@ -332,7 +337,6 @@ void ShaderManager::device_update_common(Device *device, DeviceScene *dscene, Sc
 	uint shader_flag_size = scene->shaders.size()*4;
 	uint *shader_flag = dscene->shader_flag.resize(shader_flag_size);
 	uint i = 0;
-	bool has_converter_blackbody = false;
 	bool has_volumes = false;
 
 	foreach(Shader *shader, scene->shaders) {
@@ -359,8 +363,6 @@ void ShaderManager::device_update_common(Device *device, DeviceScene *dscene, Sc
 			flag |= SD_HETEROGENEOUS_VOLUME;
 		if(shader->has_bssrdf_bump)
 			flag |= SD_HAS_BSSRDF_BUMP;
-		if(shader->has_converter_blackbody)
-			has_converter_blackbody = true;
 		if(shader->volume_sampling_method == VOLUME_SAMPLING_EQUIANGULAR)
 			flag |= SD_VOLUME_EQUIANGULAR;
 		if(shader->volume_sampling_method == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE)
@@ -384,26 +386,16 @@ void ShaderManager::device_update_common(Device *device, DeviceScene *dscene, Sc
 
 	device->tex_alloc("__shader_flag", dscene->shader_flag);
 
-	/* blackbody lookup table */
+	/* lookup tables */
 	KernelTables *ktables = &dscene->data.tables;
-	
-	if(has_converter_blackbody && blackbody_table_offset == TABLE_OFFSET_INVALID) {
-		if(blackbody_table.size() == 0) {
-			blackbody_table = blackbody_table_build();
-		}
-		blackbody_table_offset = scene->lookup_tables->add_table(dscene, blackbody_table);
-		
-		ktables->blackbody_offset = (int)blackbody_table_offset;
-	}
-	else if(!has_converter_blackbody && blackbody_table_offset != TABLE_OFFSET_INVALID) {
-		scene->lookup_tables->remove_table(blackbody_table_offset);
-		blackbody_table_offset = TABLE_OFFSET_INVALID;
-	}
 
 	/* beckmann lookup table */
 	if(beckmann_table_offset == TABLE_OFFSET_INVALID) {
 		if(beckmann_table.size() == 0) {
-			beckmann_table_build(beckmann_table);
+			thread_scoped_lock lock(lookup_table_mutex);
+			if(beckmann_table.size() == 0) {
+				beckmann_table_build(beckmann_table);
+			}
 		}
 		beckmann_table_offset = scene->lookup_tables->add_table(dscene, beckmann_table);
 		ktables->beckmann_offset = (int)beckmann_table_offset;
@@ -416,11 +408,6 @@ void ShaderManager::device_update_common(Device *device, DeviceScene *dscene, Sc
 
 void ShaderManager::device_free_common(Device *device, DeviceScene *dscene, Scene *scene)
 {
-	if(blackbody_table_offset != TABLE_OFFSET_INVALID) {
-		scene->lookup_tables->remove_table(blackbody_table_offset);
-		blackbody_table_offset = TABLE_OFFSET_INVALID;
-	}
-
 	if(beckmann_table_offset != TABLE_OFFSET_INVALID) {
 		scene->lookup_tables->remove_table(beckmann_table_offset);
 		beckmann_table_offset = TABLE_OFFSET_INVALID;
@@ -494,5 +481,45 @@ void ShaderManager::add_default(Scene *scene)
 	}
 }
 
+/* NOTE: Expects max_group and features to be initialized in the callee. */
+void ShaderManager::get_requested_graph_features(ShaderGraph *graph,
+                                                 int& max_group,
+                                                 int& features)
+{
+	foreach(ShaderNode *node, graph->nodes) {
+		max_group = max(max_group, node->get_group());
+		features |= node->get_feature();
+		if(node->special_type == SHADER_SPECIAL_TYPE_CLOSURE) {
+			BsdfNode *bsdf_node = static_cast<BsdfNode*>(node);
+			if(CLOSURE_IS_VOLUME(bsdf_node->closure)) {
+				features |= NODE_FEATURE_VOLUME;
+			}
+		}
+	}
+}
+
+void ShaderManager::get_requested_features(Scene *scene,
+                                           int& max_group,
+                                           int& features)
+{
+	max_group = NODE_GROUP_LEVEL_0;
+	features = 0;
+	for(int i = 0; i < scene->shaders.size(); i++) {
+		Shader *shader = scene->shaders[i];
+		/* Gather requested features from all the nodes from the graph nodes. */
+		get_requested_graph_features(shader->graph, max_group, features);
+		/* Gather requested features from the graph itself. */
+		if(shader->graph_bump) {
+			get_requested_graph_features(shader->graph_bump,
+			                             max_group,
+			                             features);
+		}
+		ShaderNode *output_node = shader->graph->output();
+		if(output_node->input("Displacement")->link != NULL) {
+			features |= NODE_FEATURE_BUMP;
+		}
+	}
+}
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index 1dee47c7731..64d45635ef1 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -36,6 +36,7 @@
 #include "util_map.h"
 #include "util_param.h"
 #include "util_string.h"
+#include "util_thread.h"
 #include "util_types.h"
 
 CCL_NAMESPACE_BEGIN
@@ -103,7 +104,6 @@ public:
 	bool has_volume;
 	bool has_displacement;
 	bool has_surface_bssrdf;
-	bool has_converter_blackbody;
 	bool has_bssrdf_bump;
 	bool has_heterogeneous_volume;
 	bool has_object_dependency;
@@ -165,17 +165,25 @@ public:
 	 * have any shader assigned explicitly */
 	static void add_default(Scene *scene);
 
+	/* Selective nodes compilation. */
+	void get_requested_features(Scene *scene,
+	                            int& max_group,
+	                            int& features);
+
 protected:
 	ShaderManager();
 
 	typedef unordered_map<ustring, uint, ustringHash> AttributeIDMap;
 	AttributeIDMap unique_attribute_id;
 
-	vector<float> blackbody_table;
-	vector<float> beckmann_table;
+	thread_mutex lookup_table_mutex;
+	static vector<float> beckmann_table;
 
-	size_t blackbody_table_offset;
 	size_t beckmann_table_offset;
+
+	void get_requested_graph_features(ShaderGraph *graph,
+	                                  int& max_group,
+	                                  int& features);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/sky_model.cpp b/intern/cycles/render/sky_model.cpp
index adb07d9e288..c8a5dbe55e0 100644
--- a/intern/cycles/render/sky_model.cpp
+++ b/intern/cycles/render/sky_model.cpp
@@ -4,7 +4,7 @@ This source is published under the following 3-clause BSD license.
 Copyright (c) 2012 - 2013, Lukas Hosek and Alexander Wilkie
 All rights reserved.
 
-Redistribution and use in source and binary forms, with or without 
+Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 
     * Redistributions of source code must retain the above copyright
@@ -12,8 +12,8 @@ modification, are permitted provided that the following conditions are met:
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
-    * None of the names of the contributors may be used to endorse or promote 
-      products derived from this software without specific prior written 
+    * None of the names of the contributors may be used to endorse or promote
+      products derived from this software without specific prior written
       permission.
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
@@ -40,24 +40,24 @@ and the 2013 IEEE CG&A paper
 
        "Adding a Solar Radiance Function to the Hosek Skylight Model"
 
-                                   both by 
+                                   both by
 
                        Lukas Hosek and Alexander Wilkie
                 Charles University in Prague, Czech Republic
 
 
                         Version: 1.4a, February 22nd, 2013
-                        
+
 Version history:
 
 1.4a  February 22nd, 2013
-      Removed unnecessary and counter-intuitive solar radius parameters 
+      Removed unnecessary and counter-intuitive solar radius parameters
       from the interface of the colourspace sky dome initialisation functions.
 
 1.4   February 11th, 2013
       Fixed a bug which caused the relative brightness of the solar disc
-      and the sky dome to be off by a factor of about 6. The sun was too 
-      bright: this affected both normal and alien sun scenarios. The 
+      and the sky dome to be off by a factor of about 6. The sun was too
+      bright: this affected both normal and alien sun scenarios. The
       coefficients of the solar radiance function were changed to fix this.
 
 1.3   January 21st, 2013 (not released to the public)
@@ -81,7 +81,7 @@ Version history:
       the result of a simple conversion from spectral data via the CIE 2 degree
       standard observer matching functions. Therefore, after multiplication
       with 683 lm / W, the Y channel now corresponds to luminance in lm.
-     
+
 1.0   May 11th, 2012
       Initial release.
 
@@ -110,7 +110,7 @@ CCL_NAMESPACE_BEGIN
 //   Some macro definitions that occur elsewhere in ART, and that have to be
 //   replicated to make this a stand-alone module.
 
-#ifndef MATH_PI 
+#ifndef MATH_PI
 #define MATH_PI                     3.141592653589793
 #endif
 
@@ -138,250 +138,231 @@ typedef const double *ArHosekSkyModel_Radiance_Dataset;
 // internal functions
 
 static void ArHosekSkyModel_CookConfiguration(
-        ArHosekSkyModel_Dataset       dataset,
-        ArHosekSkyModelConfiguration  config, 
-        double                        turbidity, 
-        double                        albedo, 
-        double                        solar_elevation
-        )
+        ArHosekSkyModel_Dataset dataset,
+        ArHosekSkyModelConfiguration config,
+        double turbidity,
+        double albedo,
+        double solar_elevation)
 {
-    const double  * elev_matrix;
-
-    int     int_turbidity = (int)turbidity;
-    double  turbidity_rem = turbidity - (double)int_turbidity;
-
-    solar_elevation = pow(solar_elevation / (MATH_PI / 2.0), (1.0 / 3.0));
-
-    // alb 0 low turb
-
-    elev_matrix = dataset + ( 9 * 6 * (int_turbidity-1) );
-    
-    
-    for( unsigned int i = 0; i < 9; ++i )
-    {
-        //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
-        config[i] = 
-        (1.0-albedo) * (1.0 - turbidity_rem) 
-        * ( pow(1.0-solar_elevation, 5.0) * elev_matrix[i]  + 
-           5.0  * pow(1.0-solar_elevation, 4.0) * solar_elevation * elev_matrix[i+9] +
-           10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[i+18] +
-           10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[i+27] +
-           5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[i+36] +
-           pow(solar_elevation, 5.0)  * elev_matrix[i+45]);
-    }
+	const double  * elev_matrix;
 
-    // alb 1 low turb
-    elev_matrix = dataset + (9*6*10 + 9*6*(int_turbidity-1));
-    for(unsigned int i = 0; i < 9; ++i)
-    {
-        //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
-        config[i] += 
-        (albedo) * (1.0 - turbidity_rem)
-        * ( pow(1.0-solar_elevation, 5.0) * elev_matrix[i]  + 
-           5.0  * pow(1.0-solar_elevation, 4.0) * solar_elevation * elev_matrix[i+9] +
-           10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[i+18] +
-           10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[i+27] +
-           5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[i+36] +
-           pow(solar_elevation, 5.0)  * elev_matrix[i+45]);
-    }
+	int int_turbidity = (int)turbidity;
+	double turbidity_rem = turbidity - (double)int_turbidity;
 
-    if(int_turbidity == 10)
-        return;
-
-    // alb 0 high turb
-    elev_matrix = dataset + (9*6*(int_turbidity));
-    for(unsigned int i = 0; i < 9; ++i)
-    {
-        //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
-        config[i] += 
-        (1.0-albedo) * (turbidity_rem)
-        * ( pow(1.0-solar_elevation, 5.0) * elev_matrix[i]  + 
-           5.0  * pow(1.0-solar_elevation, 4.0) * solar_elevation * elev_matrix[i+9] +
-           10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[i+18] +
-           10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[i+27] +
-           5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[i+36] +
-           pow(solar_elevation, 5.0)  * elev_matrix[i+45]);
-    }
+	solar_elevation = pow(solar_elevation / (MATH_PI / 2.0), (1.0 / 3.0));
+
+	// alb 0 low turb
 
-    // alb 1 high turb
-    elev_matrix = dataset + (9*6*10 + 9*6*(int_turbidity));
-    for(unsigned int i = 0; i < 9; ++i)
-    {
-        //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
-        config[i] += 
-        (albedo) * (turbidity_rem)
-        * ( pow(1.0-solar_elevation, 5.0) * elev_matrix[i]  + 
-           5.0  * pow(1.0-solar_elevation, 4.0) * solar_elevation * elev_matrix[i+9] +
-           10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[i+18] +
-           10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[i+27] +
-           5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[i+36] +
-           pow(solar_elevation, 5.0)  * elev_matrix[i+45]);
+	elev_matrix = dataset + ( 9 * 6 * (int_turbidity-1));
+
+	for(unsigned int i = 0; i < 9; ++i) {
+		//(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
+		config[i] =
+			(1.0-albedo) * (1.0 - turbidity_rem)
+			* ( pow(1.0-solar_elevation, 5.0) * elev_matrix[i]  +
+			    5.0  * pow(1.0-solar_elevation, 4.0) * solar_elevation * elev_matrix[i+9] +
+			    10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[i+18] +
+			    10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[i+27] +
+			    5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[i+36] +
+			    pow(solar_elevation, 5.0)  * elev_matrix[i+45]);
     }
+
+	// alb 1 low turb
+	elev_matrix = dataset + (9*6*10 + 9*6*(int_turbidity-1));
+	for(unsigned int i = 0; i < 9; ++i) {
+		//(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
+		config[i] +=
+			(albedo) * (1.0 - turbidity_rem)
+			* ( pow(1.0-solar_elevation, 5.0) * elev_matrix[i]  +
+			    5.0  * pow(1.0-solar_elevation, 4.0) * solar_elevation * elev_matrix[i+9] +
+			    10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[i+18] +
+			    10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[i+27] +
+			    5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[i+36] +
+			    pow(solar_elevation, 5.0)  * elev_matrix[i+45]);
+	}
+
+	if(int_turbidity == 10)
+		return;
+
+	// alb 0 high turb
+	elev_matrix = dataset + (9*6*(int_turbidity));
+	for(unsigned int i = 0; i < 9; ++i) {
+		//(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
+		config[i] +=
+			(1.0-albedo) * (turbidity_rem)
+			* ( pow(1.0-solar_elevation, 5.0) * elev_matrix[i]  +
+			    5.0  * pow(1.0-solar_elevation, 4.0) * solar_elevation * elev_matrix[i+9] +
+			    10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[i+18] +
+			    10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[i+27] +
+			    5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[i+36] +
+			    pow(solar_elevation, 5.0)  * elev_matrix[i+45]);
+	}
+
+	// alb 1 high turb
+	elev_matrix = dataset + (9*6*10 + 9*6*(int_turbidity));
+	for(unsigned int i = 0; i < 9; ++i) {
+		//(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
+		config[i] +=
+			(albedo) * (turbidity_rem)
+			* ( pow(1.0-solar_elevation, 5.0) * elev_matrix[i]  +
+			    5.0  * pow(1.0-solar_elevation, 4.0) * solar_elevation * elev_matrix[i+9] +
+			    10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[i+18] +
+			    10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[i+27] +
+			    5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[i+36] +
+			    pow(solar_elevation, 5.0)  * elev_matrix[i+45]);
+	}
 }
 
 static double ArHosekSkyModel_CookRadianceConfiguration(
-        ArHosekSkyModel_Radiance_Dataset  dataset, 
-        double                            turbidity, 
-        double                            albedo, 
-        double                            solar_elevation
-        )
+        ArHosekSkyModel_Radiance_Dataset dataset,
+        double turbidity,
+        double albedo,
+        double solar_elevation)
 {
-    const double* elev_matrix;
-
-    int int_turbidity = (int)turbidity;
-    double turbidity_rem = turbidity - (double)int_turbidity;
-    double res;
-    solar_elevation = pow(solar_elevation / (MATH_PI / 2.0), (1.0 / 3.0));
-
-    // alb 0 low turb
-    elev_matrix = dataset + (6*(int_turbidity-1));
-    //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
-    res = (1.0-albedo) * (1.0 - turbidity_rem) *
-        ( pow(1.0-solar_elevation, 5.0) * elev_matrix[0] +
-         5.0*pow(1.0-solar_elevation, 4.0)*solar_elevation * elev_matrix[1] +
-         10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[2] +
-         10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[3] +
-         5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[4] +
-         pow(solar_elevation, 5.0) * elev_matrix[5]);
-
-    // alb 1 low turb
-    elev_matrix = dataset + (6*10 + 6*(int_turbidity-1));
-    //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
-    res += (albedo) * (1.0 - turbidity_rem) *
-        ( pow(1.0-solar_elevation, 5.0) * elev_matrix[0] +
-         5.0*pow(1.0-solar_elevation, 4.0)*solar_elevation * elev_matrix[1] +
-         10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[2] +
-         10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[3] +
-         5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[4] +
-         pow(solar_elevation, 5.0) * elev_matrix[5]);
-    if(int_turbidity == 10)
-        return res;
-
-    // alb 0 high turb
-    elev_matrix = dataset + (6*(int_turbidity));
-    //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
-    res += (1.0-albedo) * (turbidity_rem) *
-        ( pow(1.0-solar_elevation, 5.0) * elev_matrix[0] +
-         5.0*pow(1.0-solar_elevation, 4.0)*solar_elevation * elev_matrix[1] +
-         10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[2] +
-         10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[3] +
-         5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[4] +
-         pow(solar_elevation, 5.0) * elev_matrix[5]);
-
-    // alb 1 high turb
-    elev_matrix = dataset + (6*10 + 6*(int_turbidity));
-    //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
-    res += (albedo) * (turbidity_rem) *
-        ( pow(1.0-solar_elevation, 5.0) * elev_matrix[0] +
-         5.0*pow(1.0-solar_elevation, 4.0)*solar_elevation * elev_matrix[1] +
-         10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[2] +
-         10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[3] +
-         5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[4] +
-         pow(solar_elevation, 5.0) * elev_matrix[5]);
-    return res;
+	const double* elev_matrix;
+
+	int int_turbidity = (int)turbidity;
+	double turbidity_rem = turbidity - (double)int_turbidity;
+	double res;
+	solar_elevation = pow(solar_elevation / (MATH_PI / 2.0), (1.0 / 3.0));
+
+	// alb 0 low turb
+	elev_matrix = dataset + (6*(int_turbidity-1));
+	//(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
+	res = (1.0-albedo) * (1.0 - turbidity_rem) *
+		( pow(1.0-solar_elevation, 5.0) * elev_matrix[0] +
+		  5.0*pow(1.0-solar_elevation, 4.0)*solar_elevation * elev_matrix[1] +
+		  10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[2] +
+		  10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[3] +
+		  5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[4] +
+		  pow(solar_elevation, 5.0) * elev_matrix[5]);
+
+	// alb 1 low turb
+	elev_matrix = dataset + (6*10 + 6*(int_turbidity-1));
+	//(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
+	res += (albedo) * (1.0 - turbidity_rem) *
+		( pow(1.0-solar_elevation, 5.0) * elev_matrix[0] +
+		  5.0*pow(1.0-solar_elevation, 4.0)*solar_elevation * elev_matrix[1] +
+		  10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[2] +
+		  10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[3] +
+		  5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[4] +
+		  pow(solar_elevation, 5.0) * elev_matrix[5]);
+	if(int_turbidity == 10)
+		return res;
+
+	// alb 0 high turb
+	elev_matrix = dataset + (6*(int_turbidity));
+	//(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
+	res += (1.0-albedo) * (turbidity_rem) *
+		( pow(1.0-solar_elevation, 5.0) * elev_matrix[0] +
+		  5.0*pow(1.0-solar_elevation, 4.0)*solar_elevation * elev_matrix[1] +
+		  10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[2] +
+		  10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[3] +
+		  5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[4] +
+		  pow(solar_elevation, 5.0) * elev_matrix[5]);
+
+	// alb 1 high turb
+	elev_matrix = dataset + (6*10 + 6*(int_turbidity));
+	//(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
+	res += (albedo) * (turbidity_rem) *
+		( pow(1.0-solar_elevation, 5.0) * elev_matrix[0] +
+		  5.0*pow(1.0-solar_elevation, 4.0)*solar_elevation * elev_matrix[1] +
+		  10.0*pow(1.0-solar_elevation, 3.0)*pow(solar_elevation, 2.0) * elev_matrix[2] +
+		  10.0*pow(1.0-solar_elevation, 2.0)*pow(solar_elevation, 3.0) * elev_matrix[3] +
+		  5.0*(1.0-solar_elevation)*pow(solar_elevation, 4.0) * elev_matrix[4] +
+		  pow(solar_elevation, 5.0) * elev_matrix[5]);
+	return res;
 }
 
 static double ArHosekSkyModel_GetRadianceInternal(
-        ArHosekSkyModelConfiguration  configuration, 
-        double                        theta, 
-        double                        gamma
-        )
+        ArHosekSkyModelConfiguration configuration,
+        double theta,
+        double gamma)
 {
-    const double expM = exp(configuration[4] * gamma);
-    const double rayM = cos(gamma)*cos(gamma);
-    const double mieM = (1.0 + cos(gamma)*cos(gamma)) / pow((1.0 + configuration[8]*configuration[8] - 2.0*configuration[8]*cos(gamma)), 1.5);
-    const double zenith = sqrt(cos(theta));
+	const double expM = exp(configuration[4] * gamma);
+	const double rayM = cos(gamma)*cos(gamma);
+	const double mieM = (1.0 + cos(gamma)*cos(gamma)) / pow((1.0 + configuration[8]*configuration[8] - 2.0*configuration[8]*cos(gamma)), 1.5);
+	const double zenith = sqrt(cos(theta));
 
-    return (1.0 + configuration[0] * exp(configuration[1] / (cos(theta) + 0.01))) *
+	return (1.0 + configuration[0] * exp(configuration[1] / (cos(theta) + 0.01))) *
             (configuration[2] + configuration[3] * expM + configuration[5] * rayM + configuration[6] * mieM + configuration[7] * zenith);
 }
 
-void arhosekskymodelstate_free(
-        ArHosekSkyModelState  * state
-        )
+void arhosekskymodelstate_free(ArHosekSkyModelState  * state)
 {
-    free(state);
+	free(state);
 }
 
-double arhosekskymodel_radiance(
-        ArHosekSkyModelState  * state,
-        double                  theta, 
-        double                  gamma, 
-        double                  wavelength
-        )
+double arhosekskymodel_radiance(ArHosekSkyModelState  *state,
+                                double theta,
+                                double gamma,
+                                double wavelength)
 {
-    int low_wl = (int)((wavelength - 320.0) / 40.0);
-
-    if ( low_wl < 0 || low_wl >= 11 )
-        return 0.0f;
-
-    double interp = fmod((wavelength - 320.0 ) / 40.0, 1.0);
-
-    double val_low = 
-          ArHosekSkyModel_GetRadianceInternal(
-                state->configs[low_wl],
-                theta,
-                gamma
-              )
-        * state->radiances[low_wl]
-        * state->emission_correction_factor_sky[low_wl];
-
-    if ( interp < 1e-6 )
-        return val_low;
-
-    double result = ( 1.0 - interp ) * val_low;
-
-    if ( low_wl+1 < 11 )
-    {
-        result +=
-              interp
-            * ArHosekSkyModel_GetRadianceInternal(
-                    state->configs[low_wl+1],
-                    theta,
-                    gamma
-                  )
-            * state->radiances[low_wl+1]
-            * state->emission_correction_factor_sky[low_wl+1];
-    }
-
-    return result;
+	int low_wl = (int)((wavelength - 320.0) / 40.0);
+
+	if(low_wl < 0 || low_wl >= 11)
+	    return 0.0f;
+
+	double interp = fmod((wavelength - 320.0 ) / 40.0, 1.0);
+
+	double val_low =
+		ArHosekSkyModel_GetRadianceInternal(
+		        state->configs[low_wl],
+		        theta,
+		        gamma)
+		* state->radiances[low_wl]
+		* state->emission_correction_factor_sky[low_wl];
+
+	if(interp < 1e-6)
+		return val_low;
+
+	double result = ( 1.0 - interp ) * val_low;
+
+    if(low_wl+1 < 11) {
+	    result +=
+		    interp
+		    * ArHosekSkyModel_GetRadianceInternal(
+		            state->configs[low_wl+1],
+		            theta,
+		            gamma)
+		    * state->radiances[low_wl+1]
+		    * state->emission_correction_factor_sky[low_wl+1];
+	}
+
+	return result;
 }
 
 
 // xyz and rgb versions
 
-ArHosekSkyModelState  * arhosek_xyz_skymodelstate_alloc_init(
-        const double  turbidity, 
-        const double  albedo, 
-        const double  elevation
-        )
+ArHosekSkyModelState * arhosek_xyz_skymodelstate_alloc_init(
+        const double turbidity,
+        const double albedo,
+        const double elevation)
 {
-    ArHosekSkyModelState  * state = ALLOC(ArHosekSkyModelState);
-
-    state->solar_radius = TERRESTRIAL_SOLAR_RADIUS;
-    state->turbidity    = turbidity;
-    state->albedo       = albedo;
-    state->elevation    = elevation;
-    
-    for( unsigned int channel = 0; channel < 3; ++channel )
-    {
-        ArHosekSkyModel_CookConfiguration(
-            datasetsXYZ[channel], 
-            state->configs[channel], 
-            turbidity, 
-            albedo, 
-            elevation
-            );
-        
-        state->radiances[channel] = 
-        ArHosekSkyModel_CookRadianceConfiguration(
-            datasetsXYZRad[channel],
-            turbidity, 
-            albedo,
-            elevation
-            );
+	ArHosekSkyModelState  * state = ALLOC(ArHosekSkyModelState);
+
+	state->solar_radius = TERRESTRIAL_SOLAR_RADIUS;
+	state->turbidity = turbidity;
+	state->albedo = albedo;
+	state->elevation = elevation;
+
+    for(unsigned int channel = 0; channel < 3; ++channel) {
+		ArHosekSkyModel_CookConfiguration(
+		    datasetsXYZ[channel],
+		    state->configs[channel],
+		    turbidity,
+		    albedo,
+		    elevation);
+
+		state->radiances[channel] =
+		ArHosekSkyModel_CookRadianceConfiguration(
+		    datasetsXYZRad[channel],
+		    turbidity,
+		    albedo,
+		    elevation);
     }
-    
+
     return state;
 }
 
diff --git a/intern/cycles/render/sky_model.h b/intern/cycles/render/sky_model.h
index 3814543c8b6..237e4e61bf5 100644
--- a/intern/cycles/render/sky_model.h
+++ b/intern/cycles/render/sky_model.h
@@ -4,7 +4,7 @@ This source is published under the following 3-clause BSD license.
 Copyright (c) 2012 - 2013, Lukas Hosek and Alexander Wilkie
 All rights reserved.
 
-Redistribution and use in source and binary forms, with or without 
+Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 
     * Redistributions of source code must retain the above copyright
@@ -12,8 +12,8 @@ modification, are permitted provided that the following conditions are met:
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
-    * None of the names of the contributors may be used to endorse or promote 
-      products derived from this software without specific prior written 
+    * None of the names of the contributors may be used to endorse or promote
+      products derived from this software without specific prior written
       permission.
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
@@ -41,24 +41,24 @@ and the 2013 IEEE CG&A paper
 
        "Adding a Solar Radiance Function to the Hosek Skylight Model"
 
-                                   both by 
+                                   both by
 
                        Lukas Hosek and Alexander Wilkie
                 Charles University in Prague, Czech Republic
 
 
                         Version: 1.4a, February 22nd, 2013
-                        
+
 Version history:
 
 1.4a  February 22nd, 2013
-      Removed unnecessary and counter-intuitive solar radius parameters 
+      Removed unnecessary and counter-intuitive solar radius parameters
       from the interface of the colourspace sky dome initialisation functions.
 
 1.4   February 11th, 2013
       Fixed a bug which caused the relative brightness of the solar disc
-      and the sky dome to be off by a factor of about 6. The sun was too 
-      bright: this affected both normal and alien sun scenarios. The 
+      and the sky dome to be off by a factor of about 6. The sun was too
+      bright: this affected both normal and alien sun scenarios. The
       coefficients of the solar radiance function were changed to fix this.
 
 1.3   January 21st, 2013 (not released to the public)
@@ -82,7 +82,7 @@ Version history:
       the result of a simple conversion from spectral data via the CIE 2 degree
       standard observer matching functions. Therefore, after multiplication
       with 683 lm / W, the Y channel now corresponds to luminance in lm.
-     
+
 1.0   May 11th, 2012
       Initial release.
 
@@ -96,9 +96,9 @@ an updated version of this code has been published!
 /*
 
 This code is taken from ART, a rendering research system written in a
-mix of C99 / Objective C. Since ART is not a small system and is intended to 
-be inter-operable with other libraries, and since C does not have namespaces, 
-the structures and functions in ART all have to have somewhat wordy 
+mix of C99 / Objective C. Since ART is not a small system and is intended to
+be inter-operable with other libraries, and since C does not have namespaces,
+the structures and functions in ART all have to have somewhat wordy
 canonical names that begin with Ar.../ar..., like those seen in this example.
 
 Usage information:
@@ -119,7 +119,7 @@ snippet, we assume that 'albedo' is defined as
 
     double  albedo[num_channels];
 
-with a ground albedo value between [0,1] for each channel. The solar elevation  
+with a ground albedo value between [0,1] for each channel. The solar elevation
 is given in radians.
 
     for ( unsigned int i = 0; i < num_channels; i++ )
@@ -130,11 +130,11 @@ is given in radians.
                   solarElevation
                 );
 
-Note that starting with version 1.3, there is also a second initialisation 
-function which generates skydome states for different solar emission spectra 
+Note that starting with version 1.3, there is also a second initialisation
+function which generates skydome states for different solar emission spectra
 and solar radii: 'arhosekskymodelstate_alienworld_alloc_init()'.
 
-See the notes about the "Alien World" functionality provided further down for a 
+See the notes about the "Alien World" functionality provided further down for a
 discussion of the usefulness and limits of that second initalisation function.
 Sky model states that have been initialized with either function behave in a
 completely identical fashion during use and cleanup.
@@ -155,7 +155,7 @@ on the skydome determined via the angles theta and gamma works as follows:
                 gamma,
                 channel_center[i]
               );
-              
+
 The variable "channel_center" is assumed to hold the channel center wavelengths
 for each of the num_channels samples of the spectrum we are building.
 
@@ -188,114 +188,114 @@ by calling arhosek_rgb_skymodelstate_alloc_init.
 Solar Radiance Function
 -----------------------
 
-For each position on the solar disc, this function returns the entire radiance 
-one sees - direct emission, as well as in-scattered light in the area of the 
-solar disc. The latter is important for low solar elevations - nice images of 
-the setting sun would not be possible without this. This is also the reason why 
-this function, just like the regular sky dome model evaluation function, needs 
-access to the sky dome data structures, as these provide information on 
+For each position on the solar disc, this function returns the entire radiance
+one sees - direct emission, as well as in-scattered light in the area of the
+solar disc. The latter is important for low solar elevations - nice images of
+the setting sun would not be possible without this. This is also the reason why
+this function, just like the regular sky dome model evaluation function, needs
+access to the sky dome data structures, as these provide information on
 in-scattered radiance.
 
 CAVEAT #1: in this release, this function is only provided in spectral form!
            RGB/XYZ versions to follow at a later date.
 
-CAVEAT #2: (fixed from release 1.3 onwards) 
+CAVEAT #2: (fixed from release 1.3 onwards)
 
 CAVEAT #3: limb darkening renders the brightness of the solar disc
            inhomogeneous even for high solar elevations - only taking a single
            sample at the centre of the sun will yield an incorrect power
            estimate for the solar disc! Always take multiple random samples
            across the entire solar disc to estimate its power!
-           
+
 CAVEAT #4: in this version, the limb darkening calculations still use a fairly
-           computationally expensive 5th order polynomial that was directly 
+           computationally expensive 5th order polynomial that was directly
            taken from astronomical literature. For the purposes of Computer
-           Graphics, this is needlessly accurate, though, and will be replaced 
+           Graphics, this is needlessly accurate, though, and will be replaced
            by a cheaper approximation in a future release.
 
 "Alien World" functionality
 ---------------------------
 
-The Hosek sky model can be used to roughly (!) predict the appearance of 
-outdoor scenes on earth-like planets, i.e. planets of a similar size and 
-atmospheric make-up. Since the spectral version of our model predicts sky dome 
-luminance patterns and solar radiance independently for each waveband, and 
-since the intensity of each waveband is solely dependent on the input radiance 
-from the star that the world in question is orbiting, it is trivial to re-scale 
+The Hosek sky model can be used to roughly (!) predict the appearance of
+outdoor scenes on earth-like planets, i.e. planets of a similar size and
+atmospheric make-up. Since the spectral version of our model predicts sky dome
+luminance patterns and solar radiance independently for each waveband, and
+since the intensity of each waveband is solely dependent on the input radiance
+from the star that the world in question is orbiting, it is trivial to re-scale
 the wavebands to match a different star radiance.
 
-At least in theory, the spectral version of the model has always been capable 
-of this sort of thing, and the actual sky dome and solar radiance models were 
+At least in theory, the spectral version of the model has always been capable
+of this sort of thing, and the actual sky dome and solar radiance models were
 actually not altered at all in this release. All we did was to add some support
-functionality for doing this more easily with the existing data and functions, 
+functionality for doing this more easily with the existing data and functions,
 and to add some explanations.
 
 Just use 'arhosekskymodelstate_alienworld_alloc_init()' to initialise the sky
-model states (you will have to provide values for star temperature and solar 
-intensity compared to the terrestrial sun), and do everything else as you 
+model states (you will have to provide values for star temperature and solar
+intensity compared to the terrestrial sun), and do everything else as you
 did before.
 
-CAVEAT #1: we assume the emission of the star that illuminates the alien world 
-           to be a perfect blackbody emission spectrum. This is never entirely 
-           realistic - real star emission spectra are considerably more complex 
-           than this, mainly due to absorption effects in the outer layers of 
-           stars. However, blackbody spectra are a reasonable first assumption 
-           in a usage scenario like this, where 100% accuracy is simply not 
-           necessary: for rendering purposes, there are likely no visible 
-           differences between a highly accurate solution based on a more 
+CAVEAT #1: we assume the emission of the star that illuminates the alien world
+           to be a perfect blackbody emission spectrum. This is never entirely
+           realistic - real star emission spectra are considerably more complex
+           than this, mainly due to absorption effects in the outer layers of
+           stars. However, blackbody spectra are a reasonable first assumption
+           in a usage scenario like this, where 100% accuracy is simply not
+           necessary: for rendering purposes, there are likely no visible
+           differences between a highly accurate solution based on a more
            involved simulation, and this approximation.
 
 CAVEAT #2: we always use limb darkening data from our own sun to provide this
-           "appearance feature", even for suns of strongly different 
-           temperature. Which is presumably not very realistic, but (as with 
-           the unaltered blackbody spectrum from caveat #1) probably not a bad 
+           "appearance feature", even for suns of strongly different
+           temperature. Which is presumably not very realistic, but (as with
+           the unaltered blackbody spectrum from caveat #1) probably not a bad
            first guess, either. If you need more accuracy than we provide here,
            please make inquiries with a friendly astro-physicst of your choice.
 
-CAVEAT #3: you have to provide a value for the solar intensity of the star 
-           which illuminates the alien world. For this, please bear in mind  
-           that there is very likely a comparatively tight range of absolute  
-           solar irradiance values for which an earth-like planet with an  
-           atmosphere like the one we assume in our model can exist in the  
+CAVEAT #3: you have to provide a value for the solar intensity of the star
+           which illuminates the alien world. For this, please bear in mind
+           that there is very likely a comparatively tight range of absolute
+           solar irradiance values for which an earth-like planet with an
+           atmosphere like the one we assume in our model can exist in the
            first place!
-            
-           Too much irradiance, and the atmosphere probably boils off into 
-           space, too little, it freezes. Which means that stars of 
-           considerably different emission colour than our sun will have to be 
-           fairly different in size from it, to still provide a reasonable and 
-           inhabitable amount of irradiance. Red stars will need to be much 
-           larger than our sun, while white or blue stars will have to be 
-           comparatively tiny. The initialisation function handles this and 
+
+           Too much irradiance, and the atmosphere probably boils off into
+           space, too little, it freezes. Which means that stars of
+           considerably different emission colour than our sun will have to be
+           fairly different in size from it, to still provide a reasonable and
+           inhabitable amount of irradiance. Red stars will need to be much
+           larger than our sun, while white or blue stars will have to be
+           comparatively tiny. The initialisation function handles this and
            computes a plausible solar radius for a given emission spectrum. In
            terms of absolute radiometric values, you should probably not stray
            all too far from a solar intensity value of 1.0.
 
-CAVEAT #4: although we now support different solar radii for the actual solar 
-           disc, the sky dome luminance patterns are *not* parameterised by 
-           this value - i.e. the patterns stay exactly the same for different 
-           solar radii! Which is of course not correct. But in our experience, 
-           solar discs up to several degrees in diameter (! - our own sun is 
-           half a degree across) do not cause the luminance patterns on the sky 
-           to change perceptibly. The reason we know this is that we initially 
-           used unrealistically large suns in our brute force path tracer, in 
-           order to improve convergence speeds (which in the beginning were 
-           abysmal). Later, we managed to do the reference renderings much 
-           faster even with realistically small suns, and found that there was 
-           no real difference in skydome appearance anyway. 
-           Conclusion: changing the solar radius should not be over-done, so  
-           close orbits around red supergiants are a no-no. But for the  
-           purposes of getting a fairly credible first impression of what an 
-           alien world with a reasonably sized sun would look like, what we are  
+CAVEAT #4: although we now support different solar radii for the actual solar
+           disc, the sky dome luminance patterns are *not* parameterised by
+           this value - i.e. the patterns stay exactly the same for different
+           solar radii! Which is of course not correct. But in our experience,
+           solar discs up to several degrees in diameter (! - our own sun is
+           half a degree across) do not cause the luminance patterns on the sky
+           to change perceptibly. The reason we know this is that we initially
+           used unrealistically large suns in our brute force path tracer, in
+           order to improve convergence speeds (which in the beginning were
+           abysmal). Later, we managed to do the reference renderings much
+           faster even with realistically small suns, and found that there was
+           no real difference in skydome appearance anyway.
+           Conclusion: changing the solar radius should not be over-done, so
+           close orbits around red supergiants are a no-no. But for the
+           purposes of getting a fairly credible first impression of what an
+           alien world with a reasonably sized sun would look like, what we are
            doing here is probably still o.k.
 
-HINT #1:   if you want to model the sky of an earth-like planet that orbits 
-           a binary star, just super-impose two of these models with solar 
+HINT #1:   if you want to model the sky of an earth-like planet that orbits
+           a binary star, just super-impose two of these models with solar
            intensity of ~0.5 each, and closely spaced solar positions. Light is
            additive, after all. Tattooine, here we come... :-)
 
            P.S. according to Star Wars canon, Tattooine orbits a binary
-           that is made up of a G and K class star, respectively. 
-           So ~5500K and ~4200K should be good first guesses for their 
+           that is made up of a G and K class star, respectively.
+           So ~5500K and ~4200K should be good first guesses for their
            temperature. Just in case you were wondering, after reading the
            previous paragraph.
 */
@@ -316,37 +316,37 @@ typedef double ArHosekSkyModelConfiguration[9];
     ---------------------------
 
     This struct holds the pre-computation data for one particular albedo value.
-    Most fields are self-explanatory, but users should never directly 
-    manipulate any of them anyway. The only consistent way to manipulate such 
-    structs is via the functions 'arhosekskymodelstate_alloc_init' and 
+    Most fields are self-explanatory, but users should never directly
+    manipulate any of them anyway. The only consistent way to manipulate such
+    structs is via the functions 'arhosekskymodelstate_alloc_init' and
     'arhosekskymodelstate_free'.
-    
+
     'emission_correction_factor_sky'
     'emission_correction_factor_sun'
 
-        The original model coefficients were fitted against the emission of 
+        The original model coefficients were fitted against the emission of
         our local sun. If a different solar emission is desired (i.e. if the
-        model is being used to predict skydome appearance for an earth-like 
-        planet that orbits a different star), these correction factors, which 
-        are determined during the alloc_init step, are applied to each waveband 
-        separately (they default to 1.0 in normal usage). This is the simplest 
-        way to retrofit this sort of capability to the existing model. The 
-        different factors for sky and sun are needed since the solar disc may 
+        model is being used to predict skydome appearance for an earth-like
+        planet that orbits a different star), these correction factors, which
+        are determined during the alloc_init step, are applied to each waveband
+        separately (they default to 1.0 in normal usage). This is the simplest
+        way to retrofit this sort of capability to the existing model. The
+        different factors for sky and sun are needed since the solar disc may
         be of a different size compared to the terrestrial sun.
 
 ---------------------------------------------------------------------------- */
 
 typedef struct ArHosekSkyModelState
 {
-    ArHosekSkyModelConfiguration  configs[11];
-    double                        radiances[11];
-    double                        turbidity;
-    double                        solar_radius;
-    double                        emission_correction_factor_sky[11];
-    double                        emission_correction_factor_sun[11];
-    double                        albedo;
-    double                        elevation;
-} 
+	ArHosekSkyModelConfiguration  configs[11];
+	double                        radiances[11];
+	double                        turbidity;
+	double                        solar_radius;
+	double                        emission_correction_factor_sky[11];
+	double                        emission_correction_factor_sun[11];
+	double                        albedo;
+	double                        elevation;
+}
 ArHosekSkyModelState;
 
 /* ----------------------------------------------------------------------------
@@ -358,11 +358,10 @@ ArHosekSkyModelState;
 
 ---------------------------------------------------------------------------- */
 
-ArHosekSkyModelState  * arhosekskymodelstate_alloc_init(
-        const double  solar_elevation,
-        const double  atmospheric_turbidity,
-        const double  ground_albedo
-        );
+ArHosekSkyModelState *arhosekskymodelstate_alloc_init(
+        const double solar_elevation,
+        const double atmospheric_turbidity,
+        const double ground_albedo);
 
 
 /* ----------------------------------------------------------------------------
@@ -375,78 +374,67 @@ ArHosekSkyModelState  * arhosekskymodelstate_alloc_init(
     'solar_intensity' controls the overall brightness of the sky, relative
     to the solar irradiance on Earth. A value of 1.0 yields a sky dome that
     is, on average over the wavelenghts covered in the model (!), as bright
-    as the terrestrial sky in radiometric terms. 
-    
-    Which means that the solar radius has to be adjusted, since the 
-    emissivity of a solar surface with a given temperature is more or less 
-    fixed. So hotter suns have to be smaller to be equally bright as the 
+    as the terrestrial sky in radiometric terms.
+
+    Which means that the solar radius has to be adjusted, since the
+    emissivity of a solar surface with a given temperature is more or less
+    fixed. So hotter suns have to be smaller to be equally bright as the
     terrestrial sun, while cooler suns have to be larger. Note that there are
     limits to the validity of the luminance patterns of the underlying model:
     see the discussion above for more on this. In particular, an alien sun with
     a surface temperature of only 2000 Kelvin has to be very large if it is
-    to be as bright as the terrestrial sun - so large that the luminance 
+    to be as bright as the terrestrial sun - so large that the luminance
     patterns are no longer a really good fit in that case.
-    
+
     If you need information about the solar radius that the model computes
-    for a given temperature (say, for light source sampling purposes), you 
-    have to query the 'solar_radius' variable of the sky model state returned 
+    for a given temperature (say, for light source sampling purposes), you
+    have to query the 'solar_radius' variable of the sky model state returned
     *after* running this function.
 
 ---------------------------------------------------------------------------- */
 
-ArHosekSkyModelState  * arhosekskymodelstate_alienworld_alloc_init(
-        const double  solar_elevation,
-        const double  solar_intensity,
-        const double  solar_surface_temperature_kelvin,
-        const double  atmospheric_turbidity,
-        const double  ground_albedo
-        );
-
-void arhosekskymodelstate_free(
-        ArHosekSkyModelState  * state
-        );
-
-double arhosekskymodel_radiance(
-        ArHosekSkyModelState  * state,
-        double                  theta, 
-        double                  gamma, 
-        double                  wavelength
-        );
+ArHosekSkyModelState* arhosekskymodelstate_alienworld_alloc_init(
+        const double solar_elevation,
+        const double solar_intensity,
+        const double solar_surface_temperature_kelvin,
+        const double atmospheric_turbidity,
+        const double ground_albedo);
+
+void arhosekskymodelstate_free(ArHosekSkyModelState  *state);
+
+double arhosekskymodel_radiance(ArHosekSkyModelState *state,
+                                double theta,
+                                double gamma,
+                                double wavelength);
 
 // CIE XYZ and RGB versions
 
 
 ArHosekSkyModelState  * arhosek_xyz_skymodelstate_alloc_init(
-        const double  turbidity, 
-        const double  albedo, 
-        const double  elevation
-        );
+        const double turbidity,
+        const double albedo,
+        const double elevation);
 
 
 ArHosekSkyModelState  * arhosek_rgb_skymodelstate_alloc_init(
-        const double  turbidity, 
-        const double  albedo, 
-        const double  elevation
-        );
+        const double turbidity,
+        const double albedo,
+        const double elevation);
 
 
-double arhosek_tristim_skymodel_radiance(
-        ArHosekSkyModelState  * state,
-        double                  theta,
-        double                  gamma, 
-        int                     channel
-        );
+double arhosek_tristim_skymodel_radiance(ArHosekSkyModelState* state,
+                                         double theta,
+                                         double gamma,
+                                         int channel);
 
 //   Delivers the complete function: sky + sun, including limb darkening.
 //   Please read the above description before using this - there are several
 //   caveats!
 
-double arhosekskymodel_solar_radiance(
-        ArHosekSkyModelState      * state,
-        double                      theta,
-        double                      gamma,
-        double                      wavelength
-        );
+double arhosekskymodel_solar_radiance(ArHosekSkyModelState* state,
+                                      double theta,
+                                      double gamma,
+                                      double wavelength);
 
 
 #endif // _SKY_MODEL_H_
diff --git a/intern/cycles/render/sky_model_data.h b/intern/cycles/render/sky_model_data.h
index 4171bd12756..e6f3f761532 100644
--- a/intern/cycles/render/sky_model_data.h
+++ b/intern/cycles/render/sky_model_data.h
@@ -4,7 +4,7 @@ This source is published under the following 3-clause BSD license.
 Copyright (c) 2012 - 2013, Lukas Hosek and Alexander Wilkie
 All rights reserved.
 
-Redistribution and use in source and binary forms, with or without 
+Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 
     * Redistributions of source code must retain the above copyright
@@ -12,8 +12,8 @@ modification, are permitted provided that the following conditions are met:
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
-    * None of the names of the contributors may be used to endorse or promote 
-      products derived from this software without specific prior written 
+    * None of the names of the contributors may be used to endorse or promote
+      products derived from this software without specific prior written
       permission.
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
@@ -41,24 +41,24 @@ and the 2013 IEEE CG&A paper
 
        "Adding a Solar Radiance Function to the Hosek Skylight Model"
 
-                                   both by 
+                                   both by
 
                        Lukas Hosek and Alexander Wilkie
                 Charles University in Prague, Czech Republic
 
 
                         Version: 1.4a, February 22nd, 2013
-                        
+
 Version history:
 
 1.4a  February 22nd, 2013
-      Removed unnecessary and counter-intuitive solar radius parameters 
+      Removed unnecessary and counter-intuitive solar radius parameters
       from the interface of the colourspace sky dome initialisation functions.
 
 1.4   February 11th, 2013
       Fixed a bug which caused the relative brightness of the solar disc
-      and the sky dome to be off by a factor of about 6. The sun was too 
-      bright: this affected both normal and alien sun scenarios. The 
+      and the sky dome to be off by a factor of about 6. The sun was too
+      bright: this affected both normal and alien sun scenarios. The
       coefficients of the solar radiance function were changed to fix this.
 
 1.3   January 21st, 2013 (not released to the public)
@@ -82,7 +82,7 @@ Version history:
       the result of a simple conversion from spectral data via the CIE 2 degree
       standard observer matching functions. Therefore, after multiplication
       with 683 lm / W, the Y channel now corresponds to luminance in lm.
-     
+
 1.0   May 11th, 2012
       Initial release.
 
@@ -96,15 +96,14 @@ CCL_NAMESPACE_BEGIN
 
 /*
 
-This file contains the coefficient data for the XYZ colour space version of 
+This file contains the coefficient data for the XYZ colour space version of
 the model.
 
 */
 
 // Uses Sep 9 pattern / Aug 23 mean dataset
 
-static const double datasetXYZ1[] =
-{
+static const double datasetXYZ1[] = {
 	// albedo 0, turbidity 1
 	-1.117001e+000,
 	-1.867262e-001,
@@ -3849,15 +3848,13 @@ static const double datasetXYZRad3[] =
 
 
 
-static const double* datasetsXYZ[] =
-{
+static const double* datasetsXYZ[] = {
 	datasetXYZ1,
 	datasetXYZ2,
 	datasetXYZ3
 };
 
-static const double* datasetsXYZRad[] =
-{
+static const double* datasetsXYZRad[] = {
 	datasetXYZRad1,
 	datasetXYZRad2,
 	datasetXYZRad3
diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp
index 2e3abfcffb9..d0bd34915df 100644
--- a/intern/cycles/render/svm.cpp
+++ b/intern/cycles/render/svm.cpp
@@ -24,6 +24,7 @@
 #include "svm.h"
 
 #include "util_debug.h"
+#include "util_logging.h"
 #include "util_foreach.h"
 #include "util_progress.h"
 
@@ -39,12 +40,14 @@ SVMShaderManager::~SVMShaderManager()
 {
 }
 
-void SVMShaderManager::reset(Scene *scene)
+void SVMShaderManager::reset(Scene * /*scene*/)
 {
 }
 
 void SVMShaderManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
+	VLOG(1) << "Total " << scene->shaders.size() << " shaders.";
+
 	if(!need_update)
 		return;
 
@@ -354,7 +357,7 @@ uint SVMCompiler::attribute(AttributeStandard std)
 	return shader_manager->get_attribute_id(std);
 }
 
-bool SVMCompiler::node_skip_input(ShaderNode *node, ShaderInput *input)
+bool SVMCompiler::node_skip_input(ShaderNode * /*node*/, ShaderInput *input)
 {
 	/* nasty exception .. */
 	if(current_type == SHADER_TYPE_DISPLACEMENT && input->link && input->link->parent->name == ustring("bump"))
@@ -390,10 +393,6 @@ void SVMCompiler::generate_node(ShaderNode *node, set<ShaderNode*>& done)
 			current_shader->has_heterogeneous_volume = true;
 	}
 
-	/* detect if we have a blackbody converter, to prepare lookup table */
-	if(node->has_converter_blackbody())
-		current_shader->has_converter_blackbody = true;
-
 	if(node->has_object_dependency()) {
 		current_shader->has_object_dependency = true;
 	}
@@ -713,7 +712,6 @@ void SVMCompiler::compile(Shader *shader, vector<int4>& global_svm_nodes, int in
 	shader->has_surface_transparent = false;
 	shader->has_surface_bssrdf = false;
 	shader->has_bssrdf_bump = false;
-	shader->has_converter_blackbody = false;
 	shader->has_volume = false;
 	shader->has_displacement = false;
 	shader->has_heterogeneous_volume = false;
diff --git a/intern/cycles/render/tables.cpp b/intern/cycles/render/tables.cpp
index faf7ea3409b..ad3f4866072 100644
--- a/intern/cycles/render/tables.cpp
+++ b/intern/cycles/render/tables.cpp
@@ -19,6 +19,7 @@
 #include "tables.h"
 
 #include "util_debug.h"
+#include "util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -36,6 +37,8 @@ LookupTables::~LookupTables()
 
 void LookupTables::device_update(Device *device, DeviceScene *dscene)
 {
+	VLOG(1) << "Total " << lookup_tables.size() << " lookup tables.";
+
 	if(!need_update)
 		return;
 
diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp
index 675f49ec300..7e68ce84d94 100644
--- a/intern/cycles/render/tile.cpp
+++ b/intern/cycles/render/tile.cpp
@@ -28,6 +28,7 @@ TileManager::TileManager(bool progressive_, int num_samples_, int2 tile_size_, i
 	tile_size = tile_size_;
 	tile_order = tile_order_;
 	start_resolution = start_resolution_;
+	num_samples = num_samples_;
 	num_devices = num_devices_;
 	preserve_tile_device = preserve_tile_device_;
 	background = background_;
@@ -234,7 +235,7 @@ bool TileManager::next_tile(Tile& tile, int device)
 {
 	list<Tile>::iterator tile_it;
 	
-	if (background)
+	if(background)
 		tile_it = next_background_tile(device, tile_order);
 	else
 		tile_it = next_viewport_tile(device);
diff --git a/intern/cycles/subd/subd_mesh.cpp b/intern/cycles/subd/subd_mesh.cpp
index 0db20656f39..17a730e5efe 100644
--- a/intern/cycles/subd/subd_mesh.cpp
+++ b/intern/cycles/subd/subd_mesh.cpp
@@ -109,8 +109,8 @@ public:
 		evalctrl.EvalLimitSample<OsdCpuVertexBuffer,OsdCpuVertexBuffer>(coords, evalctx, 0);
 
 		*P_ = make_float3(P[0], P[1], P[2]);
-		if (dPdu_) *dPdu_ = make_float3(dPdv[0], dPdv[1], dPdv[2]);
-		if (dPdv_) *dPdv_ = make_float3(dPdu[0], dPdu[1], dPdu[2]);
+		if(dPdu_) *dPdu_ = make_float3(dPdv[0], dPdv[1], dPdv[2]);
+		if(dPdv_) *dPdv_ = make_float3(dPdu[0], dPdu[1], dPdu[2]);
 
 		/* optimize: skip evaluating derivatives when not needed */
 		/* todo: swapped derivatives, different winding convention? */
@@ -234,7 +234,7 @@ bool OpenSubdMesh::finish()
 
 void OpenSubdMesh::tessellate(DiagSplit *split)
 {
-	if (num_ptex_faces == 0)
+	if(num_ptex_faces == 0)
 		return;
 
 	const int level = 3;
diff --git a/intern/cycles/util/util_aligned_malloc.cpp b/intern/cycles/util/util_aligned_malloc.cpp
index 9ff857e3543..b161a55c15e 100644
--- a/intern/cycles/util/util_aligned_malloc.cpp
+++ b/intern/cycles/util/util_aligned_malloc.cpp
@@ -55,7 +55,7 @@ void *util_aligned_malloc(size_t size, int alignment)
 	return malloc(size);
 #elif defined(__FreeBSD__) || defined(__NetBSD__)
 	void *result;
-	if (posix_memalign(&result, alignment, size)) {
+	if(posix_memalign(&result, alignment, size)) {
 		/* Non-zero means allocation error
 		 * either no allocation or bad alignment value.
 		 */
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index 7bbd97b8667..1d1e2963348 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -17,17 +17,49 @@
 #ifndef __UTIL_ATOMIC_H__
 #define __UTIL_ATOMIC_H__
 
+#ifndef __KERNEL_GPU__
+
 /* Using atomic ops header from Blender. */
 #include "atomic_ops.h"
 
 ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value)
 {
 	size_t prev_value = *maximum_value;
-	while (prev_value < value) {
-		if (atomic_cas_z(maximum_value, prev_value, value) != prev_value) {
+	while(prev_value < value) {
+		if(atomic_cas_z(maximum_value, prev_value, value) != prev_value) {
 			break;
 		}
 	}
 }
 
+#else  /* __KERNEL_GPU__ */
+
+#ifdef __KERNEL_OPENCL__
+
+/* Float atomics implementation credits:
+ *   http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html
+ */
+ccl_device_inline void atomic_add_float(volatile ccl_global float *source,
+                                        const float operand)
+{
+	union {
+		unsigned int int_value;
+		float float_value;
+	} new_value;
+	union {
+		unsigned int int_value;
+		float float_value;
+	} prev_value;
+	do {
+		prev_value.float_value = *source;
+		new_value.float_value = prev_value.float_value + operand;
+	} while(atomic_cmpxchg((volatile ccl_global unsigned int *)source,
+	                       prev_value.int_value,
+	                       new_value.int_value) != prev_value.int_value);
+}
+
+#endif  /* __KERNEL_OPENCL__ */
+
+#endif  /* __KERNEL_GPU__ */
+
 #endif /* __UTIL_ATOMIC_H__ */
diff --git a/intern/cycles/util/util_cache.h b/intern/cycles/util/util_cache.h
index 9d001a6f1ff..343fa36817d 100644
--- a/intern/cycles/util/util_cache.h
+++ b/intern/cycles/util/util_cache.h
@@ -105,7 +105,7 @@ public:
 			return false;
 		}
 
-		if(!size)
+		if((size == 0) || (size % sizeof(T)) != 0)
 			return false;
 
 		data.resize(size/sizeof(T));
diff --git a/intern/cycles/util/util_foreach.h b/intern/cycles/util/util_foreach.h
index 065bd33ebd2..4f7337107b3 100644
--- a/intern/cycles/util/util_foreach.h
+++ b/intern/cycles/util/util_foreach.h
@@ -19,8 +19,12 @@
 
 /* Use Boost to get nice foreach() loops for STL data structures. */
 
-#include <boost/foreach.hpp>
-#define foreach BOOST_FOREACH
+#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+#  define foreach(x, y) for(x : y)
+#else
+#  include <boost/foreach.hpp>
+#  define foreach BOOST_FOREACH
+#endif
 
 #endif /* __UTIL_FOREACH_H__ */
 
diff --git a/intern/cycles/util/util_function.h b/intern/cycles/util/util_function.h
index 7a312efaad7..6d0f0b444a9 100644
--- a/intern/cycles/util/util_function.h
+++ b/intern/cycles/util/util_function.h
@@ -17,14 +17,33 @@
 #ifndef __UTIL_FUNCTION_H__
 #define __UTIL_FUNCTION_H__
 
-#include <boost/bind.hpp>
-#include <boost/function.hpp>
+#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+#  include <functional>
+#else
+#  include <boost/bind.hpp>
+#  include <boost/function.hpp>
+#endif
 
 CCL_NAMESPACE_BEGIN
 
+#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+#  define function_bind std::bind
+#  define function_null nullptr
+using std::function;
+using std::placeholders::_1;
+using std::placeholders::_2;
+using std::placeholders::_3;
+using std::placeholders::_4;
+using std::placeholders::_5;
+using std::placeholders::_6;
+using std::placeholders::_7;
+using std::placeholders::_8;
+using std::placeholders::_9;
+#else
 using boost::function;
-#define function_bind boost::bind
-
+#  define function_bind boost::bind
+#  define function_null NULL
+#endif
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_FUNCTION_H__ */
diff --git a/intern/cycles/util/util_guarded_allocator.h b/intern/cycles/util/util_guarded_allocator.h
index 263199417c4..2df717253e3 100644
--- a/intern/cycles/util/util_guarded_allocator.h
+++ b/intern/cycles/util/util_guarded_allocator.h
@@ -42,7 +42,7 @@ void util_guarded_mem_free(size_t n);
 
 /* Guarded allocator for the use with STL. */
 template <typename T>
-class GuardedAllocator: public std::allocator<T> {
+class GuardedAllocator : public std::allocator<T> {
 public:
 	template<typename _Tp1>
 	struct rebind {
@@ -53,7 +53,8 @@ public:
 	{
 		util_guarded_mem_alloc(n * sizeof(T));
 #ifdef WITH_BLENDER_GUARDEDALLOC
-		return (T*)MEM_mallocN(n * sizeof(T), "Cycles Alloc");
+		(void)hint;
+		return (T*)MEM_mallocN_aligned(n * sizeof(T), 16, "Cycles Alloc");
 #else
 		return std::allocator<T>::allocate(n, hint);
 #endif
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index 76941569bd2..f4bac9888a5 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -56,7 +56,7 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
 		 * assumes no negative, no nan, no inf, and sets denormal to 0 */
 		union { uint i; float f; } in;
 		float fscale = f[i] * scale;
-		in.f = (fscale > 0.0f)? ((fscale < 65500.0f)? fscale: 65500.0f): 0.0f;
+		in.f = (fscale > 0.0f)? ((fscale < 65504.0f)? fscale: 65504.0f): 0.0f;
 		int x = in.i;
 
 		int absolute = x & 0x7FFFFFFF;
@@ -68,20 +68,20 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
 	}
 #else
 	/* same as above with SSE */
-	const ssef mm_scale = ssef(scale);
-	const ssei mm_38800000 = ssei(0x38800000);
-	const ssei mm_7FFF = ssei(0x7FFF);
-	const ssei mm_7FFFFFFF = ssei(0x7FFFFFFF);
-	const ssei mm_C8000000 = ssei(0xC8000000);
-
-	ssef mm_fscale = load4f(f) * mm_scale;
-	ssei x = cast(min(max(mm_fscale, ssef(0.0f)), ssef(65500.0f)));
-	ssei absolute = x & mm_7FFFFFFF;
-	ssei Z = absolute + mm_C8000000;
-	ssei result = andnot(absolute < mm_38800000, Z); 
-	ssei rh = (result >> 13) & mm_7FFF;
-
-	_mm_storel_pi((__m64*)h, _mm_castsi128_ps(_mm_packs_epi32(rh, rh)));
+	ssef fscale = load4f(f) * scale;
+	ssef x = min(max(fscale, 0.0f), 65504.0f);
+
+#ifdef __KERNEL_AVX2__
+	ssei rpack = _mm_cvtps_ph(x, 0);
+#else
+	ssei absolute = cast(x) & 0x7FFFFFFF;
+	ssei Z = absolute + 0xC8000000;
+	ssei result = andnot(absolute < 0x38800000, Z);
+	ssei rshift = (result >> 13) & 0x7FFF;
+	ssei rpack = _mm_packs_epi32(rshift, rshift);
+#endif
+
+	_mm_storel_pi((__m64*)h, _mm_castsi128_ps(rpack));
 #endif
 }
 
diff --git a/intern/cycles/util/util_hash.h b/intern/cycles/util/util_hash.h
index bbbedc22a47..3ff2802b46d 100644
--- a/intern/cycles/util/util_hash.h
+++ b/intern/cycles/util/util_hash.h
@@ -53,7 +53,7 @@ static inline uint hash_string(const char *str)
 {
 	uint i = 0, c;
 
-	while ((c = *str++))
+	while((c = *str++))
 		i = i * 37 + c;
 
 	return i;
diff --git a/intern/cycles/util/util_map.h b/intern/cycles/util/util_map.h
index 54d6a8d6424..46c2885f8b0 100644
--- a/intern/cycles/util/util_map.h
+++ b/intern/cycles/util/util_map.h
@@ -18,13 +18,38 @@
 #define __UTIL_MAP_H__
 
 #include <map>
-#include <boost/tr1/unordered_map.hpp>
+
+#if defined(CYCLES_TR1_UNORDERED_MAP)
+#  include <tr1/unordered_map>
+#endif
+
+#if defined(CYCLES_STD_UNORDERED_MAP) || defined(CYCLES_STD_UNORDERED_MAP_IN_TR1_NAMESPACE)
+#  include <unordered_map>
+#endif
+
+#if !defined(CYCLES_NO_UNORDERED_MAP) && !defined(CYCLES_TR1_UNORDERED_MAP) && \
+	!defined(CYCLES_STD_UNORDERED_MAP) && !defined(CYCLES_STD_UNORDERED_MAP_IN_TR1_NAMESPACE)  // NOLINT
+#  error One of: CYCLES_NO_UNORDERED_MAP, CYCLES_TR1_UNORDERED_MAP,\
+ CYCLES_STD_UNORDERED_MAP, CYCLES_STD_UNORDERED_MAP_IN_TR1_NAMESPACE must be defined!  // NOLINT
+#endif
+
 
 CCL_NAMESPACE_BEGIN
 
 using std::map;
 using std::pair;
+
+#if defined(CYCLES_NO_UNORDERED_MAP)
+typedef std::map unordered_map;
+#endif
+
+#if defined(CYCLES_TR1_UNORDERED_MAP) || defined(CYCLES_STD_UNORDERED_MAP_IN_TR1_NAMESPACE)
 using std::tr1::unordered_map;
+#endif
+
+#if defined(CYCLES_STD_UNORDERED_MAP)
+using std::unordered_map;
+#endif
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index a4d49681a38..2262f8fdbb7 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -175,6 +175,15 @@ ccl_device_inline float clamp(float a, float mn, float mx)
 
 #endif
 
+#ifndef __KERNEL_CUDA__
+
+ccl_device_inline float saturate(float a)
+{
+	return clamp(a, 0.0f, 1.0f);
+}
+
+#endif
+
 ccl_device_inline int float_to_int(float f)
 {
 	return (int)f;
@@ -1461,9 +1470,9 @@ ccl_device_inline float2 map_to_tube(const float3 co)
 {
 	float len, u, v;
 	len = sqrtf(co.x * co.x + co.y * co.y);
-	if (len > 0.0f) {
+	if(len > 0.0f) {
 		u = (1.0f - (atan2f(co.x / len, co.y / len) / M_PI_F)) * 0.5f;
-		v = (co.x + 1.0f) * 0.5f;
+		v = (co.z + 1.0f) * 0.5f;
 	}
 	else {
 		u = v = 0.0f;
diff --git a/intern/cycles/util/util_math_fast.h b/intern/cycles/util/util_math_fast.h
index 4ad81e9c015..c1a1be603f4 100644
--- a/intern/cycles/util/util_math_fast.h
+++ b/intern/cycles/util/util_math_fast.h
@@ -360,7 +360,7 @@ ccl_device float fast_log2f(float x)
 {
 	/* NOTE: clamp to avoid special cases and make result "safe" from large
 	 * negative values/nans. */
-	clamp(x, FLT_MIN, FLT_MAX);
+	x = clamp(x, FLT_MIN, FLT_MAX);
 	unsigned bits = __float_as_uint(x);
 	int exponent = (int)(bits >> 23) - 127;
 	float f = __uint_as_float((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
@@ -402,7 +402,7 @@ ccl_device float fast_logb(float x)
 {
 	/* Don't bother with denormals. */
 	x = fabsf(x);
-	clamp(x, FLT_MIN, FLT_MAX);
+	x = clamp(x, FLT_MIN, FLT_MAX);
 	unsigned bits = __float_as_uint(x);
 	return (int)(bits >> 23) - 127;
 }
@@ -410,7 +410,7 @@ ccl_device float fast_logb(float x)
 ccl_device float fast_exp2f(float x)
 {
 	/* Clamp to safe range for final addition. */
-	clamp(x, -126.0f, 126.0f);
+	x = clamp(x, -126.0f, 126.0f);
 	/* Range reduction. */
 	int m = (int)x; x -= m;
 	x = 1.0f - (1.0f - x); /* Crush denormals (does not affect max ulps!). */
@@ -539,7 +539,7 @@ ccl_device float fast_safe_powf(float x, float y)
  * bsdf_microfaset.h.
  */
 
-ccl_device float fast_erff(float x)
+ccl_device_inline float fast_erff(float x)
 {
 	/* Examined 1082130433 values of erff on [0,4]: 1.93715e-06 max error. */
 	/* Abramowitz and Stegun, 7.1.28. */
@@ -570,7 +570,7 @@ ccl_device_inline float fast_erfcf(float x)
 	return 1.0f - fast_erff(x);
 }
 
-ccl_device float fast_ierff(float x)
+ccl_device_inline float fast_ierff(float x)
 {
 	/* From: Approximating the erfinv function by Mike Giles. */
 	/* To avoid trouble at the limit, clamp input to 1-eps. */
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index b230bb1a627..c951c35fc76 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -102,28 +102,7 @@
 #ifdef _MSC_VER
 #include <intrin.h>
 #else
-
-#ifdef __KERNEL_SSE2__
-#include <xmmintrin.h> /* SSE 1 */
-#include <emmintrin.h> /* SSE 2 */
-#endif
-
-#ifdef __KERNEL_SSE3__
-#include <pmmintrin.h> /* SSE 3 */
-#endif
-
-#ifdef __KERNEL_SSSE3__
-#include <tmmintrin.h> /* SSSE 3 */
-#endif
-
-#ifdef __KERNEL_SSE41__
-#include <smmintrin.h> /* SSE 4.1 */
-#endif
-
-#ifdef __KERNEL_AVX__
-#include <immintrin.h> /* AVX */
-#endif
-
+#include <x86intrin.h>
 #endif
 
 #else
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index 0ff48630a81..0b35142ddb3 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -44,12 +44,12 @@ public:
 		substatus = "";
 		sync_status = "";
 		sync_substatus = "";
-		update_cb = NULL;
+		update_cb = function_null;
 		cancel = false;
 		cancel_message = "";
 		error = false;
 		error_message = "";
-		cancel_cb = NULL;
+		cancel_cb = function_null;
 	}
 
 	Progress(Progress& progress)
@@ -110,7 +110,7 @@ public:
 		return cancel_message;
 	}
 
-	void set_cancel_callback(boost::function<void(void)> function)
+	void set_cancel_callback(function<void(void)> function)
 	{
 		cancel_cb = function;
 	}
@@ -173,6 +173,12 @@ public:
 		tile_time_ = tile_time;
 	}
 
+	void get_time(double& total_time_, double& render_time_)
+	{
+		total_time_ = (total_time > 0.0)? total_time: 0.0;
+		render_time_ = (render_time > 0.0)? render_time: 0.0;
+	}
+
 	void reset_sample()
 	{
 		thread_scoped_lock lock(progress_mutex);
@@ -275,7 +281,7 @@ public:
 		}
 	}
 
-	void set_update_callback(boost::function<void(void)> function)
+	void set_update_callback(function<void(void)> function)
 	{
 		update_cb = function;
 	}
@@ -283,8 +289,8 @@ public:
 protected:
 	thread_mutex progress_mutex;
 	thread_mutex update_mutex;
-	boost::function<void(void)> update_cb;
-	boost::function<void(void)> cancel_cb;
+	function<void(void)> update_cb;
+	function<void(void)> cancel_cb;
 
 	int tile;    /* counter for rendered tiles */
 	int sample;  /* counter of rendered samples, global for all tiles */
diff --git a/intern/cycles/util/util_set.h b/intern/cycles/util/util_set.h
index 73a2bf19899..b3cb8dd8af5 100644
--- a/intern/cycles/util/util_set.h
+++ b/intern/cycles/util/util_set.h
@@ -18,13 +18,19 @@
 #define __UTIL_SET_H__
 
 #include <set>
-#include <boost/tr1/unordered_set.hpp>
-
+#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+#  include <unordered_set>
+#else
+#  include <boost/tr1/unordered_set.hpp>
+#endif
 CCL_NAMESPACE_BEGIN
 
 using std::set;
+#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+using std::unordered_set;
+#else
 using std::tr1::unordered_set;
-
+#endif
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_SET_H__ */
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 625f26c1316..7c15199d4e1 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -133,7 +133,7 @@ __forceinline int clz(const int x)
 #if defined(__KERNEL_AVX2__)
   return _lzcnt_u32(x);
 #else
-  if (UNLIKELY(x == 0)) return 32;
+  if(UNLIKELY(x == 0)) return 32;
   return 31 - __bsr(x);    
 #endif
 }
@@ -286,7 +286,7 @@ __forceinline int clz(const int x)
 #if defined(__KERNEL_AVX2__)
   return _lzcnt_u32(x);
 #else
-  if (UNLIKELY(x == 0)) return 32;
+  if(UNLIKELY(x == 0)) return 32;
   return 31 - __bsr(x);    
 #endif
 }
@@ -358,7 +358,7 @@ __forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) {
   char* _r = (char*)(&rvalue + 1);
   char* _v = (char*)(& value + 1);
   char* _i = (char*)(& input + 1);
-  for ( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32*)(_r + i)) = *((int32*)(_v + i))*  *((int32*)(_i + i));
+  for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32*)(_r + i)) = *((int32*)(_v + i))*  *((int32*)(_i + i));
   return rvalue;
 }
 
diff --git a/intern/cycles/util/util_string.cpp b/intern/cycles/util/util_string.cpp
index 8675ff3679d..66856dd8331 100644
--- a/intern/cycles/util/util_string.cpp
+++ b/intern/cycles/util/util_string.cpp
@@ -105,5 +105,22 @@ string string_strip(const string& s)
 
 }
 
+void string_replace(string& haystack, const string& needle, const string& other)
+{
+	size_t i;
+
+	while((i = haystack.find(needle)) != string::npos)
+		haystack.replace(i, needle.length(), other);
+}
+
+string string_remove_trademark(const string &s)
+{
+	string result = s;
+	string_replace(result, "(TM)", "");
+	string_replace(result, "(R)", "");
+
+	return string_strip(result);
+}
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/util/util_string.h b/intern/cycles/util/util_string.h
index fa1671fd1ee..6cb8d8df1e1 100644
--- a/intern/cycles/util/util_string.h
+++ b/intern/cycles/util/util_string.h
@@ -40,8 +40,10 @@ string string_printf(const char *format, ...) PRINTF_ATTRIBUTE;
 
 bool string_iequals(const string& a, const string& b);
 void string_split(vector<string>& tokens, const string& str, const string& separators = "\t ");
+void string_replace(string& haystack, const string& needle, const string& other);
 bool string_endswith(const string& s, const char *end);
 string string_strip(const string& s);
+string string_remove_trademark(const string& s);
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index 7206455debd..cc88320b68e 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -16,6 +16,7 @@
 
 #include "util_system.h"
 #include "util_types.h"
+#include "util_string.h"
 
 #ifdef _WIN32
 #if(!defined(FREE_WINDOWS))
@@ -75,14 +76,6 @@ static void __cpuid(int data[4], int selector)
 }
 #endif
 
-static void replace_string(string& haystack, const string& needle, const string& other)
-{
-	size_t i;
-
-	while((i = haystack.find(needle)) != string::npos)
-		haystack.replace(i, needle.length(), other);
-}
-
 string system_cpu_brand_string()
 {
 	char buf[48];
@@ -98,10 +91,7 @@ string system_cpu_brand_string()
 		string brand = buf;
 
 		/* make it a bit more presentable */
-		replace_string(brand, "(TM)", "");
-		replace_string(brand, "(R)", "");
-
-		brand = string_strip(brand);
+		brand = string_remove_trademark(brand);
 
 		return brand;
 	}
@@ -127,6 +117,7 @@ struct CPUCapabilities {
 	bool sse42;
 	bool sse4a;
 	bool avx;
+	bool f16c;
 	bool avx2;
 	bool xop;
 	bool fma3;
@@ -202,6 +193,8 @@ static CPUCapabilities& system_cpu_capabilities()
 				caps.avx = (xcr_feature_mask & 0x6) == 0x6;
 			}
 
+			caps.f16c = (result[2] & ((int)1 << 29)) != 0;
+
 			__cpuid(result, 0x00000007);
 			caps.bmi1 = (result[1] & ((int)1 << 3)) != 0;
 			caps.bmi2 = (result[1] & ((int)1 << 8)) != 0;
@@ -242,7 +235,7 @@ bool system_cpu_support_avx()
 bool system_cpu_support_avx2()
 {
 	CPUCapabilities& caps = system_cpu_capabilities();
-	return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx && caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2;
+	return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx && caps.f16c && caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2;
 }
 #else
 
diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
index e43b26ddfe2..d56553d1d4a 100644
--- a/intern/cycles/util/util_task.cpp
+++ b/intern/cycles/util/util_task.cpp
@@ -237,7 +237,7 @@ bool TaskScheduler::thread_wait_pop(Entry& entry)
 	return true;
 }
 
-void TaskScheduler::thread_run(int thread_id)
+void TaskScheduler::thread_run(int /*thread_id*/)
 {
 	Entry entry;
 
diff --git a/intern/cycles/util/util_task.h b/intern/cycles/util/util_task.h
index 8fac12e9987..debcff3b776 100644
--- a/intern/cycles/util/util_task.h
+++ b/intern/cycles/util/util_task.h
@@ -27,7 +27,7 @@ class Task;
 class TaskPool;
 class TaskScheduler;
 
-typedef boost::function<void(void)> TaskRunFunction;
+typedef function<void(void)> TaskRunFunction;
 
 /* Task
  *
diff --git a/intern/cycles/util/util_thread.h b/intern/cycles/util/util_thread.h
index fbbb9b42e31..9c19235d41d 100644
--- a/intern/cycles/util/util_thread.h
+++ b/intern/cycles/util/util_thread.h
@@ -17,7 +17,14 @@
 #ifndef __UTIL_THREAD_H__
 #define __UTIL_THREAD_H__
 
-#include <boost/thread.hpp>
+#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+#  include <thread>
+#  include <mutex>
+#  include <condition_variable>
+#  include <functional>
+#else
+#  include <boost/thread.hpp>
+#endif
 #include <pthread.h>
 #include <queue>
 
@@ -25,18 +32,24 @@
 
 CCL_NAMESPACE_BEGIN
 
+#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+typedef std::mutex thread_mutex;
+typedef std::unique_lock<std::mutex> thread_scoped_lock;
+typedef std::condition_variable thread_condition_variable;
+#else
 /* use boost for mutexes */
-
 typedef boost::mutex thread_mutex;
 typedef boost::mutex::scoped_lock thread_scoped_lock;
 typedef boost::condition_variable thread_condition_variable;
+#endif
 
 /* own pthread based implementation, to avoid boost version conflicts with
  * dynamically loaded blender plugins */
 
 class thread {
 public:
-	thread(boost::function<void(void)> run_cb_)
+	thread(function<void(void)> run_cb_)
+
 	{
 		joined = false;
 		run_cb = run_cb_;
@@ -63,7 +76,7 @@ public:
 	}
 
 protected:
-	boost::function<void(void)> run_cb;
+	function<void(void)> run_cb;
 	pthread_t pthread_id;
 	bool joined;
 };
diff --git a/intern/cycles/util/util_time.cpp b/intern/cycles/util/util_time.cpp
index 9668b0f9882..964f9f1a7af 100644
--- a/intern/cycles/util/util_time.cpp
+++ b/intern/cycles/util/util_time.cpp
@@ -71,7 +71,7 @@ void time_sleep(double t)
 
 	/* get microseconds */
 	int us = (int)(t * 1e6);
-	if (us > 0)
+	if(us > 0)
 		usleep(us);
 }
 
diff --git a/intern/cycles/util/util_transform.cpp b/intern/cycles/util/util_transform.cpp
index 0a1c09ae3d5..acaca69464c 100644
--- a/intern/cycles/util/util_transform.cpp
+++ b/intern/cycles/util/util_transform.cpp
@@ -46,9 +46,11 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "util_math.h"
 #include "util_transform.h"
 
+#include "util_boundbox.h"
+#include "util_math.h"
+
 CCL_NAMESPACE_BEGIN
 
 /* Transform Inverse */
@@ -271,5 +273,15 @@ void transform_motion_decompose(DecompMotionTransform *decomp, const MotionTrans
 	decomp->post_y = post.y;
 }
 
-CCL_NAMESPACE_END
+Transform transform_from_viewplane(BoundBox2D& viewplane)
+{
+	return
+		transform_scale(1.0f / (viewplane.right - viewplane.left),
+		                1.0f / (viewplane.top - viewplane.bottom),
+		                1.0f) *
+		transform_translate(-viewplane.left,
+		                    -viewplane.bottom,
+		                    0.0f);
+}
 
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h
index ac97fa53084..0b87db0a379 100644
--- a/intern/cycles/util/util_transform.h
+++ b/intern/cycles/util/util_transform.h
@@ -449,6 +449,8 @@ ccl_device void transform_motion_interpolate(Transform *tfm, const DecompMotionT
 
 #ifndef __KERNEL_GPU__
 
+class BoundBox2D;
+
 ccl_device_inline bool operator==(const MotionTransform& A, const MotionTransform& B)
 {
 	return (A.pre == B.pre && A.post == B.post);
@@ -456,7 +458,39 @@ ccl_device_inline bool operator==(const MotionTransform& A, const MotionTransfor
 
 float4 transform_to_quat(const Transform& tfm);
 void transform_motion_decompose(DecompMotionTransform *decomp, const MotionTransform *motion, const Transform *mid);
+Transform transform_from_viewplane(BoundBox2D& viewplane);
+
+#endif
+
+/* TODO(sergey): This is only for until we've got OpenCL 2.0
+ * on all devices we consider supported. It'll be replaced with
+ * generic address space.
+ */
 
+#ifdef __KERNEL_OPENCL__
+
+#define OPENCL_TRANSFORM_ADDRSPACE_GLUE(a, b) a ## b
+#define OPENCL_TRANSFORM_ADDRSPACE_DECLARE(function) \
+ccl_device_inline float3 OPENCL_TRANSFORM_ADDRSPACE_GLUE(function, _addrspace)( \
+    ccl_addr_space const Transform *t, const float3 a) \
+{ \
+  Transform private_tfm = *t; \
+  return function(&private_tfm, a); \
+}
+
+OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_point)
+OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_direction)
+OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_direction_transposed)
+
+#  undef OPENCL_TRANSFORM_ADDRSPACE_DECLARE
+#  undef OPENCL_TRANSFORM_ADDRSPACE_GLUE
+#  define transform_point_auto transform_point_addrspace
+#  define transform_direction_auto transform_direction_addrspace
+#  define transform_direction_transposed_auto transform_direction_transposed_addrspace
+#else
+#  define transform_point_auto transform_point
+#  define transform_direction_auto transform_direction
+#  define transform_direction_transposed_auto transform_direction_transposed
 #endif
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_vector.h b/intern/cycles/util/util_vector.h
index 92c3f116c69..15a65be0ef0 100644
--- a/intern/cycles/util/util_vector.h
+++ b/intern/cycles/util/util_vector.h
@@ -19,7 +19,8 @@
 
 /* Vector */
 
-#include <string.h>
+#include <cassert>
+#include <cstring>
 #include <vector>
 
 #include "util_aligned_malloc.h"
@@ -40,20 +41,16 @@ CCL_NAMESPACE_BEGIN
  *
  * - Have method to ensure capacity is re-set to 0.
  */
-template<typename value_type>
-class vector : public std::vector<value_type
+template<typename value_type,
 #ifdef WITH_CYCLES_DEBUG
-                                  , GuardedAllocator<value_type>
+         typename allocator_type = GuardedAllocator<value_type>
+#else
+         typename allocator_type = std::allocator<value_type>
 #endif
-                                  >
+        >
+class vector : public std::vector<value_type, allocator_type>
 {
 public:
-#ifdef WITH_CYCLES_DEBUG
-	typedef GuardedAllocator<value_type> allocator_type;
-#else
-	typedef std::allocator<value_type> allocator_type;
-#endif
-
 	/* Default constructor. */
 	explicit vector() : std::vector<value_type, allocator_type>() {  }
 
@@ -63,7 +60,7 @@ public:
 
 	/* Range constructor. */
 	template <class InputIterator>
-	vector (InputIterator first, InputIterator last)
+	vector(InputIterator first, InputIterator last)
 		: std::vector<value_type, allocator_type>(first, last) {  }
 
 	/* Copy constructor. */
@@ -78,7 +75,8 @@ public:
 #endif
 	}
 
-	void free_memory(void) {
+	void free_memory(void)
+	{
 		std::vector<value_type, allocator_type>::resize(0);
 		shrink_to_fit();
 	}
@@ -190,6 +188,7 @@ public:
 
 	T& operator[](size_t i) const
 	{
+		assert(i < datasize);
 		return data[i];
 	}
 
diff --git a/intern/cycles/util/util_view.cpp b/intern/cycles/util/util_view.cpp
index e77ebb2fe3e..9b5cd22fb4a 100644
--- a/intern/cycles/util/util_view.cpp
+++ b/intern/cycles/util/util_view.cpp
@@ -98,7 +98,7 @@ void view_display_help()
 	glColor3f(0.8f, 0.8f, 0.8f);
 
 	view_display_text(x1+20, y2-20, "Cycles Renderer");
-	view_display_text(x1+20, y2-40, "(C) 2011-2014 Blender Foundation");
+	view_display_text(x1+20, y2-40, "(C) 2011-2015 Blender Foundation");
 	view_display_text(x1+20, y2-80, "Controls:");
 	view_display_text(x1+20, y2-100, "h:  Info/Help");
 	view_display_text(x1+20, y2-120, "r:  Reset");
@@ -110,6 +110,7 @@ void view_display_help()
 	view_display_text(x1+20, y2-230, "Left mouse:  Move camera");
 	view_display_text(x1+20, y2-250, "Right mouse:  Rotate camera");
 	view_display_text(x1+20, y2-270, "W/A/S/D:  Move camera");
+	view_display_text(x1+20, y2-290, "0/1/2/3:  Set max bounces");
 
 	glColor3f(1.0f, 1.0f, 1.0f);
 }