78 files changed, 1129 insertions, 864 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index fb747c1313e..c8c71fe6856 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -14,7 +14,9 @@ include(cmake/external_libs.cmake)
 # todo: refactor this code to match scons
 # note: CXX_HAS_SSE is needed in case passing SSE flags fails altogether (gcc-arm)
 
-if(WIN32 AND MSVC)
+if(NOT WITH_CPU_SSE)
+	set(CXX_HAS_SSE FALSE)
+elseif(WIN32 AND MSVC)
 	set(CXX_HAS_SSE TRUE)
 
 	# /arch:AVX for VC2012 and above
@@ -161,6 +163,10 @@ include_directories(
 	${OPENEXR_INCLUDE_DIRS}
 )
 
+# TODO(sergey): Adjust so standalone repository is also happy.
+include_directories(
+	../atomic
+)
 
 # Warnings
 if(CMAKE_COMPILER_IS_GNUCXX)
diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript
index b399844534d..15a02881ec2 100644
--- a/intern/cycles/SConscript
+++ b/intern/cycles/SConscript
@@ -62,12 +62,23 @@ if env['WITH_BF_CYCLES_OSL']:
 if env['WITH_BF_CYCLES_DEBUG']:
     defs.append('WITH_CYCLES_DEBUG')
 
+if env['WITH_BF_CYCLES_LOGGING']:
+    defs.append('WITH_CYCLES_LOGGING')
+    defs.append('GOOGLE_GLOG_DLL_DECL=')
+    if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', 'win64-mingw'):
+        incs.append('#extern/libmv/third_party/glog/src/windows')
+        incs.append('#extern/libmv/third_party/gflags')
+    else:
+        incs.append('#extern/libmv/third_party/glog/src')
+        incs.append('#extern/libmv/third_party/gflags')
+
 incs.extend('. bvh render device kernel kernel/osl kernel/svm util subd'.split())
 incs.extend('#intern/guardedalloc #source/blender/makesrna #source/blender/makesdna #source/blender/blenlib'.split())
 incs.extend('#source/blender/blenloader ../../source/blender/makesrna/intern'.split())
 
 incs.append(env['BF_GLEW_INC'])
 incs.append('#/intern/glew-mx')
+incs.append('#/intern/atomic')
 incs.append('#intern/mikktspace')
 incs.extend('#extern/glew/include #extern/clew/include #extern/cuew/include #intern/mikktspace'.split())
 
diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp
index 431796e106b..528b3016b80 100644
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -299,7 +299,6 @@ static void xml_read_integrator(const XMLReadState& state, pugi::xml_node node)
 	xml_read_bool(&integrator->transparent_shadows, node, "transparent_shadows");
 	
 	/* Volume */
-	xml_read_int(&integrator->volume_homogeneous_sampling, node, "volume_homogeneous_sampling");
 	xml_read_float(&integrator->volume_step_size, node, "volume_step_size");
 	xml_read_int(&integrator->volume_max_steps, node, "volume_max_steps");
 	
@@ -803,7 +802,17 @@ static void xml_read_shader(const XMLReadState& state, pugi::xml_node node)
 	xml_read_string(&shader->name, node, "name");
 	xml_read_bool(&shader->use_mis, node, "use_mis");
 	xml_read_bool(&shader->use_transparent_shadow, node, "use_transparent_shadow");
+
+	/* Volume */
 	xml_read_bool(&shader->heterogeneous_volume, node, "heterogeneous_volume");
+	xml_read_int(&shader->volume_interpolation_method, node, "volume_interpolation_method");
+
+	if(xml_equal_string(node, "volume_sampling_method", "distance"))
+		shader->volume_sampling_method = VOLUME_SAMPLING_DISTANCE;
+	else if(xml_equal_string(node, "volume_sampling_method", "equiangular"))
+		shader->volume_sampling_method = VOLUME_SAMPLING_EQUIANGULAR;
+	else if(xml_equal_string(node, "volume_sampling_method", "multiple_importance"))
+		shader->volume_sampling_method = VOLUME_SAMPLING_MULTIPLE_IMPORTANCE;
 
 	xml_read_shader_graph(state, shader, node);
 	state.scene->shaders.push_back(shader);
@@ -816,6 +825,14 @@ static void xml_read_background(const XMLReadState& state, pugi::xml_node node)
 	Shader *shader = state.scene->shaders[state.scene->default_background];
 	
 	xml_read_bool(&shader->heterogeneous_volume, node, "heterogeneous_volume");
+	xml_read_int(&shader->volume_interpolation_method, node, "volume_interpolation_method");
+
+	if(xml_equal_string(node, "volume_sampling_method", "distance"))
+		shader->volume_sampling_method = VOLUME_SAMPLING_DISTANCE;
+	else if(xml_equal_string(node, "volume_sampling_method", "equiangular"))
+		shader->volume_sampling_method = VOLUME_SAMPLING_EQUIANGULAR;
+	else if(xml_equal_string(node, "volume_sampling_method", "multiple_importance"))
+		shader->volume_sampling_method = VOLUME_SAMPLING_MULTIPLE_IMPORTANCE;
 
 	xml_read_shader_graph(state, shader, node);
 }
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 597ac1a9ce0..9459b750bd1 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -114,6 +114,11 @@ enum_volume_sampling = (
     ('MULTIPLE_IMPORTANCE', "Multiple Importance", "Combine distance and equi-angular sampling for volumes where neither method is ideal"),
     )
 
+enum_volume_interpolation = (
+    ('LINEAR', "Linear", "Good smoothness and speed"),
+    ('CUBIC', "Cubic", "Smoothed high quality interpolation, but slower"),
+    )
+
 
 class CyclesRenderSettings(bpy.types.PropertyGroup):
     @classmethod
@@ -345,7 +350,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 description="Distance between volume shader samples when rendering the volume "
                             "(lower values give more accurate and detailed results, but also increased render time)",
                 default=0.1,
-                min=0.0000001, max=100000.0
+                min=0.0000001, max=100000.0, soft_min=0.01, soft_max=1.0
                 )
 
         cls.volume_max_steps = IntProperty(
@@ -617,6 +622,13 @@ class CyclesMaterialSettings(bpy.types.PropertyGroup):
                 default='DISTANCE',
                 )
 
+        cls.volume_interpolation = EnumProperty(
+                name="Volume Interpolation",
+                description="Interpolation method to use for volumes",
+                items=enum_volume_interpolation,
+                default='LINEAR',
+                )
+
     @classmethod
     def unregister(cls):
         del bpy.types.Material.cycles
@@ -641,6 +653,12 @@ class CyclesLampSettings(bpy.types.PropertyGroup):
                 min=1, max=10000,
                 default=1,
                 )
+        cls.max_bounces = IntProperty(
+                name="Max Bounces",
+                description="Maximum number of bounces the light will contribute to the render",
+                min=0, max=1024,
+                default=1024,
+                )
         cls.use_multiple_importance_sampling = BoolProperty(
                 name="Multiple Importance Sample",
                 description="Use multiple importance sampling for the lamp, "
@@ -693,6 +711,13 @@ class CyclesWorldSettings(bpy.types.PropertyGroup):
                 default='EQUIANGULAR',
                 )
 
+        cls.volume_interpolation = EnumProperty(
+                name="Volume Interpolation",
+                description="Interpolation method to use for volumes",
+                items=enum_volume_interpolation,
+                default='LINEAR',
+                )
+
     @classmethod
     def unregister(cls):
         del bpy.types.World.cycles
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 9632b12c414..9b1e20d3c8f 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -729,11 +729,11 @@ class CyclesLamp_PT_lamp(CyclesButtonsPanel, Panel):
 
         if cscene.progressive == 'BRANCHED_PATH':
             col.prop(clamp, "samples")
+        col.prop(clamp, "max_bounces")
 
         col = split.column()
         col.prop(clamp, "cast_shadow")
-
-        layout.prop(clamp, "use_multiple_importance_sampling")
+        col.prop(clamp, "use_multiple_importance_sampling", text="Multiple Importance")
 
         if lamp.type == 'HEMI':
             layout.label(text="Not supported, interpreted as sun lamp")
@@ -936,6 +936,7 @@ class CyclesWorld_PT_settings(CyclesButtonsPanel, Panel):
         sub = col.column()
         sub.active = use_cpu(context)
         sub.prop(cworld, "volume_sampling", text="")
+        sub.prop(cworld, "volume_interpolation", text="")
         col.prop(cworld, "homogeneous_volume", text="Homogeneous")
 
 
@@ -1019,17 +1020,6 @@ class CyclesMaterial_PT_settings(CyclesButtonsPanel, Panel):
         cmat = mat.cycles
 
         split = layout.split()
-
-        col = split.column(align=True)
-        col.prop(mat, "diffuse_color", text="Viewport Color")
-        col.prop(mat, "alpha")
-
-        col = split.column(align=True)
-        col.label()
-        col.prop(mat, "pass_index")
-
-        split = layout.split()
-
         col = split.column()
         col.label(text="Surface:")
         col.prop(cmat, "sample_as_light", text="Multiple Importance")
@@ -1040,8 +1030,25 @@ class CyclesMaterial_PT_settings(CyclesButtonsPanel, Panel):
         sub = col.column()
         sub.active = use_cpu(context)
         sub.prop(cmat, "volume_sampling", text="")
+        col.prop(cmat, "volume_interpolation", text="")
         col.prop(cmat, "homogeneous_volume", text="Homogeneous")
 
+        layout.separator()
+        split = layout.split()
+
+        col = split.column(align=True)
+        col.label("Viewport Color:")
+        col.prop(mat, "diffuse_color", text="")
+        col.prop(mat, "alpha")
+
+        col.separator()
+        col.prop(mat, "pass_index")
+
+        col = split.column(align=True)
+        col.label("Viewport Specular:")
+        col.prop(mat, "specular_color", text="")
+        col.prop(mat, "specular_hardness", text="Hardness")
+
 
 class CyclesTexture_PT_context(CyclesButtonsPanel, Panel):
     bl_label = ""
@@ -1381,7 +1388,11 @@ def get_panels():
         "RENDER_PT_encoding",
         "RENDER_PT_dimensions",
         "RENDER_PT_stamp",
+        "RENDER_PT_freestyle",
         "RENDERLAYER_PT_layers",
+        "RENDERLAYER_PT_freestyle",
+        "RENDERLAYER_PT_freestyle_lineset",
+        "RENDERLAYER_PT_freestyle_linestyle",
         "SCENE_PT_scene",
         "SCENE_PT_color_management",
         "SCENE_PT_custom_props",
@@ -1419,6 +1430,7 @@ def get_panels():
         "DATA_PT_custom_props_curve",
         "DATA_PT_custom_props_lattice",
         "DATA_PT_custom_props_metaball",
+        "TEXTURE_PT_preview",
         "TEXTURE_PT_custom_props",
         "TEXTURE_PT_clouds",
         "TEXTURE_PT_wood",
@@ -1436,6 +1448,7 @@ def get_panels():
         "TEXTURE_PT_pointdensity",
         "TEXTURE_PT_pointdensity_turbulence",
         "TEXTURE_PT_mapping",
+        "TEXTURE_PT_ocean",
         "TEXTURE_PT_influence",
         "TEXTURE_PT_colors",
         "PARTICLE_PT_context_particles",
@@ -1457,6 +1470,7 @@ def get_panels():
         "PARTICLE_PT_force_fields",
         "PARTICLE_PT_vertexgroups",
         "MATERIAL_PT_custom_props",
+        "MATERIAL_PT_freestyle_line",
         "BONE_PT_custom_props",
         "OBJECT_PT_custom_props",
         ]
diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp
index ce8c64c4819..416348f3b91 100644
--- a/intern/cycles/blender/blender_camera.cpp
+++ b/intern/cycles/blender/blender_camera.cpp
@@ -20,6 +20,8 @@
 #include "blender_sync.h"
 #include "blender_util.h"
 
+#include "util_logging.h"
+
 CCL_NAMESPACE_BEGIN
 
 /* Blender Camera Intermediate: we first convert both the offline and 3d view
@@ -400,6 +402,7 @@ void BlenderSync::sync_camera_motion(BL::Object b_ob, float motion_time)
 	tfm = blender_camera_matrix(tfm, cam->type);
 
 	if(tfm != cam->matrix) {
+		VLOG(1) << "Camera " << b_ob.name() << " motion detected.";
 		if(motion_time == -1.0f) {
 			cam->motion.pre = tfm;
 			cam->use_motion = true;
diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp
index 8cfaea59a06..7c8e7d40119 100644
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -25,6 +25,7 @@
 #include "blender_util.h"
 
 #include "util_foreach.h"
+#include "util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -577,6 +578,10 @@ void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
 		}
 	}
 
+	if (num_curves > 0) {
+		VLOG(1) << "Exporting curve segments for mesh " << mesh->name;
+	}
+
 	mesh->curve_keys.reserve(mesh->curve_keys.size() + num_keys);
 	mesh->curves.reserve(mesh->curves.size() + num_curves);
 
@@ -612,9 +617,9 @@ void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
 		}
 	}
 
-	/* check allocation*/
+	/* check allocation */
 	if((mesh->curve_keys.size() !=  num_keys) || (mesh->curves.size() !=  num_curves)) {
-		/* allocation failed -> clear data */
+		VLOG(1) << "Allocation failed, clearing data";
 		mesh->curve_keys.clear();
 		mesh->curves.clear();
 		mesh->curve_attributes.clear();
@@ -623,12 +628,16 @@ void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
 
 static void ExportCurveSegmentsMotion(Scene *scene, Mesh *mesh, ParticleCurveData *CData, int time_index)
 {
+	VLOG(1) << "Exporting curve motion segments for mesh " << mesh->name
+	        << ", time index " << time_index;
+
 	/* find attribute */
 	Attribute *attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
 	bool new_attribute = false;
 
 	/* add new attribute if it doesn't exist already */
 	if(!attr_mP) {
+		VLOG(1) << "Creating new motion vertex position attribute";
 		attr_mP = mesh->curve_attributes.add(ATTR_STD_MOTION_VERTEX_POSITION);
 		new_attribute = true;
 	}
@@ -675,9 +684,12 @@ static void ExportCurveSegmentsMotion(Scene *scene, Mesh *mesh, ParticleCurveDat
 	if(new_attribute) {
 		if(i != numkeys || !have_motion) {
 			/* no motion, remove attributes again */
+			VLOG(1) << "No motion, removing attribute";
 			mesh->curve_attributes.remove(ATTR_STD_MOTION_VERTEX_POSITION);
 		}
 		else if(time_index > 0) {
+			VLOG(1) << "Filling in new motion vertex position for time_index "
+			        << time_index;
 			/* motion, fill up previous steps that we might have skipped because
 			 * they had no motion, but we need them anyway now */
 			for(int step = 0; step < time_index; step++) {
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index a5e4b7bd2ae..e8da8a87c1d 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -27,6 +27,7 @@
 #include "subd_split.h"
 
 #include "util_foreach.h"
+#include "util_logging.h"
 
 #include "mikktspace.h"
 
@@ -761,11 +762,13 @@ void BlenderSync::sync_mesh_motion(BL::Object b_ob, Object *object, float motion
 		if(new_attribute) {
 			if(i != numverts || memcmp(mP, &mesh->verts[0], sizeof(float3)*numverts) == 0) {
 				/* no motion, remove attributes again */
+				VLOG(1) << "No actual motion for mesh " << b_mesh.name();
 				mesh->attributes.remove(ATTR_STD_MOTION_VERTEX_POSITION);
 				if(attr_mN)
 					mesh->attributes.remove(ATTR_STD_MOTION_VERTEX_NORMAL);
 			}
 			else if(time_index > 0) {
+				VLOG(1) << "Filling motion for mesh " << b_mesh.name();
 				/* motion, fill up previous steps that we might have skipped because
 				 * they had no motion, but we need them anyway now */
 				float3 *P = &mesh->verts[0];
diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp
index 1e07c5f9c96..88bfbf6db74 100644
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -30,6 +30,7 @@
 
 #include "util_foreach.h"
 #include "util_hash.h"
+#include "util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -168,6 +169,8 @@ void BlenderSync::sync_light(BL::Object b_parent, int persistent_id[OBJECT_PERSI
 	else
 		light->samples = samples;
 
+	light->max_bounces = get_int(clamp, "max_bounces");
+
 	/* visibility */
 	uint visibility = object_ray_visibility(b_ob);
 	light->use_diffuse = (visibility & PATH_RAY_DIFFUSE) != 0;
@@ -249,6 +252,7 @@ Object *BlenderSync::sync_object(BL::Object b_parent, int persistent_id[OBJECT_P
 		if(object && (scene->need_motion() == Scene::MOTION_PASS || object_use_motion(b_ob))) {
 			/* object transformation */
 			if(tfm != object->tfm) {
+				VLOG(1) << "Object " << b_ob.name() << " motion detected.";
 				if(motion_time == -1.0f) {
 					object->motion.pre = tfm;
 					object->use_motion = true;
@@ -458,10 +462,10 @@ void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, float motion_time)
 	BL::Scene::object_bases_iterator b_base;
 	BL::Scene b_sce = b_scene;
 	/* modifier result type (not exposed as enum in C++ API)
-     * 1 : DAG_EVAL_PREVIEW
-     * 2 : DAG_EVAL_RENDER
-     */
-    int dupli_settings = preview ? 1 : 2;
+	 * 1 : DAG_EVAL_PREVIEW
+	 * 2 : DAG_EVAL_RENDER
+	 */
+	int dupli_settings = preview ? 1 : 2;
 
 	bool cancel = false;
 
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index b756d6acdb2..78a96319163 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -53,14 +53,36 @@ void python_thread_state_restore(void **python_thread_state)
 	*python_thread_state = NULL;
 }
 
+static const char *PyC_UnicodeAsByte(PyObject *py_str, PyObject **coerce)
+{
+#ifdef WIN32
+	/* bug [#31856] oddly enough, Python 3.2 --> 3.3 on Windows will throw an
+	 * exception here; this needs to be fixed in Python,
+	 * see: bugs.python.org/issue15859 */
+	if(!PyUnicode_Check(py_str)) {
+		PyErr_BadArgument();
+		return "";
+	}
+#endif
+	if((*coerce = PyUnicode_EncodeFSDefault(py_str))) {
+		return PyBytes_AS_STRING(*coerce);
+	}
+	return "";
+}
+
 static PyObject *init_func(PyObject *self, PyObject *args)
 {
-	const char *path, *user_path;
+	PyObject *path, *user_path;
 
-	if(!PyArg_ParseTuple(args, "ss", &path, &user_path))
+	if(!PyArg_ParseTuple(args, "OO", &path, &user_path)) {
 		return NULL;
-	
-	path_init(path, user_path);
+	}
+
+	PyObject *path_coerce = NULL, *user_path_coerce = NULL;
+	path_init(PyC_UnicodeAsByte(path, &path_coerce),
+	          PyC_UnicodeAsByte(user_path, &user_path_coerce));
+	Py_XDECREF(path_coerce);
+	Py_XDECREF(user_path_coerce);
 
 	Py_RETURN_NONE;
 }
@@ -83,7 +105,7 @@ static PyObject *create_func(PyObject *self, PyObject *args)
 	BL::UserPreferences userpref(userprefptr);
 
 	PointerRNA dataptr;
-	RNA_id_pointer_create((ID*)PyLong_AsVoidPtr(pydata), &dataptr);
+	RNA_main_pointer_create((Main*)PyLong_AsVoidPtr(pydata), &dataptr);
 	BL::BlendData data(dataptr);
 
 	PointerRNA sceneptr;
@@ -91,15 +113,15 @@ static PyObject *create_func(PyObject *self, PyObject *args)
 	BL::Scene scene(sceneptr);
 
 	PointerRNA regionptr;
-	RNA_id_pointer_create((ID*)pylong_as_voidptr_typesafe(pyregion), &regionptr);
+	RNA_pointer_create(NULL, &RNA_Region, pylong_as_voidptr_typesafe(pyregion), &regionptr);
 	BL::Region region(regionptr);
 
 	PointerRNA v3dptr;
-	RNA_id_pointer_create((ID*)pylong_as_voidptr_typesafe(pyv3d), &v3dptr);
+	RNA_pointer_create(NULL, &RNA_SpaceView3D, pylong_as_voidptr_typesafe(pyv3d), &v3dptr);
 	BL::SpaceView3D v3d(v3dptr);
 
 	PointerRNA rv3dptr;
-	RNA_id_pointer_create((ID*)pylong_as_voidptr_typesafe(pyrv3d), &rv3dptr);
+	RNA_pointer_create(NULL, &RNA_RegionView3D, pylong_as_voidptr_typesafe(pyrv3d), &rv3dptr);
 	BL::RegionView3D rv3d(rv3dptr);
 
 	/* create session */
@@ -174,7 +196,7 @@ static PyObject *bake_func(PyObject *self, PyObject *args)
 	void *b_result = PyLong_AsVoidPtr(pyresult);
 
 	PointerRNA bakepixelptr;
-	RNA_id_pointer_create((ID*)PyLong_AsVoidPtr(pypixel_array), &bakepixelptr);
+	RNA_pointer_create(NULL, &RNA_BakePixel, PyLong_AsVoidPtr(pypixel_array), &bakepixelptr);
 	BL::BakePixel b_bake_pixel(bakepixelptr);
 
 	python_thread_state_save(&session->python_thread_state);
@@ -216,7 +238,7 @@ static PyObject *reset_func(PyObject *self, PyObject *args)
 	BlenderSession *session = (BlenderSession*)PyLong_AsVoidPtr(pysession);
 
 	PointerRNA dataptr;
-	RNA_id_pointer_create((ID*)PyLong_AsVoidPtr(pydata), &dataptr);
+	RNA_main_pointer_create((Main*)PyLong_AsVoidPtr(pydata), &dataptr);
 	BL::BlendData b_data(dataptr);
 
 	PointerRNA sceneptr;
@@ -363,13 +385,7 @@ static PyObject *osl_update_node_func(PyObject *self, PyObject *args)
 		/* find socket socket */
 		BL::NodeSocket b_sock(PointerRNA_NULL);
 		if (param->isoutput) {
-#if OSL_LIBRARY_VERSION_CODE < 10500
-			b_sock = b_node.outputs[param->name];
-#else
 			b_sock = b_node.outputs[param->name.string()];
-#endif
-
-			
 			/* remove if type no longer matches */
 			if(b_sock && b_sock.bl_idname() != socket_type) {
 				b_node.outputs.remove(b_sock);
@@ -377,12 +393,7 @@ static PyObject *osl_update_node_func(PyObject *self, PyObject *args)
 			}
 		}
 		else {
-#if OSL_LIBRARY_VERSION_CODE < 10500
-			b_sock = b_node.inputs[param->name];
-#else
 			b_sock = b_node.inputs[param->name.string()];
-#endif
-			
 			/* remove if type no longer matches */
 			if(b_sock && b_sock.bl_idname() != socket_type) {
 				b_node.inputs.remove(b_sock);
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 57ffea4b1a9..79ab25483e2 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -92,6 +92,7 @@ void BlenderSession::create_session()
 
 	/* reset status/progress */
 	last_status = "";
+	last_error = "";
 	last_progress = -1.0f;
 	start_resize_time = 0.0;
 
@@ -826,10 +827,8 @@ void BlenderSession::update_status_progress()
 	get_status(status, substatus);
 	get_progress(progress, total_time);
 
-	
-
 	if(background) {
-		if(progress>0)
+		if(progress > 0)
 			remaining_time = (1.0 - (double)progress) * (total_time / (double)progress);
 
 		scene += " | " + b_scene.name();
@@ -843,12 +842,12 @@ void BlenderSession::update_status_progress()
 		if(samples > 0 && total_samples != USHRT_MAX)
 			remaining_time = (total_samples - samples) * (total_time / samples);
 	}
-	
-	if(remaining_time>0) {
+
+	if(remaining_time > 0) {
 		BLI_timestr(remaining_time, time_str, sizeof(time_str));
 		timestatus += "Remaining:" + string(time_str) + " | ";
 	}
-	
+
 	timestatus += string_printf("Mem:%.2fM, Peak:%.2fM", (double)mem_used, (double)mem_peak);
 
 	if(status.size() > 0)
@@ -865,6 +864,21 @@ void BlenderSession::update_status_progress()
 		b_engine.update_progress(progress);
 		last_progress = progress;
 	}
+
+	if (session->progress.get_error()) {
+		string error = session->progress.get_error_message();
+		if(error != last_error) {
+			/* TODO(sergey): Currently the C++ RNA API doesn't let us
+			 * use a mnemonic name for the variable. Would be nice to
+			 * have this figured out.
+			 *
+			 * Until then, 1 << 5 means RPT_ERROR.
+			 */
+			b_engine.report(1 << 5, error.c_str());
+			b_engine.error_set(error.c_str());
+			last_error = error;
+		}
+	}
 }
 
 void BlenderSession::tag_update()
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index ac685118b3d..143a23af5c6 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -91,6 +91,7 @@ public:
 	string b_rlay_name;
 
 	string last_status;
+	string last_error;
 	float last_progress;
 
 	int width, height;
diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp
index 33c7bf5f859..27c2e9e9ae8 100644
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -1014,7 +1014,8 @@ void BlenderSync::sync_materials(bool update_all)
 			shader->use_mis = get_boolean(cmat, "sample_as_light");
 			shader->use_transparent_shadow = get_boolean(cmat, "use_transparent_shadow");
 			shader->heterogeneous_volume = !get_boolean(cmat, "homogeneous_volume");
-			shader->volume_sampling_method = RNA_enum_get(&cmat, "volume_sampling");
+			shader->volume_sampling_method = (VolumeSampling)RNA_enum_get(&cmat, "volume_sampling");
+			shader->volume_interpolation_method = (VolumeInterpolation)RNA_enum_get(&cmat, "volume_interpolation");
 
 			shader->set_graph(graph);
 			shader->tag_update(scene);
@@ -1044,7 +1045,8 @@ void BlenderSync::sync_world(bool update_all)
 			/* volume */
 			PointerRNA cworld = RNA_pointer_get(&b_world.ptr, "cycles");
 			shader->heterogeneous_volume = !get_boolean(cworld, "homogeneous_volume");
-			shader->volume_sampling_method = RNA_enum_get(&cworld, "volume_sampling");
+			shader->volume_sampling_method = (VolumeSampling)RNA_enum_get(&cworld, "volume_sampling");
+			shader->volume_interpolation_method = (VolumeInterpolation)RNA_enum_get(&cworld, "volume_interpolation");
 		}
 		else if(b_world) {
 			ShaderNode *closure, *out;
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 2ac90b34fd7..ee492dd00e4 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -515,7 +515,17 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine b_engine, BL::Use
 		params.shadingsystem = SHADINGSYSTEM_OSL;
 	
 	/* color managagement */
-	params.display_buffer_linear = GLEW_ARB_half_float_pixel && b_engine.support_display_space_shader(b_scene);
+#ifdef GLEW_MX
+	/* When using GLEW MX we need to check whether we've got an OpenGL
+	 * context for the current window. This is because command line
+	 * rendering doesn't actually have an OpenGL context.
+	 */
+	if(glewGetContext() != NULL)
+#endif
+	{
+		params.display_buffer_linear = GLEW_ARB_half_float_pixel &&
+		                               b_engine.support_display_space_shader(b_scene);
+	}
 
 	return params;
 }
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index eb4cca92b6b..5547229a910 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -28,6 +28,7 @@
 
 #include "util_debug.h"
 #include "util_foreach.h"
+#include "util_logging.h"
 #include "util_progress.h"
 #include "util_time.h"
 
@@ -223,7 +224,8 @@ BVHNode* BVHBuild::run()
 	spatial_right_bounds.resize(max(root.size(), (int)BVHParams::NUM_SPATIAL_BINS) - 1);
 
 	/* init progress updates */
-	progress_start_time = time_dt();
+	double build_start_time;
+	build_start_time = progress_start_time = time_dt();
 	progress_count = 0;
 	progress_total = references.size();
 	progress_original_total = progress_total;
@@ -258,6 +260,10 @@ BVHNode* BVHBuild::run()
 		}
 	}
 
+	VLOG(1) << "BVH built in "
+	        << time_dt() - build_start_time
+	        << " seconds.";
+
 	return rootnode;
 }
 
@@ -394,7 +400,7 @@ BVHNode* BVHBuild::build_node(const BVHRange& range, int level)
 	progress_total += left.size() + right.size() - range.size();
 	size_t total = progress_total;
 
-	/* leaft node */
+	/* left node */
 	BVHNode *leftnode = build_node(left, level + 1);
 
 	/* right node (modify start for splits) */
diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h
index e073b69472e..43c2d9b2683 100644
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@@ -115,6 +115,11 @@ public:
 	__forceinline int prim_object() const { return __float_as_int(rbounds.max.w); }
 	__forceinline int prim_type() const { return type; }
 
+	BVHReference& operator=(const BVHReference &arg) {  /* Raw byte-wise copy of the whole object. */
+		if(this != &arg) memcpy(this, &arg, sizeof(BVHReference));  /* memcpy() regions must not overlap. */
+		return *this;
+	}
+
 protected:
 	BoundBox rbounds;
 	uint type;
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index c9b8a5b726b..e5242e7ee47 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -170,124 +170,42 @@ public:
 #endif
 
 		RenderTile tile;
-		
-		while(task.acquire_tile(this, tile)) {
-			float *render_buffer = (float*)tile.buffer;
-			uint *rng_state = (uint*)tile.rng_state;
-			int start_sample = tile.start_sample;
-			int end_sample = tile.start_sample + tile.num_samples;
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-			if(system_cpu_support_avx2()) {
-				for(int sample = start_sample; sample < end_sample; sample++) {
-					if (task.get_cancel() || task_pool.canceled()) {
-						if(task.need_finish_queue == false)
-							break;
-					}
-
-					for(int y = tile.y; y < tile.y + tile.h; y++) {
-						for(int x = tile.x; x < tile.x + tile.w; x++) {
-							kernel_cpu_avx2_path_trace(&kg, render_buffer, rng_state,
-													  sample, x, y, tile.offset, tile.stride);
-						}
-					}
-
-					tile.sample = sample + 1;
+		void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);
 
-					task.update_progress(&tile);
-				}
-			}
-			else
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+		if(system_cpu_support_avx2())
+			path_trace_kernel = kernel_cpu_avx2_path_trace;
+		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-			if(system_cpu_support_avx()) {
-				for(int sample = start_sample; sample < end_sample; sample++) {
-					if (task.get_cancel() || task_pool.canceled()) {
-						if(task.need_finish_queue == false)
-							break;
-					}
-
-					for(int y = tile.y; y < tile.y + tile.h; y++) {
-						for(int x = tile.x; x < tile.x + tile.w; x++) {
-							kernel_cpu_avx_path_trace(&kg, render_buffer, rng_state,
-								sample, x, y, tile.offset, tile.stride);
-						}
-					}
-
-					tile.sample = sample + 1;
-
-					task.update_progress(&tile);
-				}
-			}
-			else
+		if(system_cpu_support_avx())
+			path_trace_kernel = kernel_cpu_avx_path_trace;
+		else
 #endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
-			if(system_cpu_support_sse41()) {
-				for(int sample = start_sample; sample < end_sample; sample++) {
-					if (task.get_cancel() || task_pool.canceled()) {
-						if(task.need_finish_queue == false)
-							break;
-					}
-
-					for(int y = tile.y; y < tile.y + tile.h; y++) {
-						for(int x = tile.x; x < tile.x + tile.w; x++) {
-							kernel_cpu_sse41_path_trace(&kg, render_buffer, rng_state,
-								sample, x, y, tile.offset, tile.stride);
-						}
-					}
-
-					tile.sample = sample + 1;
-
-					task.update_progress(&tile);
-				}
-			}
-			else
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+		if(system_cpu_support_sse41())
+			path_trace_kernel = kernel_cpu_sse41_path_trace;
+		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-			if(system_cpu_support_sse3()) {
-				for(int sample = start_sample; sample < end_sample; sample++) {
-					if (task.get_cancel() || task_pool.canceled()) {
-						if(task.need_finish_queue == false)
-							break;
-					}
-
-					for(int y = tile.y; y < tile.y + tile.h; y++) {
-						for(int x = tile.x; x < tile.x + tile.w; x++) {
-							kernel_cpu_sse3_path_trace(&kg, render_buffer, rng_state,
-								sample, x, y, tile.offset, tile.stride);
-						}
-					}
-
-					tile.sample = sample + 1;
-
-					task.update_progress(&tile);
-				}
-			}
-			else
+		if(system_cpu_support_sse3())
+			path_trace_kernel = kernel_cpu_sse3_path_trace;
+		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-			if(system_cpu_support_sse2()) {
-				for(int sample = start_sample; sample < end_sample; sample++) {
-					if (task.get_cancel() || task_pool.canceled()) {
-						if(task.need_finish_queue == false)
-							break;
-					}
-
-					for(int y = tile.y; y < tile.y + tile.h; y++) {
-						for(int x = tile.x; x < tile.x + tile.w; x++) {
-							kernel_cpu_sse2_path_trace(&kg, render_buffer, rng_state,
-								sample, x, y, tile.offset, tile.stride);
-						}
-					}
-
-					tile.sample = sample + 1;
-
-					task.update_progress(&tile);
-				}
-			}
-			else
+		if(system_cpu_support_sse2())
+			path_trace_kernel = kernel_cpu_sse2_path_trace;
+		else
 #endif
-			{
+			path_trace_kernel = kernel_cpu_path_trace;
+		
+		while(task.acquire_tile(this, tile)) {
+			float *render_buffer = (float*)tile.buffer;
+			uint *rng_state = (uint*)tile.rng_state;
+			int start_sample = tile.start_sample;
+			int end_sample = tile.start_sample + tile.num_samples;
+
 				for(int sample = start_sample; sample < end_sample; sample++) {
 					if (task.get_cancel() || task_pool.canceled()) {
 						if(task.need_finish_queue == false)
@@ -296,7 +214,7 @@ public:
 
 					for(int y = tile.y; y < tile.y + tile.h; y++) {
 						for(int x = tile.x; x < tile.x + tile.w; x++) {
-							kernel_cpu_path_trace(&kg, render_buffer, rng_state,
+							path_trace_kernel(&kg, render_buffer, rng_state,
 								sample, x, y, tile.offset, tile.stride);
 						}
 					}
@@ -305,7 +223,7 @@ public:
 
 					task.update_progress(&tile);
 				}
-			}
+
 
 			task.release_tile(tile);
 
@@ -325,110 +243,74 @@ public:
 		float sample_scale = 1.0f/(task.sample + 1);
 
 		if(task.rgba_half) {
+			void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
+			/* Select the most capable kernel this CPU supports; the pixel loop below then runs once with it. */
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-			if(system_cpu_support_avx2()) {
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_avx2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
-															 sample_scale, x, y, task.offset, task.stride);
-			}
+			if(system_cpu_support_avx2())
+				convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float;
 			else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-			if(system_cpu_support_avx()) {
+			if(system_cpu_support_avx())
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_avx_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+				convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float;
 			else
 #endif	
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
-			if(system_cpu_support_sse41()) {
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_sse41_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+			if(system_cpu_support_sse41())
+				convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float;
 			else
 #endif		
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3		
-			if(system_cpu_support_sse3()) {
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_sse3_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+			if(system_cpu_support_sse3())
+				convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float;
 			else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-			if(system_cpu_support_sse2()) {
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_sse2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+			if(system_cpu_support_sse2())
+				convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float;
 			else
 #endif
-			{
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+				convert_to_half_float_kernel = kernel_cpu_convert_to_half_float;
+
+			for(int y = task.y; y < task.y + task.h; y++)
+				for(int x = task.x; x < task.x + task.w; x++)
+					convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+						sample_scale, x, y, task.offset, task.stride);
 		}
 		else {
+			void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-			if(system_cpu_support_avx2()) {
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_avx2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
-													   sample_scale, x, y, task.offset, task.stride);
-			}
+			if(system_cpu_support_avx2())
+				convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte;
 			else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-			if(system_cpu_support_avx()) {
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_avx_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+			if(system_cpu_support_avx())
+				convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte;
 			else
 #endif		
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
-			if(system_cpu_support_sse41()) {
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_sse41_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+			if(system_cpu_support_sse41())
+				convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte;
 			else
 #endif			
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-			if(system_cpu_support_sse3()) {
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_sse3_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+			if(system_cpu_support_sse3())
+				convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte;
 			else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-			if(system_cpu_support_sse2()) {
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_sse2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+			if(system_cpu_support_sse2())
+				convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte;
 			else
 #endif
-			{
-				for(int y = task.y; y < task.y + task.h; y++)
-					for(int x = task.x; x < task.x + task.w; x++)
-						kernel_cpu_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
-							sample_scale, x, y, task.offset, task.stride);
-			}
+				convert_to_byte_kernel = kernel_cpu_convert_to_byte;
+
+			for(int y = task.y; y < task.y + task.h; y++)
+				for(int x = task.x; x < task.x + task.w; x++)
+					convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+						sample_scale, x, y, task.offset, task.stride);
+
 		}
 	}
 
@@ -439,93 +321,45 @@ public:
 #ifdef WITH_OSL
 		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
 #endif
+		void(*shader_kernel)(KernelGlobals*, uint4*, float4*, int, int, int, int);
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-		if(system_cpu_support_avx2()) {
-			for(int sample = 0; sample < task.num_samples; sample++) {
-				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_avx2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
-					    task.shader_eval_type, x, task.offset, sample);
-
-				if(task.get_cancel() || task_pool.canceled())
-					break;
-
-				task.update_progress(NULL);
-			}
-		}
+		if(system_cpu_support_avx2())
+			shader_kernel = kernel_cpu_avx2_shader;
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-		if(system_cpu_support_avx()) {
-			for(int sample = 0; sample < task.num_samples; sample++) {
-				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
-					    task.shader_eval_type, x, task.offset, sample);
-
-				if(task.get_cancel() || task_pool.canceled())
-					break;
-
-				task.update_progress(NULL);
-			}
-		}
+		if(system_cpu_support_avx())
+			shader_kernel = kernel_cpu_avx_shader;
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
-		if(system_cpu_support_sse41()) {
-			for(int sample = 0; sample < task.num_samples; sample++) {
-				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
-					    task.shader_eval_type, x, task.offset, sample);
-
-				if(task.get_cancel() || task_pool.canceled())
-					break;
-
-				task.update_progress(NULL);
-			}
-		}
+		if(system_cpu_support_sse41())
+			shader_kernel = kernel_cpu_sse41_shader;
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-		if(system_cpu_support_sse3()) {
-			for(int sample = 0; sample < task.num_samples; sample++) {
-				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
-					    task.shader_eval_type, x, task.offset, sample);
-
-				if(task.get_cancel() || task_pool.canceled())
-					break;
-
-				task.update_progress(NULL);
-			}
-		}
+		if(system_cpu_support_sse3())
+			shader_kernel = kernel_cpu_sse3_shader;
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-		if(system_cpu_support_sse2()) {
-			for(int sample = 0; sample < task.num_samples; sample++) {
-				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
-					    task.shader_eval_type, x, task.offset, sample);
-
-				if(task.get_cancel() || task_pool.canceled())
-					break;
-
-				task.update_progress(NULL);
-			}
-		}
+		if(system_cpu_support_sse2())
+			shader_kernel = kernel_cpu_sse2_shader;
 		else
 #endif
-		{
-			for(int sample = 0; sample < task.num_samples; sample++) {
-				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
-					    task.shader_eval_type, x, task.offset, sample);
+			shader_kernel = kernel_cpu_shader;
 
-				if(task.get_cancel() || task_pool.canceled())
-					break;
+		for(int sample = 0; sample < task.num_samples; sample++) {
+			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
+				shader_kernel(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+					task.shader_eval_type, x, task.offset, sample);
+
+			if(task.get_cancel() || task_pool.canceled())
+				break;
+
+			task.update_progress(NULL);
 
-				task.update_progress(NULL);
-			}
 		}
 
 #ifdef WITH_OSL
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 44be7779891..7e622e03cdd 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -76,7 +76,7 @@ public:
 	{
 		if(first_error) {
 			fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
-			fprintf(stderr, "http://wiki.blender.org/index.php/Doc:2.6/Manual/Render/Cycles/GPU_Rendering\n\n");
+			fprintf(stderr, "http://www.blender.org/manual/render/cycles/gpu_rendering.html\n\n");
 			first_error = false;
 		}
 	}
@@ -202,13 +202,9 @@ public:
 		/* compute cubin name */
 		int major, minor;
 		cuDeviceComputeCapability(&major, &minor, cuDevId);
-		
-		/* workaround to make sm_52 cards work, until we bundle kernel */
-		if(major == 5 && minor == 2)
-			minor = 0;
+		string cubin;
 
 		/* attempt to use kernel provided with blender */
-		string cubin;
 		if(experimental)
 			cubin = path_get(string_printf("lib/kernel_experimental_sm_%d%d.cubin", major, minor));
 		else
@@ -363,7 +359,7 @@ public:
 		cuda_push_context();
 		if(mem.device_pointer) {
 			cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
-			                         (CUdeviceptr)((uchar*)mem.device_pointer + offset), size));
+			                         (CUdeviceptr)(mem.device_pointer + offset), size));
 		}
 		else {
 			memset((char*)mem.data_pointer + offset, 0, size);
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index c521e1383a4..0ff227938ae 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -91,6 +91,7 @@ set(SRC_SVM_HEADERS
 	svm/svm_magic.h
 	svm/svm_mapping.h
 	svm/svm_math.h
+	svm/svm_math_util.h
 	svm/svm_mix.h
 	svm/svm_musgrave.h
 	svm/svm_noise.h
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
index ad7864cb8ea..b94bdeeb23f 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011-2013 Blender Foundation
+ * Copyright 2011-2014 Blender Foundation
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -33,24 +33,20 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device int bsdf_ashikhmin_shirley_setup(ShaderClosure *sc)
 {
-	/* store roughness. could already convert to exponent to save some cycles
-	 * in eval, but this is more consistent with other bsdfs and shader_blur. */
 	sc->data0 = clamp(sc->data0, 1e-4f, 1.0f);
 	sc->data1 = sc->data0;
 
 	sc->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID;
-	return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device int bsdf_ashikhmin_shirley_aniso_setup(ShaderClosure *sc)
 {
-	/* store roughness. could already convert to exponent to save some cycles
-	 * in eval, but this is more consistent with other bsdfs and shader_blur. */
 	sc->data0 = clamp(sc->data0, 1e-4f, 1.0f);
 	sc->data1 = clamp(sc->data1, 1e-4f, 1.0f);
 
 	sc->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID;
-	return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device void bsdf_ashikhmin_shirley_blur(ShaderClosure *sc, float roughness)
@@ -73,7 +69,7 @@ ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, c
 
 	float out = 0.0f;
 
-	if (NdotI > 0.0f && NdotO > 0.0f) {
+	if(NdotI > 0.0f && NdotO > 0.0f) {
 		NdotI = fmaxf(NdotI, 1e-6f);
 		NdotO = fmaxf(NdotO, 1e-6f);
 		float3 H = normalize(omega_in + I);
@@ -86,7 +82,8 @@ ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, c
 		float n_x = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data0);
 		float n_y = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data1);
 
-		if (n_x == n_y) {  /* => isotropic case */
+		if(n_x == n_y) {
+			/* isotropic */
 			float e = n_x;
 			float lobe = powf(HdotN, e);
 			float norm = (n_x + 1.0f) / (8.0f * M_PI_F);
@@ -94,7 +91,8 @@ ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, c
 			out = NdotO * norm * lobe * pump;
 			*pdf = norm * lobe / HdotI; /* this is p_h / 4(H.I)  (conversion from 'wh measure' to 'wi measure', eq. 8 in paper) */
 		}
-		else {             /* => ANisotropic case */
+		else {
+			/* anisotropic */
 			float3 X, Y;
 			make_orthonormals_tangent(N, sc->T, &X, &Y);
 
@@ -130,7 +128,7 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 	float3 N = sc->N;
 
 	float NdotI = dot(N, I);
-	if (NdotI > 0.0f) {
+	if(NdotI > 0.0f) {
 
 		float n_x = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data0);
 		float n_y = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data1);
@@ -146,21 +144,23 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 		/* sample spherical coords for h in tangent space */
 		float phi;
 		float cos_theta;
-		if (n_x == n_y) {  /* => simple isotropic sampling */
+		if(n_x == n_y) {
+			/* isotropic sampling */
 			phi = M_2PI_F * randu;
 			cos_theta = powf(randv, 1.0f / (n_x + 1.0f));
 		}
-		else {             /* => more complex anisotropic sampling */
-			if (randu < 0.25f) {      /* first quadrant */
+		else {
+			/* anisotropic sampling */
+			if(randu < 0.25f) {      /* first quadrant */
 				float remapped_randu = 4.0f * randu;
 				bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta);
 			}
-			else if (randu < 0.5f) {  /* second quadrant */
+			else if(randu < 0.5f) {  /* second quadrant */
 				float remapped_randu = 4.0f * (.5f - randu);
 				bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta);
 				phi = M_PI_F - phi;
 			}
-			else if (randu < 0.75f) { /* third quadrant */
+			else if(randu < 0.75f) { /* third quadrant */
 				float remapped_randu = 4.0f * (randu - 0.5f);
 				bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta);
 				phi = M_PI_F + phi;
@@ -185,13 +185,12 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 		/* half vector to world space */
 		float3 H = h.x*X + h.y*Y + h.z*N;
 		float HdotI = dot(H, I);
-		if (HdotI < 0.0f) H = -H;
+		if(HdotI < 0.0f) H = -H;
 
 		/* reflect I on H to get omega_in */
 		*omega_in = -I + (2.0f * HdotI) * H;
 
 		/* leave the rest to eval_reflect */
-		/* (could maybe optimize a few things by manual inlining, but I doubt it would make much difference) */
 		*eval = bsdf_ashikhmin_shirley_eval_reflect(sc, I, *omega_in, pdf);
 
 #ifdef __RAY_DIFFERENTIALS__
@@ -201,7 +200,7 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 #endif
 	}
 
-	return LABEL_REFLECT | LABEL_GLOSSY;
+	return LABEL_REFLECT|LABEL_GLOSSY;
 }
 
 
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index 949fe869549..371f467000c 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -108,11 +108,6 @@ ccl_device float3 bsdf_translucent_eval_transmit(const ShaderClosure *sc, const
 	return make_float3 (cos_pi, cos_pi, cos_pi);
 }
 
-ccl_device float bsdf_translucent_albedo(const ShaderClosure *sc, const float3 I)
-{
-	return 1.0f;
-}
-
 ccl_device int bsdf_translucent_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 	float3 N = sc->N;
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index b856774375f..cdaf84f1750 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -41,9 +41,9 @@ ccl_device float3 bsdf_diffuse_ramp_get_color(const ShaderClosure *sc, const flo
 	
 	float npos = pos * (float)(MAXCOLORS - 1);
 	int ipos = float_to_int(npos);
-	if (ipos < 0)
+	if(ipos < 0)
 		return colors[0];
-	if (ipos >= (MAXCOLORS - 1))
+	if(ipos >= (MAXCOLORS - 1))
 		return colors[MAXCOLORS - 1];
 	float offset = npos - (float)ipos;
 	return colors[ipos] * (1.0f - offset) + colors[ipos+1] * offset;
@@ -52,7 +52,7 @@ ccl_device float3 bsdf_diffuse_ramp_get_color(const ShaderClosure *sc, const flo
 ccl_device int bsdf_diffuse_ramp_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_DIFFUSE_RAMP_ID;
-	return SD_BSDF | SD_BSDF_HAS_EVAL;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device void bsdf_diffuse_ramp_blur(ShaderClosure *sc, float roughness)
diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h
index e0b5454592b..4f4fd5d26b8 100644
--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -49,7 +49,7 @@ ccl_device int bsdf_hair_reflection_setup(ShaderClosure *sc)
 	sc->type = CLOSURE_BSDF_HAIR_REFLECTION_ID;
 	sc->data0 = clamp(sc->data0, 0.001f, 1.0f);
 	sc->data1 = clamp(sc->data1, 0.001f, 1.0f);
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device int bsdf_hair_transmission_setup(ShaderClosure *sc)
@@ -57,7 +57,7 @@ ccl_device int bsdf_hair_transmission_setup(ShaderClosure *sc)
 	sc->type = CLOSURE_BSDF_HAIR_TRANSMISSION_ID;
 	sc->data0 = clamp(sc->data0, 0.001f, 1.0f);
 	sc->data1 = clamp(sc->data1, 0.001f, 1.0f);
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index 8737b0e2d94..9561885525f 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -305,7 +305,7 @@ ccl_device int bsdf_microfacet_ggx_setup(ShaderClosure *sc)
 	
 	sc->type = CLOSURE_BSDF_MICROFACET_GGX_ID;
 
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device int bsdf_microfacet_ggx_aniso_setup(ShaderClosure *sc)
@@ -315,7 +315,7 @@ ccl_device int bsdf_microfacet_ggx_aniso_setup(ShaderClosure *sc)
 	
 	sc->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID;
 
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device int bsdf_microfacet_ggx_refraction_setup(ShaderClosure *sc)
@@ -325,7 +325,7 @@ ccl_device int bsdf_microfacet_ggx_refraction_setup(ShaderClosure *sc)
 
 	sc->type = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
 
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device void bsdf_microfacet_ggx_blur(ShaderClosure *sc, float roughness)
@@ -657,7 +657,7 @@ ccl_device int bsdf_microfacet_beckmann_setup(ShaderClosure *sc)
 	sc->data1 = sc->data0; /* alpha_y */
 
 	sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ID;
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device int bsdf_microfacet_beckmann_aniso_setup(ShaderClosure *sc)
@@ -666,7 +666,7 @@ ccl_device int bsdf_microfacet_beckmann_aniso_setup(ShaderClosure *sc)
 	sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* alpha_y */
 
 	sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID;
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device int bsdf_microfacet_beckmann_refraction_setup(ShaderClosure *sc)
@@ -675,7 +675,7 @@ ccl_device int bsdf_microfacet_beckmann_refraction_setup(ShaderClosure *sc)
 	sc->data1 = sc->data0; /* alpha_y */
 
 	sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device void bsdf_microfacet_beckmann_blur(ShaderClosure *sc, float roughness)
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index 6f685d5eeea..6d3b915c24a 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -25,7 +25,7 @@ ccl_device float3 bsdf_oren_nayar_get_intensity(const ShaderClosure *sc, float3
 	float nv = max(dot(n, v), 0.0f);
 	float t = dot(l, v) - nl * nv;
 
-	if (t > 0.0f)
+	if(t > 0.0f)
 		t /= max(nl, nv) + FLT_MIN;
 	float is = nl * (sc->data0 + sc->data1 * t);
 	return make_float3(is, is, is);
@@ -44,7 +44,7 @@ ccl_device int bsdf_oren_nayar_setup(ShaderClosure *sc)
 	sc->data0 = 1.0f * div;
 	sc->data1 = sigma * div;
 
-	return SD_BSDF | SD_BSDF_HAS_EVAL;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device void bsdf_oren_nayar_blur(ShaderClosure *sc, float roughness)
@@ -53,7 +53,7 @@ ccl_device void bsdf_oren_nayar_blur(ShaderClosure *sc, float roughness)
 
 ccl_device float3 bsdf_oren_nayar_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	if (dot(sc->N, omega_in) > 0.0f) {
+	if(dot(sc->N, omega_in) > 0.0f) {
 		*pdf = 0.5f * M_1_PI_F;
 		return bsdf_oren_nayar_get_intensity(sc, sc->N, I, omega_in);
 	}
@@ -72,7 +72,7 @@ ccl_device int bsdf_oren_nayar_sample(const ShaderClosure *sc, float3 Ng, float3
 {
 	sample_uniform_hemisphere(sc->N, randu, randv, omega_in, pdf);
 
-	if (dot(Ng, *omega_in) > 0.0f) {
+	if(dot(Ng, *omega_in) > 0.0f) {
 		*eval = bsdf_oren_nayar_get_intensity(sc, sc->N, I, *omega_in);
 
 #ifdef __RAY_DIFFERENTIALS__
@@ -86,7 +86,7 @@ ccl_device int bsdf_oren_nayar_sample(const ShaderClosure *sc, float3 Ng, float3
 		*eval = make_float3(0.0f, 0.0f, 0.0f);
 	}
 
-	return LABEL_REFLECT | LABEL_DIFFUSE;
+	return LABEL_REFLECT|LABEL_DIFFUSE;
 }
 
 
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index 2b4e1c68640..f9f263719e9 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -41,9 +41,9 @@ ccl_device float3 bsdf_phong_ramp_get_color(const ShaderClosure *sc, const float
 	
 	float npos = pos * (float)(MAXCOLORS - 1);
 	int ipos = float_to_int(npos);
-	if (ipos < 0)
+	if(ipos < 0)
 		return colors[0];
-	if (ipos >= (MAXCOLORS - 1))
+	if(ipos >= (MAXCOLORS - 1))
 		return colors[MAXCOLORS - 1];
 	float offset = npos - (float)ipos;
 	return colors[ipos] * (1.0f - offset) + colors[ipos+1] * offset;
@@ -54,7 +54,7 @@ ccl_device int bsdf_phong_ramp_setup(ShaderClosure *sc)
 	sc->data0 = max(sc->data0, 0.0f);
 	
 	sc->type = CLOSURE_BSDF_PHONG_RAMP_ID;
-	return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device void bsdf_phong_ramp_blur(ShaderClosure *sc, float roughness)
@@ -67,11 +67,11 @@ ccl_device float3 bsdf_phong_ramp_eval_reflect(const ShaderClosure *sc, const fl
 	float cosNI = dot(sc->N, omega_in);
 	float cosNO = dot(sc->N, I);
 	
-	if (cosNI > 0 && cosNO > 0) {
+	if(cosNI > 0 && cosNO > 0) {
 		// reflect the view vector
 		float3 R = (2 * cosNO) * sc->N - I;
 		float cosRI = dot(R, omega_in);
-		if (cosRI > 0) {
+		if(cosRI > 0) {
 			float cosp = powf(cosRI, m_exponent);
 			float common = 0.5f * M_1_PI_F * cosp;
 			float out = cosNI * (m_exponent + 2) * common;
@@ -93,7 +93,7 @@ ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc, const float3 colo
 	float cosNO = dot(sc->N, I);
 	float m_exponent = sc->data0;
 	
-	if (cosNO > 0) {
+	if(cosNO > 0) {
 		// reflect the view vector
 		float3 R = (2 * cosNO) * sc->N - I;
 
@@ -111,12 +111,12 @@ ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc, const float3 colo
 		*omega_in = (cosf(phi) * sinTheta) * T +
 		            (sinf(phi) * sinTheta) * B +
 		            (            cosTheta) * R;
-		if (dot(Ng, *omega_in) > 0.0f)
+		if(dot(Ng, *omega_in) > 0.0f)
 		{
 			// common terms for pdf and eval
 			float cosNI = dot(sc->N, *omega_in);
 			// make sure the direction we chose is still in the right hemisphere
-			if (cosNI > 0)
+			if(cosNI > 0)
 			{
 				float cosp = powf(cosTheta, m_exponent);
 				float common = 0.5f * M_1_PI_F * cosp;
diff --git a/intern/cycles/kernel/geom/geom_bvh_volume.h b/intern/cycles/kernel/geom/geom_bvh_volume.h
index 9dd8d226f5b..16c16beee39 100644
--- a/intern/cycles/kernel/geom/geom_bvh_volume.h
+++ b/intern/cycles/kernel/geom/geom_bvh_volume.h
@@ -277,6 +277,7 @@ ccl_device bool BVH_FUNCTION_NAME(KernelGlobals *kg,
 					}
 					else {
 						/* pop */
+						object = OBJECT_NONE;
 						nodeAddr = traversalStack[stackPtr];
 						--stackPtr;
 					}
diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index 5df6c75df86..207d5066fb2 100644
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -144,7 +144,8 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 	float3 center;
 
 #ifdef __HAIR__
-	if(sd->type & PRIMITIVE_ALL_CURVE) {
+	bool is_curve_primitive = sd->type & PRIMITIVE_ALL_CURVE;
+	if(is_curve_primitive) {
 		center = curve_motion_center_location(kg, sd);
 
 		if(!(sd->flag & SD_TRANSFORM_APPLIED))
@@ -170,6 +171,13 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 
 		motion_pre = primitive_attribute_float3(kg, sd, elem, offset, NULL, NULL);
 		motion_post = primitive_attribute_float3(kg, sd, elem, offset_next, NULL, NULL);
+
+#ifdef __HAIR__
+		if(is_curve_primitive && (sd->flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
+			object_position_transform(kg, sd, &motion_pre);
+			object_position_transform(kg, sd, &motion_post);
+		}
+#endif
 	}
 
 	/* object motion. note that depending on the mesh having motion vectors, this
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 33a20494966..3cb6d168f80 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -52,7 +52,11 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 #ifdef __KERNEL_GPU__
 	float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 #else
-	float4 r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
+	float4 r;
+	if(sd->flag & SD_VOLUME_CUBIC)
+		r = kernel_tex_image_interp_3d_ex(id, P.x, P.y, P.z, INTERPOLATION_CUBIC);
+	else
+		r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
 #endif
 
 	if(dx) *dx = 0.0f;
@@ -68,7 +72,11 @@ ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *s
 #ifdef __KERNEL_GPU__
 	float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 #else
-	float4 r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
+	float4 r;
+	if(sd->flag & SD_VOLUME_CUBIC)
+		r = kernel_tex_image_interp_3d_ex(id, P.x, P.y, P.z, INTERPOLATION_CUBIC);
+	else
+		r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
 #endif
 
 	if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index a1ec080e3d3..e80bfb32e89 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -198,10 +198,10 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	int num_samples = kernel_data.integrator.aa_samples;
 
 	/* random number generator */
-	RNG rng = cmj_hash(offset + i, 0);
+	RNG rng = cmj_hash(offset + i, kernel_data.integrator.seed);
 
 #if 0
-	uint rng_state = cmj_hash(i, 0);
+	uint rng_state = cmj_hash(i, kernel_data.integrator.seed);
 	float filter_x, filter_y;
 	path_rng_init(kg, &rng_state, sample, num_samples, &rng, 0, 0, &filter_x, &filter_y);
 
@@ -253,6 +253,10 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		/* data passes */
 		case SHADER_EVAL_NORMAL:
 		{
+			/* Evaluate the surface shader first when it has bump, so sd.N is read after evaluation. */
+			if(sd.flag & SD_HAS_BUMP)
+				shader_eval_surface(kg, &sd, 0.f, 0, SHADER_CONTEXT_MAIN);
+
 			/* compression: normal = (2 * color) - 1 */
 			out = sd.N * 0.5f + make_float3(0.5f, 0.5f, 0.5f);
 			break;
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 403fd0a67f7..08c8bdd369d 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -25,10 +25,12 @@
 #include "util_half.h"
 #include "util_types.h"
 
-/* On 64bit linux single precision exponent is really slow comparing to the
- * double precision version, even with float<->double conversion involved.
+/* On x86_64, glibc versions before 2.16 have an expf that is much slower than
+ * the double-precision exp (fixed in glibc 2.16), so use exp with casts there.
  */
-#if !defined(__KERNEL_GPU__) && defined(__linux__) && defined(__x86_64__)
+#if !defined(__KERNEL_GPU__) && defined(__x86_64__) && \
+     defined(__GNU_LIBRARY__) && defined(__GLIBC__) && defined(__GLIBC_MINOR__) && \
+     (__GLIBC__ <= 2 && __GLIBC_MINOR__ < 16)
 #  define expf(x) ((float)exp((double)(x)))
 #endif
 
@@ -151,6 +153,13 @@ template<typename T> struct texture_image  {
 
 	ccl_always_inline float4 interp_3d(float x, float y, float z, bool periodic = false)
 	{
+		return interp_3d_ex(x, y, z, interpolation, periodic);
+	}
+
+	ccl_always_inline float4 interp_3d_ex(float x, float y, float z,
+	                                      int interpolation = INTERPOLATION_LINEAR,
+	                                      bool periodic = false)
+	{
 		if(UNLIKELY(!data))
 			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 
@@ -174,7 +183,7 @@ template<typename T> struct texture_image  {
 
 			return read(data[ix + iy*width + iz*width*height]);
 		}
-		else {
+		else if(interpolation == INTERPOLATION_LINEAR) {
 			float tx = frac(x*(float)width - 0.5f, &ix);
 			float ty = frac(y*(float)height - 0.5f, &iy);
 			float tz = frac(z*(float)depth - 0.5f, &iz);
@@ -212,6 +221,93 @@ template<typename T> struct texture_image  {
 
 			return r;
 		}
+		else {
+			/* Tricubic b-spline interpolation. */
+			const float tx = frac(x*(float)width - 0.5f, &ix);
+			const float ty = frac(y*(float)height - 0.5f, &iy);
+			const float tz = frac(z*(float)depth - 0.5f, &iz);
+			int pix, piy, piz, nnix, nniy, nniz;
+
+			if(periodic) {
+				ix = wrap_periodic(ix, width);
+				iy = wrap_periodic(iy, height);
+				iz = wrap_periodic(iz, depth);
+
+				pix = wrap_periodic(ix-1, width);
+				piy = wrap_periodic(iy-1, height);
+				piz = wrap_periodic(iz-1, depth);
+
+				nix = wrap_periodic(ix+1, width);
+				niy = wrap_periodic(iy+1, height);
+				niz = wrap_periodic(iz+1, depth);
+
+				nnix = wrap_periodic(ix+2, width);
+				nniy = wrap_periodic(iy+2, height);
+				nniz = wrap_periodic(iz+2, depth);
+			}
+			else {
+				ix = wrap_clamp(ix, width);
+				iy = wrap_clamp(iy, height);
+				iz = wrap_clamp(iz, depth);
+
+				pix = wrap_clamp(ix-1, width);
+				piy = wrap_clamp(iy-1, height);
+				piz = wrap_clamp(iz-1, depth);
+
+				nix = wrap_clamp(ix+1, width);
+				niy = wrap_clamp(iy+1, height);
+				niz = wrap_clamp(iz+1, depth);
+
+				nnix = wrap_clamp(ix+2, width);
+				nniy = wrap_clamp(iy+2, height);
+				nniz = wrap_clamp(iz+2, depth);
+			}
+
+			const int xc[4] = {pix, ix, nix, nnix};
+			const int yc[4] = {width * piy,
+			                   width * iy,
+			                   width * niy,
+			                   width * nniy};
+			const int zc[4] = {width * height * piz,
+			                   width * height * iz,
+			                   width * height * niz,
+			                   width * height * nniz};
+			float u[4], v[4], w[4];
+
+			/* Helper macros to keep the code a reasonable size,
+			 * letting the compiler inline all the matrix multiplications.
+			 */
+#define SET_SPLINE_WEIGHTS(u, t) \
+			{ \
+				u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
+				u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
+				u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
+				u[3] = (1.0f / 6.0f) * t * t * t; \
+			} (void)0
+#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
+#define COL_TERM(col, row) \
+			(v[col] * (u[0] * DATA(0, col, row) + \
+			           u[1] * DATA(1, col, row) + \
+			           u[2] * DATA(2, col, row) + \
+			           u[3] * DATA(3, col, row)))
+#define ROW_TERM(row) \
+			(w[row] * (COL_TERM(0, row) + \
+			           COL_TERM(1, row) + \
+			           COL_TERM(2, row) + \
+			           COL_TERM(3, row)))
+
+			SET_SPLINE_WEIGHTS(u, tx);
+			SET_SPLINE_WEIGHTS(v, ty);
+			SET_SPLINE_WEIGHTS(w, tz);
+
+			/* Actual interpolation. */
+			return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+#undef COL_TERM
+#undef ROW_TERM
+#undef DATA
+#undef SET_SPLINE_WEIGHTS
+		}
 	}
 
 	ccl_always_inline void dimensions_set(int width_, int height_, int depth_)
@@ -244,6 +340,7 @@ typedef texture_image<uchar4> texture_image_uchar4;
 #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
 #define kernel_tex_image_interp(tex, x, y) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp(x, y) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp(x, y))
 #define kernel_tex_image_interp_3d(tex, x, y, z) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d(x, y, z) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d(x, y, z))
+#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d_ex(x, y, z, interpolation) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d_ex(x, y, z, interpolation))
 
 #define kernel_data (kg->__data)
 
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index e7f62f230f8..9dfbfd91881 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -167,12 +167,143 @@ ccl_device float3 sphere_light_sample(float3 P, float3 center, float radius, flo
 	return disk_light_sample(normalize(P - center), randu, randv)*radius;
 }
 
-ccl_device float3 area_light_sample(float3 axisu, float3 axisv, float randu, float randv)
+/* Uses the following paper:
+ *
+ * Carlos Urena et al.
+ * An Area-Preserving Parametrization for Spherical Rectangles.
+ *
+ * https://www.solidangle.com/research/egsr2013_spherical_rectangle.pdf
+ */
+ccl_device float3 area_light_sample(float3 P,
+                                    float3 light_p,
+                                    float3 axisu, float3 axisv,
+                                    float randu, float randv,
+                                    float *pdf)
 {
-	randu = randu - 0.5f;
-	randv = randv - 0.5f;
+	/* In our naming scheme we use P for the center,
+	 * which is called o in the paper.
+	 */
+
+	float3 corner = light_p - axisu * 0.5f - axisv * 0.5f;
+	float axisu_len, axisv_len;
+	/* Compute local reference system R. */
+	float3 x = normalize_len(axisu, &axisu_len);
+	float3 y = normalize_len(axisv, &axisv_len);
+	float3 z = cross(x, y);
+	/* Compute rectangle coords in local reference system. */
+	float3 dir = corner - P;
+	float z0 = dot(dir, z);
+	/* Flip 'z' to make it point against Q. */
+	if(z0 > 0.0f) {
+		z *= -1.0f;
+		z0 *= -1.0f;
+	}
+	float z0sq = z0 * z0;
+	float x0 = dot(dir, x);
+	float y0 = dot(dir, y);
+	float x1 = x0 + axisu_len;
+	float y1 = y0 + axisv_len;
+	float y0sq = y0 * y0;
+	float y1sq = y1 * y1;
+	/* Create vectors to four vertices. */
+	float3 v00 = make_float3(x0, y0, z0);
+	float3 v01 = make_float3(x0, y1, z0);
+	float3 v10 = make_float3(x1, y0, z0);
+	float3 v11 = make_float3(x1, y1, z0);
+	/* Compute normals to edges. */
+	float3 n0 = normalize(cross(v00, v10));
+	float3 n1 = normalize(cross(v10, v11));
+	float3 n2 = normalize(cross(v11, v01));
+	float3 n3 = normalize(cross(v01, v00));
+	/* Compute internal angles (gamma_i). */
+	float g0 = safe_acosf(-dot(n0, n1));
+	float g1 = safe_acosf(-dot(n1, n2));
+	float g2 = safe_acosf(-dot(n2, n3));
+	float g3 = safe_acosf(-dot(n3, n0));
+	/* Compute predefined constants. */
+	float b0 = n0.z;
+	float b1 = n2.z;
+	float b0sq = b0 * b0;
+	float k = M_2PI_F - g2 - g3;
+	/* Compute solid angle from internal angles. */
+	float S = g0 + g1 - k;
+
+	/* Compute cu. */
+	float au = randu * S + k;
+	float fu = (cosf(au) * b0 - b1) / sinf(au);
+	float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
+	cu = clamp(cu, -1.0f, 1.0f);
+	/* Compute xu. */
+	float xu = -(cu * z0) / sqrtf(1.0f - cu * cu);
+	xu = clamp(xu, x0, x1);
+	/* Compute yv. */
+	float d = sqrtf(xu * xu + z0sq);
+	float h0 = y0 / sqrtf(d * d + y0sq);
+	float h1 = y1 / sqrtf(d * d + y1sq);
+	float hv = h0 + randv * (h1 - h0), hv2 = hv * hv;
+	float yv = (hv2 < 1.0f - 1e-6f) ? (hv * d) / sqrtf(1.0f - hv2) : y1;
+
+	if(S != 0.0f)
+		*pdf = 1.0f / S;
+	else
+		*pdf = 0.0f;
+
+	/* Transform (xu, yv, z0) to world coords. */
+	return P + xu * x + yv * y + z0 * z;
+}
 
-	return axisu*randu + axisv*randv;
+/* TODO(sergey): This actually duplicates the code above, but how to avoid
+ * this without having some nasty function with loads of parameters?
+ */
+ccl_device float area_light_pdf(float3 P,
+                                float3 light_p,
+                                float3 axisu, float3 axisv)
+{
+	/* In our naming scheme we use P for the center,
+	 * which is called o in the paper.
+	 */
+
+	float3 corner = light_p - axisu * 0.5f - axisv * 0.5f;
+	float axisu_len, axisv_len;
+	/* Compute local reference system R. */
+	float3 x = normalize_len(axisu, &axisu_len);
+	float3 y = normalize_len(axisv, &axisv_len);
+	float3 z = cross(x, y);
+	/* Compute rectangle coords in local reference system. */
+	float3 dir = corner - P;
+	float z0 = dot(dir, z);
+	/* Flip 'z' to make it point against Q. */
+	if(z0 > 0.0f) {
+		z *= -1.0f;
+		z0 *= -1.0f;
+	}
+	float x0 = dot(dir, x);
+	float y0 = dot(dir, y);
+	float x1 = x0 + axisu_len;
+	float y1 = y0 + axisv_len;
+	/* Create vectors to four vertices. */
+	float3 v00 = make_float3(x0, y0, z0);
+	float3 v01 = make_float3(x0, y1, z0);
+	float3 v10 = make_float3(x1, y0, z0);
+	float3 v11 = make_float3(x1, y1, z0);
+	/* Compute normals to edges. */
+	float3 n0 = normalize(cross(v00, v10));
+	float3 n1 = normalize(cross(v10, v11));
+	float3 n2 = normalize(cross(v11, v01));
+	float3 n3 = normalize(cross(v01, v00));
+	/* Compute internal angles (gamma_i). */
+	float g0 = safe_acosf(-dot(n0, n1));
+	float g1 = safe_acosf(-dot(n1, n2));
+	float g2 = safe_acosf(-dot(n2, n3));
+	float g3 = safe_acosf(-dot(n3, n0));
+	/* Compute predefined constants. */
+	float k = M_2PI_F - g2 - g3;
+	/* Compute solid angle from internal angles. */
+	float S = g0 + g1 - k;
+	if(S != 0.0f)
+		return 1.0f / S;
+	else
+		return 0.0f;
 }
 
 ccl_device float spot_light_attenuation(float4 data1, float4 data2, LightSample *ls)
@@ -276,6 +407,7 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
 				float4 data2 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 2);
 				ls->eval_fac *= spot_light_attenuation(data1, data2, ls);
 			}
+			ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 		}
 		else {
 			/* area light */
@@ -286,18 +418,22 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
 			float3 axisv = make_float3(data2.y, data2.z, data2.w);
 			float3 D = make_float3(data3.y, data3.z, data3.w);
 
-			ls->P += area_light_sample(axisu, axisv, randu, randv);
+			ls->P = area_light_sample(P, ls->P,
+			                          axisu, axisv,
+			                          randu, randv,
+			                          &ls->pdf);
+
 			ls->Ng = D;
 			ls->D = normalize_len(ls->P - P, &ls->t);
 
 			float invarea = data2.x;
-
 			ls->eval_fac = 0.25f*invarea;
-			ls->pdf = invarea;
+
+			if(dot(ls->D, D) > 0.0f)
+				ls->pdf = 0.0f;
 		}
 
 		ls->eval_fac *= kernel_data.integrator.inv_pdf_lights;
-		ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 	}
 }
 
@@ -355,6 +491,7 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 		ls->D = D;
 		ls->t = FLT_MAX;
 
+		/* compute pdf */
 		float invarea = data1.w;
 		ls->pdf = invarea/(costheta*costheta*costheta);
 		ls->eval_fac = ls->pdf;
@@ -386,6 +523,10 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 			if(ls->eval_fac == 0.0f)
 				return false;
 		}
+
+		/* compute pdf */
+		if(ls->t != FLT_MAX)
+			ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 	}
 	else if(type == LIGHT_AREA) {
 		/* area light */
@@ -412,16 +553,12 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 
 		ls->D = D;
 		ls->Ng = Ng;
-		ls->pdf = invarea;
-		ls->eval_fac = 0.25f*ls->pdf;
+		ls->pdf = area_light_pdf(P, ls->P, axisu, axisv);
+		ls->eval_fac = 0.25f*invarea;
 	}
 	else
 		return false;
 
-	/* compute pdf */
-	if(ls->t != FLT_MAX)
-		ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
-
 	return true;
 }
 
@@ -514,7 +651,13 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float randt)
 
 /* Generic Light */
 
-ccl_device void light_sample(KernelGlobals *kg, float randt, float randu, float randv, float time, float3 P, LightSample *ls)
+ccl_device bool light_select_reached_max_bounces(KernelGlobals *kg, int index, int bounce)
+{
+	float4 data4 = kernel_tex_fetch(__light_data, index*LIGHT_SIZE + 4);
+	return (bounce > __float_as_int(data4.x));
+}
+
+ccl_device void light_sample(KernelGlobals *kg, float randt, float randu, float randv, float time, float3 P, int bounce, LightSample *ls)
 {
 	/* sample index */
 	int index = light_distribution_sample(kg, randt);
@@ -536,6 +679,12 @@ ccl_device void light_sample(KernelGlobals *kg, float randt, float randu, float
 	}
 	else {
 		int lamp = -prim-1;
+
+		if(UNLIKELY(light_select_reached_max_bounces(kg, lamp, bounce))) {
+			ls->pdf = 0.0f;
+			return;
+		}
+
 		lamp_light_sample(kg, lamp, randu, randv, P, ls);
 	}
 }
@@ -546,22 +695,5 @@ ccl_device int light_select_num_samples(KernelGlobals *kg, int index)
 	return __float_as_int(data3.x);
 }
 
-ccl_device int lamp_light_eval_sample(KernelGlobals *kg, float randt)
-{
-	/* sample index */
-	int index = light_distribution_sample(kg, randt);
-
-	/* fetch light data */
-	float4 l = kernel_tex_fetch(__light_distribution, index);
-	int prim = __float_as_int(l.y);
-
-	if(prim < 0) {
-		int lamp = -prim-1;
-		return lamp;
-	}
-	else
-		return LAMP_NONE;
-}
-
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index 9553c2da0df..e5ba1f41c47 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -38,6 +38,9 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 	if(sample_all_lights) {
 		/* lamp sampling */
 		for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
+			if(UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce)))
+			   continue;
+
 			int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
 			float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
 			RNG lamp_rng = cmj_hash(*rng, i);
@@ -82,7 +85,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 					light_t = 0.5f*light_t;
 
 				LightSample ls;
-				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
 
 				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 					/* trace shadow ray */
@@ -103,7 +106,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
 
 		LightSample ls;
-		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
 
 		/* sample random light */
 		if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
@@ -200,7 +203,7 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
 #endif
 
 	LightSample ls;
-	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
 
 	if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 		/* trace shadow ray */
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index da2d5e6eca8..11d3d94657b 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -40,7 +40,7 @@ ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
 	light_ray.time = sd->time;
 #endif
 
-	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
 	if(ls.pdf == 0.0f)
 		return;
 	
@@ -56,7 +56,12 @@ ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
 #endif
 }
 
-ccl_device bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
+#ifdef __KERNEL_GPU__
+ccl_device_noinline
+#else
+ccl_device
+#endif
+bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
 	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
 {
 	/* sample phase function */
@@ -119,6 +124,9 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 	if(sample_all_lights) {
 		/* lamp sampling */
 		for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
+			if(UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce)))
+				continue;
+
 			int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
 			float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
 			RNG lamp_rng = cmj_hash(*rng, i);
@@ -183,7 +191,7 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 					light_t = 0.5f*light_t;
 
 				LightSample ls;
-				light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls);
+				light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
 
 				float3 tp = throughput;
 
@@ -198,7 +206,7 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 				kernel_assert(result == VOLUME_PATH_SCATTERED);
 
 				/* todo: split up light_sample so we don't have to call it again with new position */
-				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
 
 				if(ls.pdf == 0.0f)
 					continue;
@@ -222,7 +230,7 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
 
 		LightSample ls;
-		light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls);
+		light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
 
 		float3 tp = throughput;
 
@@ -237,7 +245,7 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 		kernel_assert(result == VOLUME_PATH_SCATTERED);
 
 		/* todo: split up light_sample so we don't have to call it again with new position */
-		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
 
 		if(ls.pdf == 0.0f)
 			return;
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index db08c328d7e..65089740ef9 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -681,7 +681,7 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
 	for(int i = 0; i< sd->num_closure; i++) {
 		ShaderClosure *sc = &sd->closure[i];
 
-		if(CLOSURE_IS_BSSRDF(sc->type))
+		if(CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type))
 			eval += sc->weight;
 	}
 
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index ca1210f2d80..460ca7b68eb 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN
 /* constants */
 #define OBJECT_SIZE 		11
 #define OBJECT_VECTOR_SIZE	6
-#define LIGHT_SIZE			4
+#define LIGHT_SIZE			5
 #define FILTER_TABLE_SIZE	256
 #define RAMP_TABLE_SIZE		256
 #define PARTICLE_SIZE 		5
@@ -291,34 +291,34 @@ typedef enum ClosureLabel {
 
 typedef enum PassType {
 	PASS_NONE = 0,
-	PASS_COMBINED = 1,
-	PASS_DEPTH = 2,
-	PASS_NORMAL = 4,
-	PASS_UV = 8,
-	PASS_OBJECT_ID = 16,
-	PASS_MATERIAL_ID = 32,
-	PASS_DIFFUSE_COLOR = 64,
-	PASS_GLOSSY_COLOR = 128,
-	PASS_TRANSMISSION_COLOR = 256,
-	PASS_DIFFUSE_INDIRECT = 512,
-	PASS_GLOSSY_INDIRECT = 1024,
-	PASS_TRANSMISSION_INDIRECT = 2048,
-	PASS_DIFFUSE_DIRECT = 4096,
-	PASS_GLOSSY_DIRECT = 8192,
-	PASS_TRANSMISSION_DIRECT = 16384,
-	PASS_EMISSION = 32768,
-	PASS_BACKGROUND = 65536,
-	PASS_AO = 131072,
-	PASS_SHADOW = 262144,
-	PASS_MOTION = 524288,
-	PASS_MOTION_WEIGHT = 1048576,
-	PASS_MIST = 2097152,
-	PASS_SUBSURFACE_DIRECT = 4194304,
-	PASS_SUBSURFACE_INDIRECT = 8388608,
-	PASS_SUBSURFACE_COLOR = 16777216,
-	PASS_LIGHT = 33554432, /* no real pass, used to force use_light_pass */
+	PASS_COMBINED = (1 << 0),
+	PASS_DEPTH = (1 << 1),
+	PASS_NORMAL = (1 << 2),
+	PASS_UV = (1 << 3),
+	PASS_OBJECT_ID = (1 << 4),
+	PASS_MATERIAL_ID = (1 << 5),
+	PASS_DIFFUSE_COLOR = (1 << 6),
+	PASS_GLOSSY_COLOR = (1 << 7),
+	PASS_TRANSMISSION_COLOR = (1 << 8),
+	PASS_DIFFUSE_INDIRECT = (1 << 9),
+	PASS_GLOSSY_INDIRECT = (1 << 10),
+	PASS_TRANSMISSION_INDIRECT = (1 << 11),
+	PASS_DIFFUSE_DIRECT = (1 << 12),
+	PASS_GLOSSY_DIRECT = (1 << 13),
+	PASS_TRANSMISSION_DIRECT = (1 << 14),
+	PASS_EMISSION = (1 << 15),
+	PASS_BACKGROUND = (1 << 16),
+	PASS_AO = (1 << 17),
+	PASS_SHADOW = (1 << 18),
+	PASS_MOTION = (1 << 19),
+	PASS_MOTION_WEIGHT = (1 << 20),
+	PASS_MIST = (1 << 21),
+	PASS_SUBSURFACE_DIRECT = (1 << 22),
+	PASS_SUBSURFACE_INDIRECT = (1 << 23),
+	PASS_SUBSURFACE_COLOR = (1 << 24),
+	PASS_LIGHT = (1 << 25), /* no real pass, used to force use_light_pass */
 #ifdef __KERNEL_DEBUG__
-	PASS_BVH_TRAVERSAL_STEPS = 67108864,
+	PASS_BVH_TRAVERSAL_STEPS = (1 << 26),
 #endif
 } PassType;
 
@@ -539,34 +539,25 @@ typedef enum AttributeStandard {
 #define MAX_CLOSURE 1
 #endif
 
-/* TODO(sergey): This is rather nasty bug happening in here, which
- * could be simply a compilers bug for which we can't find a generic
- * platform independent workaround. Also even if it's a compiler
- * issue, it's not so simple to upgrade the compiler in the release
- * environment for linux and doing it so closer to the release is
- * rather a risky business.
- *
- * For this release it's probably safer to stick with such a rather
- * dirty solution, and look for a cleaner fix during the next release
- * cycle.
+/* This struct is to be 16-byte aligned; we also take some extra precautions:
+ * - All the float3 members are at the beginning of the struct, so the compiler
+ *   does not insert its own padding trying to align these members.
+ * - We make sure the OSL pointer is also 16-byte aligned.
  */
 typedef struct ShaderClosure {
-	ClosureType type;
 	float3 weight;
-#ifndef __APPLE__
+	float3 N;
+	float3 T;
+
+	ClosureType type;
 	float sample_weight;
-#endif
 	float data0;
 	float data1;
 	float data2;
+	int pad1, pad2, pad3;
 
-	float3 N;
-	float3 T;
-#ifdef __APPLE__
-	float sample_weight;
-#endif
 #ifdef __OSL__
-	void *prim;
+	void *prim, *pad4;
 #endif
 } ShaderClosure;
 
@@ -591,43 +582,46 @@ typedef enum ShaderContext {
 
 enum ShaderDataFlag {
 	/* runtime flags */
-	SD_BACKFACING = 1,		/* backside of surface? */
-	SD_EMISSION = 2,		/* have emissive closure? */
-	SD_BSDF = 4,			/* have bsdf closure? */
-	SD_BSDF_HAS_EVAL = 8,	/* have non-singular bsdf closure? */
-	SD_PHASE_HAS_EVAL = 8,	/* have non-singular phase closure? */
-	SD_BSDF_GLOSSY = 16,	/* have glossy bsdf */
-	SD_BSSRDF = 32,			/* have bssrdf */
-	SD_HOLDOUT = 64,		/* have holdout closure? */
-	SD_ABSORPTION = 128,	/* have volume absorption closure? */
-	SD_SCATTER = 256,		/* have volume phase closure? */
-	SD_AO = 512,			/* have ao closure? */
-	SD_TRANSPARENT = 1024,	/* have transparent closure? */
-
-	SD_CLOSURE_FLAGS = (SD_EMISSION|SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY|
-	                    SD_BSSRDF|SD_HOLDOUT|SD_ABSORPTION|SD_SCATTER|SD_AO),
+	SD_BACKFACING     = (1 << 0),   /* backside of surface? */
+	SD_EMISSION       = (1 << 1),   /* have emissive closure? */
+	SD_BSDF           = (1 << 2),   /* have bsdf closure? */
+	SD_BSDF_HAS_EVAL  = (1 << 3),   /* have non-singular bsdf closure? */
+	SD_PHASE_HAS_EVAL = (1 << 3),   /* have non-singular phase closure? */
+	SD_BSSRDF         = (1 << 4),   /* have bssrdf */
+	SD_HOLDOUT        = (1 << 5),   /* have holdout closure? */
+	SD_ABSORPTION     = (1 << 6),   /* have volume absorption closure? */
+	SD_SCATTER        = (1 << 7),   /* have volume phase closure? */
+	SD_AO             = (1 << 8),   /* have ao closure? */
+	SD_TRANSPARENT    = (1 << 9),  /* have transparent closure? */
+
+	SD_CLOSURE_FLAGS = (SD_EMISSION|SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF|
+	                    SD_HOLDOUT|SD_ABSORPTION|SD_SCATTER|SD_AO),
 
 	/* shader flags */
-	SD_USE_MIS = 2048,					/* direct light sample */
-	SD_HAS_TRANSPARENT_SHADOW = 4096,	/* has transparent shadow */
-	SD_HAS_VOLUME = 8192,				/* has volume shader */
-	SD_HAS_ONLY_VOLUME = 16384,			/* has only volume shader, no surface */
-	SD_HETEROGENEOUS_VOLUME = 32768,	/* has heterogeneous volume */
-	SD_HAS_BSSRDF_BUMP = 65536,			/* bssrdf normal uses bump */
-	SD_VOLUME_EQUIANGULAR = 131072,		/* use equiangular sampling */
-	SD_VOLUME_MIS = 262144,				/* use multiple importance sampling */
+	SD_USE_MIS                = (1 << 10),  /* direct light sample */
+	SD_HAS_TRANSPARENT_SHADOW = (1 << 11),  /* has transparent shadow */
+	SD_HAS_VOLUME             = (1 << 12),  /* has volume shader */
+	SD_HAS_ONLY_VOLUME        = (1 << 13),  /* has only volume shader, no surface */
+	SD_HETEROGENEOUS_VOLUME   = (1 << 14),  /* has heterogeneous volume */
+	SD_HAS_BSSRDF_BUMP        = (1 << 15),  /* bssrdf normal uses bump */
+	SD_VOLUME_EQUIANGULAR     = (1 << 16),  /* use equiangular sampling */
+	SD_VOLUME_MIS             = (1 << 17),  /* use multiple importance sampling */
+	SD_VOLUME_CUBIC           = (1 << 18),  /* use cubic interpolation for voxels */
+	SD_HAS_BUMP               = (1 << 19),  /* has data connected to the displacement input */
 
 	SD_SHADER_FLAGS = (SD_USE_MIS|SD_HAS_TRANSPARENT_SHADOW|SD_HAS_VOLUME|
 	                   SD_HAS_ONLY_VOLUME|SD_HETEROGENEOUS_VOLUME|
-					   SD_HAS_BSSRDF_BUMP|SD_VOLUME_EQUIANGULAR|SD_VOLUME_MIS),
+	                   SD_HAS_BSSRDF_BUMP|SD_VOLUME_EQUIANGULAR|SD_VOLUME_MIS|
+	                   SD_VOLUME_CUBIC|SD_HAS_BUMP),
 
 	/* object flags */
-	SD_HOLDOUT_MASK = 524288,			/* holdout for camera rays */
-	SD_OBJECT_MOTION = 1048576,			/* has object motion blur */
-	SD_TRANSFORM_APPLIED = 2097152,		/* vertices have transform applied */
-	SD_NEGATIVE_SCALE_APPLIED = 4194304,	/* vertices have negative scale applied */
-	SD_OBJECT_HAS_VOLUME = 8388608,		/* object has a volume shader */
-	SD_OBJECT_INTERSECTS_VOLUME = 16777216, /* object intersects AABB of an object with volume shader */
+	SD_HOLDOUT_MASK             = (1 << 20),  /* holdout for camera rays */
+	SD_OBJECT_MOTION            = (1 << 21),  /* has object motion blur */
+	SD_TRANSFORM_APPLIED        = (1 << 22),  /* vertices have transform applied */
+	SD_NEGATIVE_SCALE_APPLIED   = (1 << 23),  /* vertices have negative scale applied */
+	SD_OBJECT_HAS_VOLUME        = (1 << 24),  /* object has a volume shader */
+	SD_OBJECT_INTERSECTS_VOLUME = (1 << 25),  /* object intersects AABB of an object with volume shader */
+	SD_OBJECT_HAS_VERTEX_MOTION = (1 << 21),  /* has position for motion vertices */
 
 	SD_OBJECT_FLAGS = (SD_HOLDOUT_MASK|SD_OBJECT_MOTION|SD_TRANSFORM_APPLIED|
 	                   SD_NEGATIVE_SCALE_APPLIED|SD_OBJECT_HAS_VOLUME|
@@ -994,7 +988,7 @@ typedef struct KernelData {
 
 #ifdef __KERNEL_DEBUG__
 typedef struct DebugData {
-	// Total number of BVH node travesal steps and primitives intersections
+	// Total number of BVH node traversal steps and primitives intersections
 	// for the camera rays.
 	int num_bvh_traversal_steps;
 } DebugData;
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index 93cb4c120ea..6a39ba928f0 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -581,7 +581,8 @@ ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals
  * through a volume. This can then latter be used for decoupled sampling as in:
  * "Importance Sampling Techniques for Path Tracing in Participating Media"
  *
- * On the GPU this is only supported for homogeneous volumes (1 step), due to
+ * On the GPU this is only supported (but currently not enabled)
+ * for homogeneous volumes (1 step), due to
  * no support for malloc/free and too much stack usage with a fix size array. */
 
 typedef struct VolumeStep {
@@ -595,6 +596,7 @@ typedef struct VolumeStep {
 } VolumeStep;
 
 typedef struct VolumeSegment {
+	VolumeStep stack_step;      /* stack storage for homogeneous step, to avoid malloc */
 	VolumeStep *steps;			/* recorded steps */
 	int numsteps;				/* number of steps */
 	int closure_flag;			/* accumulated closure flags from all steps */
@@ -627,11 +629,13 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 
 		/* compute exact steps in advance for malloc */
 		max_steps = max((int)ceilf(ray->t/step_size), 1);
+		segment->steps = (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps);
 	}
 	else {
 		max_steps = 1;
 		step_size = ray->t;
 		random_jitter_offset = 0.0f;
+		segment->steps = &segment->stack_step;
 	}
 	
 	/* init accumulation variables */
@@ -640,10 +644,8 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 	float3 cdf_distance = make_float3(0.0f, 0.0f, 0.0f);
 	float t = 0.0f;
 
-	segment->closure_flag = 0;
 	segment->numsteps = 0;
-
-	segment->steps = (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps);
+	segment->closure_flag = 0;
 
 	VolumeStep *step = segment->steps;
 
@@ -729,16 +731,13 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 
 ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *segment)
 {
-	free(segment->steps);
+	if(segment->steps != &segment->stack_step)
+		free(segment->steps);
 }
 
 /* scattering for homogeneous and heterogeneous volumes, using decoupled ray
- * marching. unlike the non-decoupled functions, these do not do probalistic
- * scattering, they always scatter if there is any non-zero scattering
- * coefficient.
+ * marching. this function does not do emission or modify throughput. 
  *
- * these also do not do emission or modify throughput. 
- * 
  * function is expected to return VOLUME_PATH_SCATTERED when probalistic_scatter is false */
 ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 	KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd,
@@ -958,7 +957,7 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
                                          Ray *ray,
                                          VolumeStack *stack)
 {
-	/* NULL ray happens in the baker, does it need proper initializetion of
+	/* NULL ray happens in the baker, does it need proper initialization of
 	 * camera in volume?
 	 */
 	if(!kernel_data.cam.is_inside_volume || ray == NULL) {
@@ -992,31 +991,29 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
 
 		ShaderData sd;
 		shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0);
-		if(sd.flag & SD_HAS_VOLUME) {
-			if(sd.flag & SD_BACKFACING) {
-				/* If ray exited the volume and never entered to that volume
-				 * it means that camera is inside such a volume.
-				 */
-				bool is_enclosed = false;
-				for(int i = 0; i < enclosed_index; ++i) {
-					if(enclosed_volumes[i] == sd.object) {
-						is_enclosed = true;
-						break;
-					}
-				}
-				if(is_enclosed == false) {
-					stack[stack_index].object = sd.object;
-					stack[stack_index].shader = sd.shader;
-					++stack_index;
+		if(sd.flag & SD_BACKFACING) {
+			/* If the ray exited the volume without ever having entered it,
+			 * it means that the camera is inside that volume.
+			 */
+			bool is_enclosed = false;
+			for(int i = 0; i < enclosed_index; ++i) {
+				if(enclosed_volumes[i] == sd.object) {
+					is_enclosed = true;
+					break;
 				}
 			}
-			else {
-				/* If ray from camera enters the volume, this volume shouldn't
-				 * be added to the stak on exit.
-				 */
-				enclosed_volumes[enclosed_index++] = sd.object;
+			if(is_enclosed == false) {
+				stack[stack_index].object = sd.object;
+				stack[stack_index].shader = sd.shader;
+				++stack_index;
 			}
 		}
+		else {
+			/* If ray from camera enters the volume, this volume shouldn't
+			 * be added to the stack on exit.
+			 */
+			enclosed_volumes[enclosed_index++] = sd.object;
+		}
 
 		/* Move ray forward. */
 		volume_ray.P = ray_offset(sd.P, -sd.Ng);
diff --git a/intern/cycles/kernel/osl/SConscript b/intern/cycles/kernel/osl/SConscript
index d721edbaf6e..0a21d3e6819 100644
--- a/intern/cycles/kernel/osl/SConscript
+++ b/intern/cycles/kernel/osl/SConscript
@@ -38,6 +38,7 @@ incs.append(env['BF_OIIO_INC'])
 incs.append(env['BF_BOOST_INC'])
 incs.append(env['BF_OSL_INC'])
 incs.append(env['BF_OPENEXR_INC'].split())
+incs.append('#/intern/atomic')
 
 defs.append('CCL_NAMESPACE_BEGIN=namespace ccl {')
 defs.append('CCL_NAMESPACE_END=}')
@@ -46,6 +47,16 @@ defs.append('WITH_OSL')
 if env['WITH_BF_CYCLES_DEBUG']:
     defs.append('WITH_CYCLES_DEBUG')
 
+if env['WITH_BF_CYCLES_LOGGING']:
+    defs.append('WITH_CYCLES_LOGGING')
+    defs.append('GOOGLE_GLOG_DLL_DECL=')
+    if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', 'win64-mingw'):
+        incs.append('#extern/libmv/third_party/glog/src/windows')
+        incs.append('#extern/libmv/third_party/gflags')
+    else:
+        incs.append('#extern/libmv/third_party/glog/src')
+        incs.append('#extern/libmv/third_party/gflags')
+
 if env['OURPLATFORM'] in ('win32-vc', 'win64-vc'):
     cxxflags.append('-DBOOST_NO_RTTI -DBOOST_NO_TYPEID /fp:fast'.split())
     incs.append(env['BF_PTHREADS_INC'])
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index cc9942b024e..1d99f1d2682 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -189,11 +189,7 @@ static void register_closure(OSL::ShadingSystem *ss, const char *name, int id, O
 	/* optimization: it's possible to not use a prepare function at all and
 	 * only initialize the actual class when accessing the closure component
 	 * data, but then we need to map the id to the class somehow */
-#ifdef CLOSURE_PREPARE
-	ss->register_closure(name, id, params, prepare, NULL, NULL);
-#else
-	ss->register_closure(name, id, params, prepare, NULL);
-#endif
+	ss->register_closure(name, id, params, prepare, NULL, 16);
 }
 
 void OSLShader::register_closures(OSLShadingSystem *ss_)
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index a9694651e14..7a93aa05222 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -834,7 +834,7 @@ bool OSLRenderServices::has_userdata(ustring name, TypeDesc type, OSL::ShaderGlo
 bool OSLRenderServices::texture(ustring filename, TextureOpt &options,
                                 OSL::ShaderGlobals *sg,
                                 float s, float t, float dsdx, float dtdx,
-                                float dsdy, float dtdy, float *result)
+                                float dsdy, float dtdy, int nchannels, float *result)
 {
 	OSL::TextureSystem *ts = osl_ts;
 	ShaderData *sd = (ShaderData *)(sg->renderstate);
@@ -869,9 +869,9 @@ bool OSLRenderServices::texture(ustring filename, TextureOpt &options,
 		PtexFilter::Options opts(PtexFilter::f_bicubic, mipmaplerp, sharpness);
 		PtexPtr<PtexFilter> f(PtexFilter::getFilter(r, opts));
 
-		f->eval(result, options.firstchannel, options.nchannels, faceid, u, v, dudx, dvdx, dudy, dvdy);
+		f->eval(result, options.firstchannel, nchannels, faceid, u, v, dudx, dvdx, dudy, dvdy);
 
-		for(int c = r->numChannels(); c < options.nchannels; c++)
+		for(int c = r->numChannels(); c < nchannels; c++)
 			result[c] = result[0];
 
 		return true;
@@ -880,15 +880,15 @@ bool OSLRenderServices::texture(ustring filename, TextureOpt &options,
 	bool status;
 
 	if(filename[0] == '@' && filename.find('.') == -1) {
-        int slot = atoi(filename.c_str() + 1);
+		int slot = atoi(filename.c_str() + 1);
 		float4 rgba = kernel_tex_image_interp(slot, s, 1.0f - t);
 
 		result[0] = rgba[0];
-		if(options.nchannels > 1)
+		if(nchannels > 1)
 			result[1] = rgba[1];
-		if(options.nchannels > 2)
+		if(nchannels > 2)
 			result[2] = rgba[2];
-		if(options.nchannels > 3)
+		if(nchannels > 3)
 			result[3] = rgba[3];
 		status = true;
 	}
@@ -898,17 +898,24 @@ bool OSLRenderServices::texture(ustring filename, TextureOpt &options,
 
 		OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info);
 
+#if OIIO_VERSION < 10500
 		status = ts->texture(th, thread_info,
-		                     options, s, t, dsdx, dtdx, dsdy, dtdy, result);
+		                     options, s, t, dsdx, dtdx, dsdy, dtdy,
+		                     result);
+#else
+		status = ts->texture(th, thread_info,
+		                     options, s, t, dsdx, dtdx, dsdy, dtdy,
+		                     nchannels, result);
+#endif
 	}
 
 	if(!status) {
-		if(options.nchannels == 3 || options.nchannels == 4) {
+		if(nchannels == 3 || nchannels == 4) {
 			result[0] = 1.0f;
 			result[1] = 0.0f;
 			result[2] = 1.0f;
 
-			if(options.nchannels == 4)
+			if(nchannels == 4)
 				result[3] = 1.0f;
 		}
 	}
@@ -919,7 +926,7 @@ bool OSLRenderServices::texture(ustring filename, TextureOpt &options,
 bool OSLRenderServices::texture3d(ustring filename, TextureOpt &options,
                                   OSL::ShaderGlobals *sg, const OSL::Vec3 &P,
                                   const OSL::Vec3 &dPdx, const OSL::Vec3 &dPdy,
-                                  const OSL::Vec3 &dPdz, float *result)
+                                  const OSL::Vec3 &dPdz, int nchannels, float *result)
 {
 	OSL::TextureSystem *ts = osl_ts;
 	ShaderData *sd = (ShaderData *)(sg->renderstate);
@@ -929,16 +936,22 @@ bool OSLRenderServices::texture3d(ustring filename, TextureOpt &options,
 
 	OIIO::TextureSystem::TextureHandle *th =  ts->get_texture_handle(filename, thread_info);
 
+#if OIIO_VERSION < 10500
 	bool status = ts->texture3d(th, thread_info,
 	                            options, P, dPdx, dPdy, dPdz, result);
+#else
+	bool status = ts->texture3d(th, thread_info,
+	                            options, P, dPdx, dPdy, dPdz,
+	                            nchannels, result);
+#endif
 
 	if(!status) {
-		if(options.nchannels == 3 || options.nchannels == 4) {
+		if(nchannels == 3 || nchannels == 4) {
 			result[0] = 1.0f;
 			result[1] = 0.0f;
 			result[2] = 1.0f;
 
-			if(options.nchannels == 4)
+			if(nchannels == 4)
 				result[3] = 1.0f;
 		}
 
@@ -949,7 +962,8 @@ bool OSLRenderServices::texture3d(ustring filename, TextureOpt &options,
 
 bool OSLRenderServices::environment(ustring filename, TextureOpt &options,
                                     OSL::ShaderGlobals *sg, const OSL::Vec3 &R,
-                                    const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy, float *result)
+                                    const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy,
+                                    int nchannels, float *result)
 {
 	OSL::TextureSystem *ts = osl_ts;
 	ShaderData *sd = (ShaderData *)(sg->renderstate);
@@ -958,16 +972,23 @@ bool OSLRenderServices::environment(ustring filename, TextureOpt &options,
 	OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info;
 
 	OIIO::TextureSystem::TextureHandle *th =  ts->get_texture_handle(filename, thread_info);
+
+#if OIIO_VERSION < 10500
 	bool status = ts->environment(th, thread_info,
 	                              options, R, dRdx, dRdy, result);
+#else
+	bool status = ts->environment(th, thread_info,
+	                              options, R, dRdx, dRdy,
+	                              nchannels, result);
+#endif
 
 	if(!status) {
-		if(options.nchannels == 3 || options.nchannels == 4) {
+		if(nchannels == 3 || nchannels == 4) {
 			result[0] = 1.0f;
 			result[1] = 0.0f;
 			result[2] = 1.0f;
 
-			if(options.nchannels == 4)
+			if(nchannels == 4)
 				result[3] = 1.0f;
 		}
 	}
diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h
index 6f928a0d103..e9026d95f34 100644
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -97,16 +97,17 @@ public:
 	bool texture(ustring filename, TextureOpt &options,
 	             OSL::ShaderGlobals *sg,
 	             float s, float t, float dsdx, float dtdx,
-	             float dsdy, float dtdy, float *result);
+	             float dsdy, float dtdy, int nchannels, float *result);
 
 	bool texture3d(ustring filename, TextureOpt &options,
 	               OSL::ShaderGlobals *sg, const OSL::Vec3 &P,
 	               const OSL::Vec3 &dPdx, const OSL::Vec3 &dPdy,
-	               const OSL::Vec3 &dPdz, float *result);
+	               const OSL::Vec3 &dPdz, int nchannels, float *result);
 
 	bool environment(ustring filename, TextureOpt &options,
 	                 OSL::ShaderGlobals *sg, const OSL::Vec3 &R,
-	                 const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy, float *result);
+	                 const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy,
+	                 int nchannels, float *result);
 
 	bool get_texture_info(OSL::ShaderGlobals *sg, ustring filename, int subimage,
 	                      ustring dataname, TypeDesc datatype, void *data);
@@ -159,70 +160,37 @@ public:
 	static ustring u_v;
 	static ustring u_empty;
 
-#if OSL_LIBRARY_VERSION_CODE < 10500
-	bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) {
-		return get_matrix(NULL, result, xform, time);
-	}
-
-	bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) {
-		return get_inverse_matrix(NULL, result, xform, time);
-	}
-
-	bool get_matrix(OSL::Matrix44 &result, ustring from, float time) {
-		return get_matrix(NULL, result, from, time);
-	}
-
-	bool get_inverse_matrix(OSL::Matrix44 &result, ustring to, float time) {
-		return get_inverse_matrix(NULL, result, to, time);
-	}
-
-	bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) {
-		return get_matrix(NULL, result, xform);
-	}
-
-	bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) {
-		return get_inverse_matrix(NULL, result, xform);
-	}
+	/* Code to make OSL versions transition smooth. */
 
-	bool get_matrix(OSL::Matrix44 &result, ustring from) {
-		return get_matrix(NULL, result, from);
+#if OSL_LIBRARY_VERSION_CODE < 10600
+	inline bool texture(ustring filename, TextureOpt &options,
+	                    OSL::ShaderGlobals *sg,
+	                    float s, float t, float dsdx, float dtdx,
+	                    float dsdy, float dtdy, float *result)
+	{
+		return texture(filename, options, sg, s, t, dsdx, dtdx, dsdy, dtdy,
+		               options.nchannels, result);
 	}
 
-	bool get_inverse_matrix(OSL::Matrix44 &result, ustring to) {
-		return get_inverse_matrix(NULL, result, to);
+	inline bool texture3d(ustring filename, TextureOpt &options,
+	                      OSL::ShaderGlobals *sg, const OSL::Vec3 &P,
+	                      const OSL::Vec3 &dPdx, const OSL::Vec3 &dPdy,
+	                      const OSL::Vec3 &dPdz, float *result)
+	{
+		return texture3d(filename, options, sg, P, dPdx, dPdy, dPdz,
+		                 options.nchannels, result);
 	}
 
-	bool get_array_attribute(void *renderstate, bool derivatives,
-	                         ustring object, TypeDesc type, ustring name,
-	                         int index, void *val) {
-		OSL::ShaderGlobals sg;
-		sg.renderstate = renderstate;
-		return get_array_attribute(&sg, derivatives,
-		                           object, type, name,
-		                           index, val);
-	}
-
-	bool get_attribute(void *renderstate, bool derivatives, ustring object_name,
-	                   TypeDesc type, ustring name, void *val) {
-		OSL::ShaderGlobals sg;
-		sg.renderstate = renderstate;
-		return get_attribute(&sg, derivatives, object_name, type, name, val);
-	}
-
-	bool has_userdata(ustring name, TypeDesc type, void *renderstate) {
-		return has_userdata(name, type, (OSL::ShaderGlobals *) renderstate);
-	}
-
-	bool get_userdata(bool derivatives, ustring name, TypeDesc type,
-	                  void *renderstate, void *val) {
-		return get_userdata(derivatives, name, type, (OSL::ShaderGlobals *) renderstate, val);
-	}
-
-	bool get_texture_info(ustring filename, int subimage,
-	                      ustring dataname, TypeDesc datatype, void *data) {
-		return get_texture_info(NULL, filename, subimage, dataname, datatype, data);
+	inline bool environment(ustring filename, TextureOpt &options,
+	                        OSL::ShaderGlobals *sg, const OSL::Vec3 &R,
+	                        const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy,
+	                        float *result)
+	{
+		return environment(filename, options, sg, R, dRdx, dRdy,
+		                   options.nchannels, result);
 	}
 #endif
+
 private:
 	KernelGlobals *kernel_globals;
 	OSL::TextureSystem *osl_ts;
diff --git a/intern/cycles/kernel/shaders/node_combine_hsv.osl b/intern/cycles/kernel/shaders/node_combine_hsv.osl
index 010773acc5c..574bad30b14 100644
--- a/intern/cycles/kernel/shaders/node_combine_hsv.osl
+++ b/intern/cycles/kernel/shaders/node_combine_hsv.osl
@@ -15,6 +15,7 @@
  */
 
 #include "stdosl.h"
+#include "node_color.h"
 
 shader node_combine_hsv(
 	float H = 0.0,
@@ -22,6 +23,6 @@ shader node_combine_hsv(
 	float V = 0.0,
 	output color Color = 0.8)
 {
-	Color = color("hsv", H, S, V);	
+	Color = color_srgb_to_scene_linear(color("hsv", H, S, V));
 }
 
diff --git a/intern/cycles/kernel/shaders/node_hsv.osl b/intern/cycles/kernel/shaders/node_hsv.osl
index 4722bde4cd7..5f4300ee31d 100644
--- a/intern/cycles/kernel/shaders/node_hsv.osl
+++ b/intern/cycles/kernel/shaders/node_hsv.osl
@@ -35,6 +35,11 @@ shader node_hsv(
 
 	Color = hsv_to_rgb(Color);
 
+	// Clamp color to prevent negative values caused by oversaturation.
+	Color[0] = max(Color[0], 0.0);
+	Color[1] = max(Color[1], 0.0);
+	Color[2] = max(Color[2], 0.0);
+
 	ColorOut = mix(ColorIn, Color, Fac);
 }
 
diff --git a/intern/cycles/kernel/shaders/node_normal.osl b/intern/cycles/kernel/shaders/node_normal.osl
index 14af044e0c0..002eddb574c 100644
--- a/intern/cycles/kernel/shaders/node_normal.osl
+++ b/intern/cycles/kernel/shaders/node_normal.osl
@@ -23,6 +23,6 @@ shader node_normal(
 	output float Dot = 1.0)
 {
 	NormalOut = normalize(Direction);
-	Dot = dot(NormalOut, NormalIn);
+	Dot = dot(NormalOut, normalize(NormalIn));
 }
 
diff --git a/intern/cycles/kernel/shaders/node_separate_hsv.osl b/intern/cycles/kernel/shaders/node_separate_hsv.osl
index 94fc5de9122..8bfb04aea1c 100644
--- a/intern/cycles/kernel/shaders/node_separate_hsv.osl
+++ b/intern/cycles/kernel/shaders/node_separate_hsv.osl
@@ -23,7 +23,7 @@ shader node_separate_hsv(
 	output float S = 0.0,
 	output float V = 0.0)
 {
-	color col = rgb_to_hsv(Color);
+	color col = rgb_to_hsv(color_scene_linear_to_srgb(Color));
 	
 	H = col[0];
 	S = col[1];
diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h
index 1ff8f363b49..6babe98717c 100644
--- a/intern/cycles/kernel/shaders/stdosl.h
+++ b/intern/cycles/kernel/shaders/stdosl.h
@@ -505,6 +505,47 @@ closure color hair_transmission(normal N, float roughnessu, float roughnessv, ve
 closure color henyey_greenstein(float g) BUILTIN;
 closure color absorption() BUILTIN;
 
+// OSL 1.5 Microfacet functions
+closure color microfacet(string distribution, normal N, vector U, float xalpha, float yalpha, float eta, int refract) {
+	/* GGX */
+	if (distribution == "ggx" || distribution == "default") {
+		if (!refract) {
+			if (xalpha == yalpha) {
+				/* Isotropic */
+				return microfacet_ggx(N, xalpha);
+			}
+			else {
+				/* Anisotropic */
+				return microfacet_ggx_aniso(N, U, xalpha, yalpha);
+			}
+		}
+		else {
+			return microfacet_ggx_refraction(N, xalpha, eta);
+		}
+	}
+	/* Beckmann */
+	else {
+		if (!refract) {
+			if (xalpha == yalpha) {
+				/* Isotropic */
+				return microfacet_beckmann(N, xalpha);
+			}
+			else {
+				/* Anisotropic */
+				return microfacet_beckmann_aniso(N, U, xalpha, yalpha);
+			}
+		}
+		else {
+			return microfacet_beckmann_refraction(N, xalpha, eta);
+		}
+	}
+}
+
+closure color microfacet (string distribution, normal N, float alpha, float eta, int refract) {
+	return microfacet(distribution, N, vector(0), alpha, alpha, eta, refract);
+}
+
+
 // Renderer state
 int backfacing () BUILTIN;
 int raytype (string typename) BUILTIN;
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index c13eae813d6..5acfbbf972b 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -164,6 +164,7 @@ CCL_NAMESPACE_END
 #include "svm_mapping.h"
 #include "svm_normal.h"
 #include "svm_wave.h"
+#include "svm_math_util.h"
 #include "svm_math.h"
 #include "svm_mix.h"
 #include "svm_ramp.h"
diff --git a/intern/cycles/kernel/svm/svm_hsv.h b/intern/cycles/kernel/svm/svm_hsv.h
index 11dfc4f096b..a02d853be1a 100644
--- a/intern/cycles/kernel/svm/svm_hsv.h
+++ b/intern/cycles/kernel/svm/svm_hsv.h
@@ -46,6 +46,11 @@ ccl_device void svm_node_hsv(KernelGlobals *kg, ShaderData *sd, float *stack, ui
 	color.y = fac*color.y + (1.0f - fac)*in_color.y;
 	color.z = fac*color.z + (1.0f - fac)*in_color.z;
 
+	/* Clamp color to prevent negative values caused by oversaturation. */
+	color.x = max(color.x, 0.0f);
+	color.y = max(color.y, 0.0f);
+	color.z = max(color.z, 0.0f);
+
 	if (stack_valid(out_color_offset))
 		stack_store_float3(stack, out_color_offset, color);
 }
diff --git a/intern/cycles/kernel/svm/svm_math.h b/intern/cycles/kernel/svm/svm_math.h
index 1ce9386e40e..e3d8c1f3242 100644
--- a/intern/cycles/kernel/svm/svm_math.h
+++ b/intern/cycles/kernel/svm/svm_math.h
@@ -16,56 +16,6 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device float svm_math(NodeMath type, float Fac1, float Fac2)
-{
-	float Fac;
-
-	if(type == NODE_MATH_ADD)
-		Fac = Fac1 + Fac2;
-	else if(type == NODE_MATH_SUBTRACT)
-		Fac = Fac1 - Fac2;
-	else if(type == NODE_MATH_MULTIPLY)
-		Fac = Fac1*Fac2;
-	else if(type == NODE_MATH_DIVIDE)
-		Fac = safe_divide(Fac1, Fac2);
-	else if(type == NODE_MATH_SINE)
-		Fac = sinf(Fac1);
-	else if(type == NODE_MATH_COSINE)
-		Fac = cosf(Fac1);
-	else if(type == NODE_MATH_TANGENT)
-		Fac = tanf(Fac1);
-	else if(type == NODE_MATH_ARCSINE)
-		Fac = safe_asinf(Fac1);
-	else if(type == NODE_MATH_ARCCOSINE)
-		Fac = safe_acosf(Fac1);
-	else if(type == NODE_MATH_ARCTANGENT)
-		Fac = atanf(Fac1);
-	else if(type == NODE_MATH_POWER)
-		Fac = safe_powf(Fac1, Fac2);
-	else if(type == NODE_MATH_LOGARITHM)
-		Fac = safe_logf(Fac1, Fac2);
-	else if(type == NODE_MATH_MINIMUM)
-		Fac = fminf(Fac1, Fac2);
-	else if(type == NODE_MATH_MAXIMUM)
-		Fac = fmaxf(Fac1, Fac2);
-	else if(type == NODE_MATH_ROUND)
-		Fac = floorf(Fac1 + 0.5f);
-	else if(type == NODE_MATH_LESS_THAN)
-		Fac = Fac1 < Fac2;
-	else if(type == NODE_MATH_GREATER_THAN)
-		Fac = Fac1 > Fac2;
-	else if(type == NODE_MATH_MODULO)
-		Fac = safe_modulo(Fac1, Fac2);
-    else if(type == NODE_MATH_ABSOLUTE)
-        Fac = fabsf(Fac1);
-	else if(type == NODE_MATH_CLAMP)
-		Fac = clamp(Fac1, 0.0f, 1.0f);
-	else
-		Fac = 0.0f;
-	
-	return Fac;
-}
-
 ccl_device float average_fac(float3 v)
 {
 	return (fabsf(v.x) + fabsf(v.y) + fabsf(v.z))/3.0f;
diff --git a/intern/cycles/kernel/svm/svm_math_util.h b/intern/cycles/kernel/svm/svm_math_util.h
new file mode 100644
index 00000000000..b813bf531dc
--- /dev/null
+++ b/intern/cycles/kernel/svm/svm_math_util.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device float svm_math(NodeMath type, float Fac1, float Fac2)
+{
+	float Fac;
+
+	if(type == NODE_MATH_ADD)
+		Fac = Fac1 + Fac2;
+	else if(type == NODE_MATH_SUBTRACT)
+		Fac = Fac1 - Fac2;
+	else if(type == NODE_MATH_MULTIPLY)
+		Fac = Fac1*Fac2;
+	else if(type == NODE_MATH_DIVIDE)
+		Fac = safe_divide(Fac1, Fac2);
+	else if(type == NODE_MATH_SINE)
+		Fac = sinf(Fac1);
+	else if(type == NODE_MATH_COSINE)
+		Fac = cosf(Fac1);
+	else if(type == NODE_MATH_TANGENT)
+		Fac = tanf(Fac1);
+	else if(type == NODE_MATH_ARCSINE)
+		Fac = safe_asinf(Fac1);
+	else if(type == NODE_MATH_ARCCOSINE)
+		Fac = safe_acosf(Fac1);
+	else if(type == NODE_MATH_ARCTANGENT)
+		Fac = atanf(Fac1);
+	else if(type == NODE_MATH_POWER)
+		Fac = safe_powf(Fac1, Fac2);
+	else if(type == NODE_MATH_LOGARITHM)
+		Fac = safe_logf(Fac1, Fac2);
+	else if(type == NODE_MATH_MINIMUM)
+		Fac = fminf(Fac1, Fac2);
+	else if(type == NODE_MATH_MAXIMUM)
+		Fac = fmaxf(Fac1, Fac2);
+	else if(type == NODE_MATH_ROUND)
+		Fac = floorf(Fac1 + 0.5f);
+	else if(type == NODE_MATH_LESS_THAN)
+		Fac = Fac1 < Fac2;
+	else if(type == NODE_MATH_GREATER_THAN)
+		Fac = Fac1 > Fac2;
+	else if(type == NODE_MATH_MODULO)
+		Fac = safe_modulo(Fac1, Fac2);
+	else if(type == NODE_MATH_ABSOLUTE)
+		Fac = fabsf(Fac1);
+	else if(type == NODE_MATH_CLAMP)
+		Fac = clamp(Fac1, 0.0f, 1.0f);
+	else
+		Fac = 0.0f;
+	
+	return Fac;
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
index 111d5d47988..abf75b62bd5 100644
--- a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
+++ b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
@@ -26,7 +26,8 @@ ccl_device void svm_node_combine_hsv(KernelGlobals *kg, ShaderData *sd, float *s
 	float value = stack_load_float(stack, value_in);
 	
 	/* Combine, and convert back to RGB */
-	float3 color = hsv_to_rgb(make_float3(hue, saturation, value));
+	float3 color = color_srgb_to_scene_linear(
+	        hsv_to_rgb(make_float3(hue, saturation, value)));
 
 	if (stack_valid(color_out))
 		stack_store_float3(stack, color_out, color);
@@ -40,7 +41,7 @@ ccl_device void svm_node_separate_hsv(KernelGlobals *kg, ShaderData *sd, float *
 	float3 color = stack_load_float3(stack, color_in);
 	
 	/* Convert to HSV */
-	color = rgb_to_hsv(color);
+	color = rgb_to_hsv(color_scene_linear_to_srgb(color));
 
 	if (stack_valid(hue_out))
 		stack_store_float(stack, hue_out, color.x);
diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp
index 45b08832fea..e98931b5c7b 100644
--- a/intern/cycles/render/graph.cpp
+++ b/intern/cycles/render/graph.cpp
@@ -684,7 +684,7 @@ void ShaderGraph::bump_from_displacement()
 	 * different shifted coordinates.
 	 *
 	 * these 3 displacement values are then fed into the bump node, which will
-	 * output the the perturbed normal. */
+	 * output the perturbed normal. */
 
 	ShaderInput *displacement_in = output()->input("Displacement");
 
@@ -844,7 +844,7 @@ void ShaderGraph::dump_graph(const char *filename)
 		return;
 	}
 
-	fprintf(fd, "digraph dependencygraph {\n");
+	fprintf(fd, "digraph shader_graph {\n");
 	fprintf(fd, "ranksep=1.5\n");
 	fprintf(fd, "splines=false\n");
 
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index 03a8cd5d2d3..9b279660f9c 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -39,7 +39,6 @@ Integrator::Integrator()
 	transparent_max_bounce = max_bounce;
 	transparent_shadows = false;
 
-	volume_homogeneous_sampling = 0;
 	volume_max_steps = 1024;
 	volume_step_size = 0.1f;
 
@@ -60,6 +59,10 @@ Integrator::Integrator()
 	mesh_light_samples = 1;
 	subsurface_samples = 1;
 	volume_samples = 1;
+
+	sample_all_lights_direct = true;
+	sample_all_lights_indirect = true;
+
 	method = PATH;
 
 	sampling_pattern = SAMPLING_PATTERN_SOBOL;
@@ -189,7 +192,6 @@ bool Integrator::modified(const Integrator& integrator)
 		transparent_min_bounce == integrator.transparent_min_bounce &&
 		transparent_max_bounce == integrator.transparent_max_bounce &&
 		transparent_shadows == integrator.transparent_shadows &&
-		volume_homogeneous_sampling == integrator.volume_homogeneous_sampling &&
 		volume_max_steps == integrator.volume_max_steps &&
 		volume_step_size == integrator.volume_step_size &&
 		caustics_reflective == integrator.caustics_reflective &&
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 13c10e8ca94..110c354823b 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -39,7 +39,6 @@ public:
 	int transparent_max_bounce;
 	bool transparent_shadows;
 
-	int volume_homogeneous_sampling;
 	int volume_max_steps;
 	float volume_step_size;
 
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 1f006637e67..8d1cec10187 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -125,6 +125,7 @@ Light::Light()
 
 	shader = 0;
 	samples = 1;
+	max_bounces = 1024;
 }
 
 void Light::tag_update(Scene *scene)
@@ -489,6 +490,7 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 		float3 co = light->co;
 		int shader_id = scene->shader_manager->get_shader_id(scene->lights[i]->shader);
 		float samples = __int_as_float(light->samples);
+		float max_bounces = __int_as_float(light->max_bounces);
 
 		if(!light->cast_shadow)
 			shader_id &= ~SHADER_CAST_SHADOW;
@@ -523,6 +525,7 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 			light_data[i*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, invarea, 0.0f);
 			light_data[i*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 			light_data[i*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
+			light_data[i*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
 		else if(light->type == LIGHT_DISTANT) {
 			shader_id &= ~SHADER_AREA_LIGHT;
@@ -533,9 +536,8 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 			float area = M_PI_F*radius*radius;
 			float invarea = (area > 0.0f)? 1.0f/area: 1.0f;
 			float3 dir = light->dir;
-			
-			if(len(dir) > 0.0f)
-				dir = normalize(dir);
+
+			dir = safe_normalize(dir);
 
 			if(light->use_mis && area > 0.0f)
 				shader_id |= SHADER_USE_MIS;
@@ -544,6 +546,7 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 			light_data[i*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, cosangle, invarea);
 			light_data[i*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 			light_data[i*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
+			light_data[i*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
 		else if(light->type == LIGHT_BACKGROUND) {
 			uint visibility = scene->background->visibility;
@@ -572,6 +575,7 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 			light_data[i*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), 0.0f, 0.0f, 0.0f);
 			light_data[i*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 			light_data[i*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
+			light_data[i*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
 		else if(light->type == LIGHT_AREA) {
 			float3 axisu = light->axisu*(light->sizeu*light->size);
@@ -580,8 +584,7 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 			float invarea = (area > 0.0f)? 1.0f/area: 1.0f;
 			float3 dir = light->dir;
 			
-			if(len(dir) > 0.0f)
-				dir = normalize(dir);
+			dir = safe_normalize(dir);
 
 			if(light->use_mis && area > 0.0f)
 				shader_id |= SHADER_USE_MIS;
@@ -590,6 +593,7 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 			light_data[i*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), axisu.x, axisu.y, axisu.z);
 			light_data[i*LIGHT_SIZE + 2] = make_float4(invarea, axisv.x, axisv.y, axisv.z);
 			light_data[i*LIGHT_SIZE + 3] = make_float4(samples, dir.x, dir.y, dir.z);
+			light_data[i*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
 		else if(light->type == LIGHT_SPOT) {
 			shader_id &= ~SHADER_AREA_LIGHT;
@@ -600,8 +604,7 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 			float spot_smooth = (1.0f - spot_angle)*light->spot_smooth;
 			float3 dir = light->dir;
 			
-			if(len(dir) > 0.0f)
-				dir = normalize(dir);
+			dir = safe_normalize(dir);
 
 			if(light->use_mis && radius > 0.0f)
 				shader_id |= SHADER_USE_MIS;
@@ -610,6 +613,7 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
 			light_data[i*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, invarea, spot_angle);
 			light_data[i*LIGHT_SIZE + 2] = make_float4(spot_smooth, dir.x, dir.y, dir.z);
 			light_data[i*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
+			light_data[i*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
 	}
 	
diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h
index 89091bb5f9e..cf769ac5aed 100644
--- a/intern/cycles/render/light.h
+++ b/intern/cycles/render/light.h
@@ -58,6 +58,7 @@ public:
 
 	int shader;
 	int samples;
+	int max_bounces;
 
 	void tag_update(Scene *scene);
 };
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index 42103396b53..6137f7d4fdc 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -93,6 +93,8 @@ Mesh::Mesh()
 
 	attributes.triangle_mesh = this;
 	curve_attributes.curve_mesh = this;
+
+	has_volume = false;
 }
 
 Mesh::~Mesh()
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index e8476bfac4c..62bdf7cd162 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -17,6 +17,7 @@
 #include "image.h"
 #include "nodes.h"
 #include "svm.h"
+#include "svm_math_util.h"
 #include "osl.h"
 #include "sky_model.h"
 
@@ -3669,7 +3670,7 @@ static ShaderEnum math_type_init()
 	enm.insert("Less Than", NODE_MATH_LESS_THAN);
 	enm.insert("Greater Than", NODE_MATH_GREATER_THAN);
 	enm.insert("Modulo", NODE_MATH_MODULO);
-    enm.insert("Absolute", NODE_MATH_ABSOLUTE);
+	enm.insert("Absolute", NODE_MATH_ABSOLUTE);
 
 	return enm;
 }
@@ -3682,9 +3683,24 @@ void MathNode::compile(SVMCompiler& compiler)
 	ShaderInput *value2_in = input("Value2");
 	ShaderOutput *value_out = output("Value");
 
+	compiler.stack_assign(value_out);
+
+	/* Optimize math node without links to a single value node. */
+	if(value1_in->link == NULL && value2_in->link == NULL) {
+		float optimized_value = svm_math((NodeMath)type_enum[type],
+		                                 value1_in->value.x,
+		                                 value2_in->value.x);
+		if(use_clamp) {
+			optimized_value = clamp(optimized_value, 0.0f, 1.0f);
+		}
+		compiler.add_node(NODE_VALUE_F,
+		                  __float_as_int(optimized_value),
+		                  value_out->stack_offset);
+		return;
+	}
+
 	compiler.stack_assign(value1_in);
 	compiler.stack_assign(value2_in);
-	compiler.stack_assign(value_out);
 
 	compiler.add_node(NODE_MATH, type_enum[type], value1_in->stack_offset, value2_in->stack_offset);
 	compiler.add_node(NODE_MATH, value_out->stack_offset);
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index 46ddab235d9..3b2a3ae0b33 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -318,6 +318,9 @@ void ObjectManager::device_update_transforms(Device *device, DeviceScene *dscene
 				mtfm_pre = mtfm_pre * itfm;
 				mtfm_post = mtfm_post * itfm;
 			}
+			else {
+				flag |= SD_OBJECT_HAS_VERTEX_MOTION;
+			}
 
 			memcpy(&objects_vector[i*OBJECT_VECTOR_SIZE+0], &mtfm_pre, sizeof(float4)*3);
 			memcpy(&objects_vector[i*OBJECT_VECTOR_SIZE+3], &mtfm_post, sizeof(float4)*3);
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index f57e16471a1..b9180552ac2 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -248,10 +248,6 @@ void OSLShaderManager::shading_system_free()
 
 bool OSLShaderManager::osl_compile(const string& inputfile, const string& outputfile)
 {
-#if OSL_LIBRARY_VERSION_CODE < 10500
-	typedef string string_view;
-#endif
-
 	vector<string_view> options;
 	string stdosl_path;
 	string shader_path = path_get("shader");
@@ -748,11 +744,7 @@ OSL::ShadingAttribStateRef OSLCompiler::compile_type(Shader *shader, ShaderGraph
 
 	current_type = type;
 
-#if OSL_LIBRARY_VERSION_CODE >= 10501
 	OSL::ShadingAttribStateRef group = ss->ShaderGroupBegin(shader->name.c_str());
-#else
-	ss->ShaderGroupBegin(shader->name.c_str());
-#endif
 
 	ShaderNode *output = graph->output();
 	set<ShaderNode*> dependencies;
@@ -780,13 +772,7 @@ OSL::ShadingAttribStateRef OSLCompiler::compile_type(Shader *shader, ShaderGraph
 
 	ss->ShaderGroupEnd();
 
-#if OSL_LIBRARY_VERSION_CODE >= 10501
 	return group;
-#else
-	OSL::ShadingAttribStateRef group = ss->state();
-	ss->clear_state();
-	return group;
-#endif
 }
 
 void OSLCompiler::compile(OSLGlobals *og, Shader *shader)
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index 6c3f98bc9b0..ccb03eaf1e0 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -153,81 +153,83 @@ void Scene::device_update(Device *device_, Progress& progress)
 	progress.set_status("Updating Shaders");
 	shader_manager->device_update(device, &dscene, this, progress);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Images");
 	image_manager->device_update(device, &dscene, progress);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Background");
 	background->device_update(device, &dscene, this);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Objects");
 	object_manager->device_update(device, &dscene, this, progress);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Meshes");
 	mesh_manager->device_update(device, &dscene, this, progress);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Objects Flags");
 	object_manager->device_update_flags(device, &dscene, this, progress);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Hair Systems");
 	curve_system_manager->device_update(device, &dscene, this, progress);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Lookup Tables");
 	lookup_tables->device_update(device, &dscene);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	/* TODO(sergey): Make sure camera is not needed above. */
 	progress.set_status("Updating Camera");
 	camera->device_update(device, &dscene, this);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Lights");
 	light_manager->device_update(device, &dscene, this, progress);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Particle Systems");
 	particle_system_manager->device_update(device, &dscene, this, progress);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Film");
 	film->device_update(device, &dscene, this);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Integrator");
 	integrator->device_update(device, &dscene, this);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Lookup Tables");
 	lookup_tables->device_update(device, &dscene);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Baking");
 	bake_manager->device_update(device, &dscene, this, progress);
 
-	if(progress.get_cancel()) return;
+	if(progress.get_cancel() || device->have_error()) return;
 
-	progress.set_status("Updating Device", "Writing constant memory");
-	device->const_copy_to("__data", &dscene.data, sizeof(dscene.data));
+	if(device->have_error() == false) {
+		progress.set_status("Updating Device", "Writing constant memory");
+		device->const_copy_to("__data", &dscene.data, sizeof(dscene.data));
+	}
 }
 
 Scene::MotionType Scene::need_motion(bool advanced_shading)
@@ -277,7 +279,8 @@ bool Scene::need_reset()
 		|| shader_manager->need_update
 		|| particle_system_manager->need_update
 		|| curve_system_manager->need_update
-		|| bake_manager->need_update);
+		|| bake_manager->need_update
+		|| film->need_update);
 }
 
 void Scene::reset()
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 9fcd9fa85f5..c03a3dd081d 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -251,7 +251,7 @@ void Session::run_gpu()
 			update_scene();
 
 			if(!device->error_message().empty())
-				progress.set_cancel(device->error_message());
+				progress.set_error(device->error_message());
 
 			if(progress.get_cancel())
 				break;
@@ -292,7 +292,7 @@ void Session::run_gpu()
 			}
 
 			if(!device->error_message().empty())
-				progress.set_cancel(device->error_message());
+				progress.set_error(device->error_message());
 
 			tiles_written = update_progressive_refine(progress.get_cancel());
 
@@ -540,7 +540,7 @@ void Session::run_cpu()
 			update_scene();
 
 			if(!device->error_message().empty())
-				progress.set_cancel(device->error_message());
+				progress.set_error(device->error_message());
 
 			if(progress.get_cancel())
 				break;
@@ -558,7 +558,7 @@ void Session::run_cpu()
 				need_tonemap = true;
 
 			if(!device->error_message().empty())
-				progress.set_cancel(device->error_message());
+				progress.set_error(device->error_message());
 		}
 
 		device->task_wait();
@@ -580,7 +580,7 @@ void Session::run_cpu()
 			}
 
 			if(!device->error_message().empty())
-				progress.set_cancel(device->error_message());
+				progress.set_error(device->error_message());
 
 			tiles_written = update_progressive_refine(progress.get_cancel());
 		}
@@ -604,7 +604,7 @@ void Session::load_kernels()
 			if(message.empty())
 				message = "Failed loading render kernel, see console for errors";
 
-			progress.set_cancel(message);
+			progress.set_error(message);
 			progress.set_status("Error", message);
 			progress.set_update();
 			return;
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index d8925852c21..5c30d191d34 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -138,7 +138,8 @@ Shader::Shader()
 	use_mis = true;
 	use_transparent_shadow = true;
 	heterogeneous_volume = true;
-	volume_sampling_method = 0;
+	volume_sampling_method = VOLUME_SAMPLING_DISTANCE;
+	volume_interpolation_method = VOLUME_INTERPOLATION_LINEAR;
 
 	has_surface = false;
 	has_surface_transparent = false;
@@ -352,10 +353,14 @@ void ShaderManager::device_update_common(Device *device, DeviceScene *dscene, Sc
 			flag |= SD_HAS_BSSRDF_BUMP;
 		if(shader->has_converter_blackbody)
 			has_converter_blackbody = true;
-		if(shader->volume_sampling_method == 1)
+		if(shader->volume_sampling_method == VOLUME_SAMPLING_EQUIANGULAR)
 			flag |= SD_VOLUME_EQUIANGULAR;
-		if(shader->volume_sampling_method == 2)
+		if(shader->volume_sampling_method == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE)
 			flag |= SD_VOLUME_MIS;
+		if(shader->volume_interpolation_method == VOLUME_INTERPOLATION_CUBIC)
+			flag |= SD_VOLUME_CUBIC;
+		if(shader->graph_bump)
+			flag |= SD_HAS_BUMP;
 
 		/* regular shader */
 		shader_flag[i++] = flag;
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index 368496fd188..509c9385e6d 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -18,6 +18,15 @@
 #define __SHADER_H__
 
 #ifdef WITH_OSL
+#  if defined(_MSC_VER)
+/* Prevent OSL from polluting the context with weird macros from windows.h.
+ * TODO(sergey): Ideally it's only enough to have class/struct declarations in
+ * the header and skip header include here.
+ */
+#    define NOGDI
+#    define NOMINMAX
+#    define WIN32_LEAN_AND_MEAN
+#  endif
 #  include <OSL/oslexec.h>
 #endif
 
@@ -44,6 +53,18 @@ enum ShadingSystem {
 	SHADINGSYSTEM_SVM
 };
 
+/* Volume sampling method of a Shader. Keep those in sync with the python-defined enum. */
+enum VolumeSampling {
+	VOLUME_SAMPLING_DISTANCE = 0,  /* default assigned by Shader() */
+	VOLUME_SAMPLING_EQUIANGULAR = 1,  /* mapped to the SD_VOLUME_EQUIANGULAR shader flag */
+	VOLUME_SAMPLING_MULTIPLE_IMPORTANCE = 2,  /* mapped to the SD_VOLUME_MIS shader flag */
+};
+
+enum VolumeInterpolation {
+	VOLUME_INTERPOLATION_LINEAR = 0,  /* default assigned by Shader() */
+	VOLUME_INTERPOLATION_CUBIC = 1,  /* mapped to the SD_VOLUME_CUBIC shader flag */
+};
+
 /* Shader describing the appearance of a Mesh, Light or Background.
  *
  * While there is only a single shader graph, it has three outputs: surface,
@@ -68,7 +89,8 @@ public:
 	bool use_mis;
 	bool use_transparent_shadow;
 	bool heterogeneous_volume;
-	int volume_sampling_method;
+	VolumeSampling volume_sampling_method;
+	int volume_interpolation_method;
 
 	/* synchronization */
 	bool need_update;
diff --git a/intern/cycles/subd/subd_dice.cpp b/intern/cycles/subd/subd_dice.cpp
index 05ff5ca4b65..6bd18d08ba0 100644
--- a/intern/cycles/subd/subd_dice.cpp
+++ b/intern/cycles/subd/subd_dice.cpp
@@ -117,8 +117,8 @@ void EdgeDice::stitch_triangles(Patch *patch, vector<int>& outer, vector<int>& i
 		}
 		else {
 			/* length of diagonals */
-			float len1 = len(mesh_P[inner[i]] - mesh_P[outer[j+1]]);
-			float len2 = len(mesh_P[outer[j]] - mesh_P[inner[i+1]]);
+			float len1 = len_squared(mesh_P[inner[i]] - mesh_P[outer[j+1]]);
+			float len2 = len_squared(mesh_P[outer[j]] - mesh_P[inner[i+1]]);
 
 			/* use smallest diagonal */
 			if(len1 < len2)
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 842d5efac79..a07deb68b15 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -10,7 +10,6 @@ set(INC_SYS
 
 set(SRC
 	util_cache.cpp
-	util_dynlib.cpp
 	util_logging.cpp
 	util_md5.cpp
 	util_path.cpp
@@ -31,10 +30,10 @@ endif()
 set(SRC_HEADERS
 	util_algorithm.h
 	util_args.h
+	util_atomic.h
 	util_boundbox.h
 	util_cache.h
 	util_debug.h
-	util_dynlib.h
 	util_foreach.h
 	util_function.h
 	util_half.h
diff --git a/intern/cycles/util/util_dynlib.h b/intern/cycles/util/util_atomic.h
index b30cf98c1b9..1bbb0a86e23 100644
--- a/intern/cycles/util/util_dynlib.h
+++ b/intern/cycles/util/util_atomic.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011-2013 Blender Foundation
+ * Copyright 2014 Blender Foundation
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,18 +14,20 @@
  * limitations under the License
  */
 
-#ifndef __UTIL_DYNLIB_H__
-#define __UTIL_DYNLIB_H__
+#ifndef __UTIL_ATOMIC_H__
+#define __UTIL_ATOMIC_H__
 
-CCL_NAMESPACE_BEGIN
+/* Using atomic ops header from Blender. */
+#include "atomic_ops.h"
 
-struct DynamicLibrary;
-
-DynamicLibrary *dynamic_library_open(const char *name);
-void *dynamic_library_find(DynamicLibrary *lib, const char *name);
-void dynamic_library_close(DynamicLibrary *lib);
-
-CCL_NAMESPACE_END
-
-#endif /* __UTIL_DYNLIB_H__ */
+/* Atomically raise *maximum_value to at least value. */
+ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value)
+{
+	size_t prev_value = *maximum_value;
+	/* CAS is assumed to return the old value; a mismatch means we raced, so re-read and retry. */
+	while (prev_value < value && atomic_cas_z(maximum_value, prev_value, value) != prev_value) {
+		prev_value = *maximum_value;
+	}
+}
 
+#endif /* __UTIL_ATOMIC_H__ */
diff --git a/intern/cycles/util/util_dynlib.cpp b/intern/cycles/util/util_dynlib.cpp
deleted file mode 100644
index 587cad607c8..00000000000
--- a/intern/cycles/util/util_dynlib.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License
- */
-
-#include <stdlib.h>
-
-#include "util_dynlib.h"
-
-#ifdef _WIN32
-
-#include <windows.h>
-
-CCL_NAMESPACE_BEGIN
-
-struct DynamicLibrary {
-	HMODULE module;
-};
-
-DynamicLibrary *dynamic_library_open(const char *name)
-{
-	HMODULE module = LoadLibrary(name);
-
-	if(!module)
-		return NULL;
-
-	DynamicLibrary *lib = new DynamicLibrary();
-	lib->module = module;
-
-	return lib;
-}
-
-void *dynamic_library_find(DynamicLibrary *lib, const char *name)
-{
-	return (void*)GetProcAddress(lib->module, name);
-}
-
-void dynamic_library_close(DynamicLibrary *lib)
-{
-	FreeLibrary(lib->module);
-	delete lib;
-}
-
-CCL_NAMESPACE_END
-
-#else
-
-#include <dlfcn.h>
-
-CCL_NAMESPACE_BEGIN
-
-struct DynamicLibrary {
-	void *module;
-};
-
-DynamicLibrary *dynamic_library_open(const char *name)
-{
-	void *module = dlopen(name, RTLD_NOW);
-
-	if(!module)
-		return NULL;
-
-	DynamicLibrary *lib = new DynamicLibrary();
-	lib->module = module;
-
-	return lib;
-}
-
-void *dynamic_library_find(DynamicLibrary *lib, const char *name)
-{
-	return dlsym(lib->module, name);
-}
-
-void dynamic_library_close(DynamicLibrary *lib)
-{
-	dlclose(lib->module);
-	delete lib;
-}
-
-CCL_NAMESPACE_END
-
-#endif
-
diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h
index 991789e7460..2c5455051a4 100644
--- a/intern/cycles/util/util_logging.h
+++ b/intern/cycles/util/util_logging.h
@@ -43,7 +43,7 @@ public:
 
 #endif
 
-class float3;
+struct float3;
 
 std::ostream& operator <<(std::ostream &os,
                           const float3 &value);
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index c332e1709db..78005546a01 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -314,6 +314,12 @@ ccl_device_inline float2 normalize_len(const float2 a, float *t)
 	return a/(*t);
 }
 
+ccl_device_inline float2 safe_normalize(const float2 a)
+{
+	const float magnitude = len(a);  /* zero-length input is returned as-is */
+	return (magnitude == 0.0f)? a: a/magnitude;
+}
+
 ccl_device_inline bool operator==(const float2 a, const float2 b)
 {
 	return (a.x == b.x && a.y == b.y);
@@ -510,6 +516,12 @@ ccl_device_inline float3 normalize_len(const float3 a, float *t)
 	return a/(*t);
 }
 
+ccl_device_inline float3 safe_normalize(const float3 a)
+{
+	const float magnitude = len(a);  /* zero-length input is returned as-is */
+	return (magnitude == 0.0f)? a: a/magnitude;
+}
+
 #ifndef __KERNEL_OPENCL__
 
 ccl_device_inline bool operator==(const float3 a, const float3 b)
@@ -817,6 +829,12 @@ ccl_device_inline float4 normalize(const float4 a)
 	return a/len(a);
 }
 
+ccl_device_inline float4 safe_normalize(const float4 a)
+{
+	const float magnitude = len(a);  /* zero-length input is returned as-is */
+	return (magnitude == 0.0f)? a: a/magnitude;
+}
+
 ccl_device_inline float4 min(float4 a, float4 b)
 {
 #ifdef __KERNEL_SSE__
diff --git a/intern/cycles/util/util_md5.cpp b/intern/cycles/util/util_md5.cpp
index add0d18c742..b2a32c45287 100644
--- a/intern/cycles/util/util_md5.cpp
+++ b/intern/cycles/util/util_md5.cpp
@@ -152,8 +152,8 @@ void MD5Hash::process(const uint8_t *data /*[64]*/)
 	 * a = b + ((a + F(b,c,d) + X[k] + T[i]) <<< s). */
 #define F(x, y, z) (((x) & (y)) | (~(x) & (z)))
 #define SET(a, b, c, d, k, s, Ti)\
-  t = a + F(b,c,d) + X[k] + Ti;\
-  a = ROTATE_LEFT(t, s) + b
+	t = a + F(b,c,d) + X[k] + Ti;\
+	a = ROTATE_LEFT(t, s) + b
 	/* Do the following 16 operations. */
 	SET(a, b, c, d,  0,  7,  T1);
 	SET(d, a, b, c,  1, 12,  T2);
@@ -178,8 +178,8 @@ void MD5Hash::process(const uint8_t *data /*[64]*/)
 	 * a = b + ((a + G(b,c,d) + X[k] + T[i]) <<< s). */
 #define G(x, y, z) (((x) & (z)) | ((y) & ~(z)))
 #define SET(a, b, c, d, k, s, Ti)\
-  t = a + G(b,c,d) + X[k] + Ti;\
-  a = ROTATE_LEFT(t, s) + b
+	t = a + G(b,c,d) + X[k] + Ti;\
+	a = ROTATE_LEFT(t, s) + b
 	 /* Do the following 16 operations. */
 	SET(a, b, c, d,  1,  5, T17);
 	SET(d, a, b, c,  6,  9, T18);
@@ -230,8 +230,8 @@ void MD5Hash::process(const uint8_t *data /*[64]*/)
 	 * a = b + ((a + I(b,c,d) + X[k] + T[i]) <<< s). */
 #define I(x, y, z) ((y) ^ ((x) | ~(z)))
 #define SET(a, b, c, d, k, s, Ti)\
-  t = a + I(b,c,d) + X[k] + Ti;\
-  a = ROTATE_LEFT(t, s) + b
+	t = a + I(b,c,d) + X[k] + Ti;\
+	a = ROTATE_LEFT(t, s) + b
 	 /* Do the following 16 operations. */
 	SET(a, b, c, d,  0,  6, T49);
 	SET(d, a, b, c,  7, 10, T50);
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index 2feb3d6ab7e..fba8b1105f3 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -130,6 +130,9 @@
 
 /* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
  * Since we can't avoid including <windows.h>, better only include that */
+#define NOGDI
+#define NOMINMAX
+#define WIN32_LEAN_AND_MEAN
 #include <windows.h>
 
 #endif
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index 85d19b6a325..aa424045ece 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -41,21 +41,12 @@ static string cached_user_path = "";
 
 static boost::filesystem::path to_boost(const string& path)
 {
-#ifdef _MSC_VER
-	std::wstring path_utf16 = Strutil::utf8_to_utf16(path.c_str());
-	return boost::filesystem::path(path_utf16.c_str());
-#else
 	return boost::filesystem::path(path.c_str());
-#endif
 }
 
 static string from_boost(const boost::filesystem::path& path)
 {
-#ifdef _MSC_VER
-	return Strutil::utf16_to_utf8(path.wstring().c_str());
-#else
 	return path.string().c_str();
-#endif
 }
 
 void path_init(const string& path, const string& user_path)
@@ -259,14 +250,7 @@ string path_source_replace_includes(const string& source_, const string& path)
 
 FILE *path_fopen(const string& path, const string& mode)
 {
-#ifdef _WIN32
-	std::wstring path_utf16 = Strutil::utf8_to_utf16(path);
-	std::wstring mode_utf16 = Strutil::utf8_to_utf16(mode);
-
-	return _wfopen(path_utf16.c_str(), mode_utf16.c_str());
-#else
 	return fopen(path.c_str(), mode.c_str());
-#endif
 }
 
 void path_cache_clear_except(const string& name, const set<string>& except)
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index e721a3f5047..238fb976778 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -46,6 +46,8 @@ public:
 		update_cb = NULL;
 		cancel = false;
 		cancel_message = "";
+		error = false;
+		error_message = "";
 		cancel_cb = NULL;
 	}
 
@@ -79,6 +81,8 @@ public:
 		sync_substatus = "";
 		cancel = false;
 		cancel_message = "";
+		error = false;
+		error_message = "";
 	}
 
 	/* cancel */
@@ -108,6 +112,28 @@ public:
 		cancel_cb = function;
 	}
 
+	/* error */
+	void set_error(const string& error_message_)
+	{
+		thread_scoped_lock lock(progress_mutex);  /* held while messages and flags are updated */
+		error_message = error_message_;
+		error = true;
+		/* An error also cancels the render; the error text doubles as the cancel message. */
+		cancel_message = error_message_;
+		cancel = true;
+	}
+
+	bool get_error()
+	{
+		return error;  /* plain read of the volatile flag; progress_mutex is not taken here */
+	}
+
+	string get_error_message()
+	{
+		thread_scoped_lock lock(progress_mutex);  /* set_error() writes the string under this mutex */
+		return error_message;  /* returned by value */
+	}
+
 	/* tile and timing information */
 
 	void set_start_time(double start_time_)
@@ -259,6 +285,9 @@ protected:
 
 	volatile bool cancel;
 	string cancel_message;
+
+	volatile bool error;
+	string error_message;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h
index f4236cc616e..5e452ea03b4 100644
--- a/intern/cycles/util/util_ssef.h
+++ b/intern/cycles/util/util_ssef.h
@@ -151,7 +151,7 @@ __forceinline ssef maxi(const ssef& a, const ssef& b) {
 /// Ternary Operators
 ////////////////////////////////////////////////////////////////////////////////
 
-#if defined(__KERNEL_AVX2__)
+#if defined(__KERNEL_AVX2__) && !defined(_MSC_VER) // see T41066
 __forceinline const ssef madd (const ssef& a, const ssef& b, const ssef& c) { return _mm_fmadd_ps(a,b,c); }
 __forceinline const ssef msub (const ssef& a, const ssef& b, const ssef& c) { return _mm_fmsub_ps(a,b,c); }
 __forceinline const ssef nmadd(const ssef& a, const ssef& b, const ssef& c) { return _mm_fnmadd_ps(a,b,c); }
diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h
index 8758b823084..fe6c162366e 100644
--- a/intern/cycles/util/util_stats.h
+++ b/intern/cycles/util/util_stats.h
@@ -17,6 +17,8 @@
 #ifndef __UTIL_STATS_H__
 #define __UTIL_STATS_H__
 
+#include "util_atomic.h"
+
 CCL_NAMESPACE_BEGIN
 
 class Stats {
@@ -24,14 +26,13 @@ public:
 	Stats() : mem_used(0), mem_peak(0) {}
 
 	void mem_alloc(size_t size) {
-		mem_used += size;
-		if(mem_used > mem_peak)
-			mem_peak = mem_used;
+		atomic_add_z(&mem_used, size);
+		atomic_update_max_z(&mem_peak, mem_used);
 	}
 
 	void mem_free(size_t size) {
 		assert(mem_used >= size);
-		mem_used -= size;
+		atomic_sub_z(&mem_used, size);
 	}
 
 	size_t mem_used;
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 2a199e591bf..ce84200d0b6 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -33,11 +33,7 @@
 
 #ifndef __KERNEL_GPU__
 
-#  ifdef NDEBUG
-#    define ccl_device static inline
-#  else
-#    define ccl_device static
-#  endif
+#define ccl_device static inline
 #define ccl_device_noinline static
 #define ccl_global
 #define ccl_constant
@@ -53,11 +49,7 @@
 #define ccl_try_align(...) /* not support for function arguments (error C2719) */
 #endif
 #define ccl_may_alias
-#  ifdef NDEBUG
-#    define ccl_always_inline __forceinline
-#  else
-#    define ccl_always_inline
-#  endif
+#define ccl_always_inline __forceinline
 #define ccl_maybe_unused
 
 #else
diff --git a/intern/cycles/util/util_vector.h b/intern/cycles/util/util_vector.h
index cc6e8a371ed..3d885691c92 100644
--- a/intern/cycles/util/util_vector.h
+++ b/intern/cycles/util/util_vector.h
@@ -107,9 +107,6 @@ public:
 		if(datasize > 0) {
 			data = (T*)malloc_aligned(sizeof(T)*datasize, alignment);
 			memcpy(data, &from[0], datasize*sizeof(T));
-			free_aligned(data);
-			data = (T*)malloc_aligned(sizeof(T)*datasize, alignment);
-			memcpy(data, &from[0], datasize*sizeof(T));
 		}
 
 		return *this;