134 files changed, 7194 insertions, 3542 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 3b410b2a1e1..97854a88e84 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -146,6 +146,14 @@ if(WITH_CYCLES_OSL)
 	)
 endif()
 
+if(WITH_CYCLES_OPENSUBDIV)
+	add_definitions(-DWITH_OPENSUBDIV)
+	include_directories(
+		SYSTEM
+		${OPENSUBDIV_INCLUDE_DIR}
+	)
+endif()
+
 set(WITH_CYCLES_DEVICE_OPENCL TRUE)
 set(WITH_CYCLES_DEVICE_CUDA TRUE)
 set(WITH_CYCLES_DEVICE_MULTI TRUE)
diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp
index 3aca46e2dc7..3d3aca33881 100644
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -35,7 +35,6 @@
 #include "shader.h"
 #include "scene.h"
 
-#include "subd_mesh.h"
 #include "subd_patch.h"
 #include "subd_split.h"
 
@@ -417,6 +416,7 @@ static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node)
 	xml_read_int_array(verts, node, "verts");
 	xml_read_int_array(nverts, node, "nverts");
 
+#if 0
 	if(xml_equal_string(node, "subdivision", "catmull-clark")) {
 		/* create subd mesh */
 		SubdMesh sdmesh;
@@ -460,7 +460,9 @@ static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node)
 		DiagSplit dsplit(sdparams);
 		sdmesh.tessellate(&dsplit);
 	}
-	else {
+	else
+#endif
+	{
 		/* create vertices */
 		mesh->verts = P;
 
@@ -568,7 +570,7 @@ static void xml_read_patch(const XMLReadState& state, pugi::xml_node node)
 		mesh->used_shaders.push_back(state.shader);
 
 		/* split */
-		SubdParams sdparams(mesh, 0, state.smooth);
+		SubdParams sdparams(mesh);
 		xml_read_float(&sdparams.dicing_rate, node, "dicing_rate");
 
 		DiagSplit dsplit(sdparams);
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index d4b7535b9ee..2c5365c9189 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -82,7 +82,6 @@ def init():
     import bpy
     import _cycles
     import os.path
-    import sys
 
     # Workaround possibly buggy legacy drivers which crashes on the OpenCL
     # device enumeration.
@@ -103,10 +102,12 @@ def init():
     _cycles.init(path, user_path, bpy.app.background)
     _parse_command_line()
 
+
 def exit():
     import _cycles
     _cycles.exit()
 
+
 def create(engine, data, scene, region=None, v3d=None, rv3d=None, preview_osl=False):
     import bpy
     import _cycles
diff --git a/intern/cycles/blender/addon/osl.py b/intern/cycles/blender/addon/osl.py
index f4aaaab5eab..19f2ecc9d1a 100644
--- a/intern/cycles/blender/addon/osl.py
+++ b/intern/cycles/blender/addon/osl.py
@@ -41,6 +41,8 @@ def update_script_node(node, report):
     import shutil
     import tempfile
 
+    oso_file_remove = False
+
     if node.mode == 'EXTERNAL':
         # compile external script file
         script_path = bpy.path.abspath(node.filepath, library=node.id_data.library)
@@ -49,7 +51,6 @@ def update_script_node(node, report):
         if script_ext == ".oso":
             # it's a .oso file, no need to compile
             ok, oso_path = True, script_path
-            oso_file_remove = False
         elif script_ext == ".osl":
             # compile .osl file
             ok, oso_path = osl_compile(script_path, report)
@@ -65,7 +66,6 @@ def update_script_node(node, report):
         elif os.path.dirname(node.filepath) == "":
             # module in search path
             oso_path = node.filepath
-            oso_file_remove = False
             ok = True
         else:
             # unknown
@@ -88,12 +88,10 @@ def update_script_node(node, report):
             osl_file.close()
 
             ok, oso_path = osl_compile(osl_file.name, report)
-            oso_file_remove = False
             os.remove(osl_file.name)
         else:
             # compile text datablock from disk directly
             ok, oso_path = osl_compile(osl_path, report)
-            oso_file_remove = False
 
         if ok:
             # read bytecode
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 0b3dd552f62..8e82eac2b59 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -46,12 +46,6 @@ enum_displacement_methods = (
     ('BOTH', "Both", "Combination of displacement and bump mapping"),
     )
 
-enum_subdivision_types = (
-    ('NONE', "None", "No subdivision"),
-    ('LINEAR', "Linear", "Use linear subdivision"),
-    ('CATMULL_CLARK', "Catmull–Clark", "Use Catmull-Clark subdivision"),
-    )
-
 enum_bvh_types = (
     ('DYNAMIC_BVH', "Dynamic BVH", "Objects can be individually updated, at the cost of slower render time"),
     ('STATIC_BVH', "Static BVH", "Any object modification requires a complete BVH rebuild, but renders faster"),
@@ -781,6 +775,13 @@ class CyclesMaterialSettings(bpy.types.PropertyGroup):
                 default='LINEAR',
                 )
 
+        cls.displacement_method = EnumProperty(
+                name="Displacement Method",
+                description="Method to use for the displacement",
+                items=enum_displacement_methods,
+                default='BUMP',
+                )
+
     @classmethod
     def unregister(cls):
         del bpy.types.Material.cycles
@@ -958,25 +959,6 @@ class CyclesMeshSettings(bpy.types.PropertyGroup):
                 type=cls,
                 )
 
-        cls.displacement_method = EnumProperty(
-                name="Displacement Method",
-                description="Method to use for the displacement",
-                items=enum_displacement_methods,
-                default='BUMP',
-                )
-        cls.subdivision_type = EnumProperty(
-                name="Subdivision Type",
-                description="Type of subdivision to use",
-                items=enum_subdivision_types,
-                default='NONE',
-                )
-        cls.dicing_rate = FloatProperty(
-                name="Dicing Rate",
-                description="Multiplier for scene dicing rate",
-                min=0.1, max=1000.0,
-                default=1.0,
-                )
-
     @classmethod
     def unregister(cls):
         del bpy.types.Mesh.cycles
@@ -984,11 +966,9 @@ class CyclesMeshSettings(bpy.types.PropertyGroup):
         del bpy.types.MetaBall.cycles
 
 
-class CyclesObjectBlurSettings(bpy.types.PropertyGroup):
-
+class CyclesObjectSettings(bpy.types.PropertyGroup):
     @classmethod
     def register(cls):
-
         bpy.types.Object.cycles = PointerProperty(
                 name="Cycles Object Settings",
                 description="Cycles object settings",
@@ -1020,6 +1000,19 @@ class CyclesObjectBlurSettings(bpy.types.PropertyGroup):
                 default=False,
                 )
 
+        cls.use_adaptive_subdivision = BoolProperty(
+                name="Use Adaptive Subdivision",
+                description="Use adaptive render time subdivision",
+                default=False,
+                )
+
+        cls.dicing_rate = FloatProperty(
+                name="Dicing Rate",
+                description="Multiplier for scene dicing rate",
+                min=0.1, max=1000.0,
+                default=1.0,
+                )
+
     @classmethod
     def unregister(cls):
         del bpy.types.Object.cycles
@@ -1136,6 +1129,7 @@ def register():
     bpy.utils.register_class(CyclesWorldSettings)
     bpy.utils.register_class(CyclesVisibilitySettings)
     bpy.utils.register_class(CyclesMeshSettings)
+    bpy.utils.register_class(CyclesObjectSettings)
     bpy.utils.register_class(CyclesCurveRenderSettings)
     bpy.utils.register_class(CyclesCurveSettings)
 
@@ -1147,6 +1141,7 @@ def unregister():
     bpy.utils.unregister_class(CyclesLampSettings)
     bpy.utils.unregister_class(CyclesWorldSettings)
     bpy.utils.unregister_class(CyclesMeshSettings)
+    bpy.utils.unregister_class(CyclesObjectSettings)
     bpy.utils.unregister_class(CyclesVisibilitySettings)
     bpy.utils.unregister_class(CyclesCurveRenderSettings)
     bpy.utils.unregister_class(CyclesCurveSettings)
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 6656beb4478..52872d2b83f 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -674,48 +674,6 @@ class Cycles_PT_context_material(CyclesButtonsPanel, Panel):
             split.separator()
 
 
-class Cycles_PT_mesh_displacement(CyclesButtonsPanel, Panel):
-    bl_label = "Displacement"
-    bl_context = "data"
-
-    @classmethod
-    def poll(cls, context):
-        if CyclesButtonsPanel.poll(context):
-            if context.mesh or context.curve or context.meta_ball:
-                if context.scene.cycles.feature_set == 'EXPERIMENTAL':
-                    return True
-
-        return False
-
-    def draw(self, context):
-        layout = self.layout
-
-        mesh = context.mesh
-        curve = context.curve
-        mball = context.meta_ball
-
-        if mesh:
-            cdata = mesh.cycles
-        elif curve:
-            cdata = curve.cycles
-        elif mball:
-            cdata = mball.cycles
-
-        split = layout.split()
-
-        col = split.column()
-        sub = col.column(align=True)
-        sub.label(text="Displacement:")
-        sub.prop(cdata, "displacement_method", text="")
-
-        col = split.column()
-        sub = col.column(align=True)
-        sub.label(text="Subdivision:")
-        sub.prop(cdata, "subdivision_type", text="")
-
-        if cdata.subdivision_type != 'NONE':
-            sub.prop(cdata, "dicing_rate")
-
 class CyclesObject_PT_motion_blur(CyclesButtonsPanel, Panel):
     bl_label = "Motion Blur"
     bl_context = "object"
@@ -895,7 +853,7 @@ class CyclesLamp_PT_lamp(CyclesButtonsPanel, Panel):
 
         lamp = context.lamp
         clamp = lamp.cycles
-        cscene = context.scene.cycles
+        # cscene = context.scene.cycles
 
         layout.prop(lamp, "type", expand=True)
 
@@ -1115,7 +1073,7 @@ class CyclesWorld_PT_settings(CyclesButtonsPanel, Panel):
 
         world = context.world
         cworld = world.cycles
-        cscene = context.scene.cycles
+        # cscene = context.scene.cycles
 
         split = layout.split()
 
@@ -1227,6 +1185,11 @@ class CyclesMaterial_PT_settings(CyclesButtonsPanel, Panel):
         col.prop(cmat, "sample_as_light", text="Multiple Importance")
         col.prop(cmat, "use_transparent_shadow")
 
+        if context.scene.cycles.feature_set == 'EXPERIMENTAL':
+            col.separator()
+            col.label(text="Displacement:")
+            col.prop(cmat, "displacement_method", text="")
+
         col = split.column()
         col.label(text="Volume:")
         sub = col.column()
diff --git a/intern/cycles/blender/addon/version_update.py b/intern/cycles/blender/addon/version_update.py
index 221b1437128..830723d6149 100644
--- a/intern/cycles/blender/addon/version_update.py
+++ b/intern/cycles/blender/addon/version_update.py
@@ -104,7 +104,6 @@ def vector_curve_node_remap(node):
     """
     Remap values of vector curve node from normalized to absolute values
     """
-    from mathutils import Vector
     if node.bl_idname == 'ShaderNodeVectorCurve':
         node.mapping.use_clip = False
         for curve in node.mapping.curves:
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index ec11a893b5a..5d63aaa1415 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -24,7 +24,6 @@
 #include "blender_session.h"
 #include "blender_util.h"
 
-#include "subd_mesh.h"
 #include "subd_patch.h"
 #include "subd_split.h"
 
@@ -335,44 +334,71 @@ static void attr_create_vertex_color(Scene *scene,
                                      Mesh *mesh,
                                      BL::Mesh& b_mesh,
                                      const vector<int>& nverts,
-                                     const vector<int>& face_flags)
+                                     const vector<int>& face_flags,
+                                     bool subdivision)
 {
-	BL::Mesh::tessface_vertex_colors_iterator l;
-	for(b_mesh.tessface_vertex_colors.begin(l); l != b_mesh.tessface_vertex_colors.end(); ++l) {
-		if(!mesh->need_attribute(scene, ustring(l->name().c_str())))
-			continue;
+	if(subdivision) {
+		BL::Mesh::vertex_colors_iterator l;
 
-		Attribute *attr = mesh->attributes.add(
-			ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CORNER_BYTE);
+		for(b_mesh.vertex_colors.begin(l); l != b_mesh.vertex_colors.end(); ++l) {
+			if(!mesh->need_attribute(scene, ustring(l->name().c_str())))
+				continue;
 
-		BL::MeshColorLayer::data_iterator c;
-		uchar4 *cdata = attr->data_uchar4();
-		size_t i = 0;
+			Attribute *attr = mesh->subd_attributes.add(ustring(l->name().c_str()),
+			                                            TypeDesc::TypeColor,
+			                                            ATTR_ELEMENT_CORNER_BYTE);
 
-		for(l->data.begin(c); c != l->data.end(); ++c, ++i) {
-			int tri_a[3], tri_b[3];
-			face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b);
+			BL::Mesh::polygons_iterator p;
+			uchar4 *cdata = attr->data_uchar4();
 
-			uchar4 colors[4];
-			colors[0] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color1())));
-			colors[1] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color2())));
-			colors[2] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color3())));
-			if(nverts[i] == 4) {
-				colors[3] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color4())));
+			for(b_mesh.polygons.begin(p); p != b_mesh.polygons.end(); ++p) {
+				int n = p->loop_total();
+				for(int i = 0; i < n; i++) {
+					float3 color = get_float3(l->data[p->loop_start() + i].color());
+					*(cdata++) = color_float_to_byte(color_srgb_to_scene_linear(color));
+				}
 			}
+		}
+	}
+	else {
+		BL::Mesh::tessface_vertex_colors_iterator l;
+		for(b_mesh.tessface_vertex_colors.begin(l); l != b_mesh.tessface_vertex_colors.end(); ++l) {
+			if(!mesh->need_attribute(scene, ustring(l->name().c_str())))
+				continue;
+
+			Attribute *attr = mesh->attributes.add(ustring(l->name().c_str()),
+			                                       TypeDesc::TypeColor,
+			                                       ATTR_ELEMENT_CORNER_BYTE);
+
+			BL::MeshColorLayer::data_iterator c;
+			uchar4 *cdata = attr->data_uchar4();
+			size_t i = 0;
+
+			for(l->data.begin(c); c != l->data.end(); ++c, ++i) {
+				int tri_a[3], tri_b[3];
+				face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b);
+
+				uchar4 colors[4];
+				colors[0] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color1())));
+				colors[1] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color2())));
+				colors[2] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color3())));
+				if(nverts[i] == 4) {
+					colors[3] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color4())));
+				}
 
-			cdata[0] = colors[tri_a[0]];
-			cdata[1] = colors[tri_a[1]];
-			cdata[2] = colors[tri_a[2]];
+				cdata[0] = colors[tri_a[0]];
+				cdata[1] = colors[tri_a[1]];
+				cdata[2] = colors[tri_a[2]];
 
-			if(nverts[i] == 4) {
-				cdata[3] = colors[tri_b[0]];
-				cdata[4] = colors[tri_b[1]];
-				cdata[5] = colors[tri_b[2]];
-				cdata += 6;
+				if(nverts[i] == 4) {
+					cdata[3] = colors[tri_b[0]];
+					cdata[4] = colors[tri_b[1]];
+					cdata[5] = colors[tri_b[2]];
+					cdata += 6;
+				}
+				else
+					cdata += 3;
 			}
-			else
-				cdata += 3;
 		}
 	}
 }
@@ -382,9 +408,45 @@ static void attr_create_uv_map(Scene *scene,
                                Mesh *mesh,
                                BL::Mesh& b_mesh,
                                const vector<int>& nverts,
-                               const vector<int>& face_flags)
+                               const vector<int>& face_flags,
+                               bool subdivision,
+                               bool subdivide_uvs)
 {
-	if(b_mesh.tessface_uv_textures.length() != 0) {
+	if(subdivision) {
+		BL::Mesh::uv_layers_iterator l;
+		int i = 0;
+
+		for(b_mesh.uv_layers.begin(l); l != b_mesh.uv_layers.end(); ++l, ++i) {
+			bool active_render = b_mesh.uv_textures[i].active_render();
+			AttributeStandard std = (active_render)? ATTR_STD_UV: ATTR_STD_NONE;
+			ustring name = ustring(l->name().c_str());
+
+			/* UV map */
+			if(mesh->need_attribute(scene, name) || mesh->need_attribute(scene, std)) {
+				Attribute *attr;
+
+				if(active_render)
+					attr = mesh->subd_attributes.add(std, name);
+				else
+					attr = mesh->subd_attributes.add(name, TypeDesc::TypePoint, ATTR_ELEMENT_CORNER);
+
+				if(subdivide_uvs) {
+					attr->flags |= ATTR_SUBDIVIDED;
+				}
+
+				BL::Mesh::polygons_iterator p;
+				float3 *fdata = attr->data_float3();
+
+				for(b_mesh.polygons.begin(p); p != b_mesh.polygons.end(); ++p) {
+					int n = p->loop_total();
+					for(int j = 0; j < n; j++) {
+						*(fdata++) = get_float3(l->data[p->loop_start() + j].uv());
+					}
+				}
+			}
+		}
+	}
+	else if(b_mesh.tessface_uv_textures.length() != 0) {
 		BL::Mesh::tessface_uv_textures_iterator l;
 
 		for(b_mesh.tessface_uv_textures.begin(l); l != b_mesh.tessface_uv_textures.end(); ++l) {
@@ -465,11 +527,13 @@ static void attr_create_uv_map(Scene *scene,
 /* Create vertex pointiness attributes. */
 static void attr_create_pointiness(Scene *scene,
                                    Mesh *mesh,
-                                   BL::Mesh& b_mesh)
+                                   BL::Mesh& b_mesh,
+                                   bool subdivision)
 {
 	if(mesh->need_attribute(scene, ATTR_STD_POINTINESS)) {
 		const int numverts = b_mesh.vertices.length();
-		Attribute *attr = mesh->attributes.add(ATTR_STD_POINTINESS);
+		AttributeSet& attributes = (subdivision)? mesh->subd_attributes: mesh->attributes;
+		Attribute *attr = attributes.add(ATTR_STD_POINTINESS);
 		float *data = attr->data_float();
 		int *counter = new int[numverts];
 		float *raw_data = new float[numverts];
@@ -532,30 +596,45 @@ static void attr_create_pointiness(Scene *scene,
 static void create_mesh(Scene *scene,
                         Mesh *mesh,
                         BL::Mesh& b_mesh,
-                        const vector<Shader*>& used_shaders)
+                        const vector<Shader*>& used_shaders,
+                        bool subdivision=false,
+                        bool subdivide_uvs=true)
 {
 	/* count vertices and faces */
 	int numverts = b_mesh.vertices.length();
-	int numfaces = b_mesh.tessfaces.length();
+	int numfaces = (!subdivision) ? b_mesh.tessfaces.length() : b_mesh.polygons.length();
 	int numtris = 0;
+	int numcorners = 0;
+	int numngons = 0;
 	bool use_loop_normals = b_mesh.use_auto_smooth();
 
 	BL::Mesh::vertices_iterator v;
 	BL::Mesh::tessfaces_iterator f;
+	BL::Mesh::polygons_iterator p;
 
-	for(b_mesh.tessfaces.begin(f); f != b_mesh.tessfaces.end(); ++f) {
-		int4 vi = get_int4(f->vertices_raw());
-		numtris += (vi[3] == 0)? 1: 2;
+	if(!subdivision) {
+		for(b_mesh.tessfaces.begin(f); f != b_mesh.tessfaces.end(); ++f) {
+			int4 vi = get_int4(f->vertices_raw());
+			numtris += (vi[3] == 0)? 1: 2;
+		}
+	}
+	else {
+		for(b_mesh.polygons.begin(p); p != b_mesh.polygons.end(); ++p) {
+			numngons += (p->loop_total() == 4)? 0: 1;
+			numcorners += p->loop_total();
+		}
 	}
 
 	/* allocate memory */
 	mesh->reserve_mesh(numverts, numtris);
+	mesh->reserve_subd_faces(numfaces, numngons, numcorners);
 
 	/* create vertex coordinates and normals */
 	for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end(); ++v)
 		mesh->add_vertex(get_float3(v->co()));
 
-	Attribute *attr_N = mesh->attributes.add(ATTR_STD_VERTEX_NORMAL);
+	AttributeSet& attributes = (subdivision)? mesh->subd_attributes: mesh->attributes;
+	Attribute *attr_N = attributes.add(ATTR_STD_VERTEX_NORMAL);
 	float3 *N = attr_N->data_float3();
 
 	for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end(); ++v, ++N)
@@ -564,7 +643,8 @@ static void create_mesh(Scene *scene,
 
 	/* create generated coordinates from undeformed coordinates */
 	if(mesh->need_attribute(scene, ATTR_STD_GENERATED)) {
-		Attribute *attr = mesh->attributes.add(ATTR_STD_GENERATED);
+		Attribute *attr = attributes.add(ATTR_STD_GENERATED);
+		attr->flags |= ATTR_SUBDIVIDED;
 
 		float3 loc, size;
 		mesh_texture_space(b_mesh, loc, size);
@@ -577,67 +657,103 @@ static void create_mesh(Scene *scene,
 	}
 
 	/* Create needed vertex attributes. */
-	attr_create_pointiness(scene, mesh, b_mesh);
+	attr_create_pointiness(scene, mesh, b_mesh, subdivision);
 
 	/* create faces */
 	vector<int> nverts(numfaces);
 	vector<int> face_flags(numfaces, FACE_FLAG_NONE);
 	int fi = 0;
 
-	for(b_mesh.tessfaces.begin(f); f != b_mesh.tessfaces.end(); ++f, ++fi) {
-		int4 vi = get_int4(f->vertices_raw());
-		int n = (vi[3] == 0)? 3: 4;
-		int shader = clamp(f->material_index(), 0, used_shaders.size()-1);
-		bool smooth = f->use_smooth() || use_loop_normals;
-
-		/* split vertices if normal is different
-		 *
-		 * note all vertex attributes must have been set here so we can split
-		 * and copy attributes in split_vertex without remapping later */
-		if(use_loop_normals) {
-			BL::Array<float, 12> loop_normals = f->split_normals();
-
-			for(int i = 0; i < n; i++) {
-				float3 loop_N = make_float3(loop_normals[i * 3], loop_normals[i * 3 + 1], loop_normals[i * 3 + 2]);
-
-				if(N[vi[i]] != loop_N) {
-					int new_vi = mesh->split_vertex(vi[i]);
-
-					/* set new normal and vertex index */
-					N = attr_N->data_float3();
-					N[new_vi] = loop_N;
-					vi[i] = new_vi;
+	if(!subdivision) {
+		for(b_mesh.tessfaces.begin(f); f != b_mesh.tessfaces.end(); ++f, ++fi) {
+			int4 vi = get_int4(f->vertices_raw());
+			int n = (vi[3] == 0)? 3: 4;
+			int shader = clamp(f->material_index(), 0, used_shaders.size()-1);
+			bool smooth = f->use_smooth() || use_loop_normals;
+
+			/* split vertices if normal is different
+			 *
+			 * note all vertex attributes must have been set here so we can split
+			 * and copy attributes in split_vertex without remapping later */
+			if(use_loop_normals) {
+				BL::Array<float, 12> loop_normals = f->split_normals();
+
+				for(int i = 0; i < n; i++) {
+					float3 loop_N = make_float3(loop_normals[i * 3], loop_normals[i * 3 + 1], loop_normals[i * 3 + 2]);
+
+					if(N[vi[i]] != loop_N) {
+						int new_vi = mesh->split_vertex(vi[i]);
+
+						/* set new normal and vertex index */
+						N = attr_N->data_float3();
+						N[new_vi] = loop_N;
+						vi[i] = new_vi;
+					}
 				}
 			}
-		}
 
-		/* create triangles */
-		if(n == 4) {
-			if(is_zero(cross(mesh->verts[vi[1]] - mesh->verts[vi[0]], mesh->verts[vi[2]] - mesh->verts[vi[0]])) ||
-			   is_zero(cross(mesh->verts[vi[2]] - mesh->verts[vi[0]], mesh->verts[vi[3]] - mesh->verts[vi[0]])))
-			{
-				// TODO(mai): order here is probably wrong
-				mesh->add_triangle(vi[0], vi[1], vi[3], shader, smooth, true);
-				mesh->add_triangle(vi[2], vi[3], vi[1], shader, smooth, true);
-				face_flags[fi] |= FACE_FLAG_DIVIDE_24;
+			/* create triangles */
+			if(n == 4) {
+				if(is_zero(cross(mesh->verts[vi[1]] - mesh->verts[vi[0]], mesh->verts[vi[2]] - mesh->verts[vi[0]])) ||
+				   is_zero(cross(mesh->verts[vi[2]] - mesh->verts[vi[0]], mesh->verts[vi[3]] - mesh->verts[vi[0]])))
+				{
+					mesh->add_triangle(vi[0], vi[1], vi[3], shader, smooth);
+					mesh->add_triangle(vi[2], vi[3], vi[1], shader, smooth);
+					face_flags[fi] |= FACE_FLAG_DIVIDE_24;
+				}
+				else {
+					mesh->add_triangle(vi[0], vi[1], vi[2], shader, smooth);
+					mesh->add_triangle(vi[0], vi[2], vi[3], shader, smooth);
+					face_flags[fi] |= FACE_FLAG_DIVIDE_13;
+				}
 			}
 			else {
-				mesh->add_triangle(vi[0], vi[1], vi[2], shader, smooth, true);
-				mesh->add_triangle(vi[0], vi[2], vi[3], shader, smooth, true);
-				face_flags[fi] |= FACE_FLAG_DIVIDE_13;
+				mesh->add_triangle(vi[0], vi[1], vi[2], shader, smooth);
 			}
+
+			nverts[fi] = n;
 		}
-		else
-			mesh->add_triangle(vi[0], vi[1], vi[2], shader, smooth, false);
+	}
+	else {
+		vector<int> vi;
+
+		for(b_mesh.polygons.begin(p); p != b_mesh.polygons.end(); ++p) {
+			int n = p->loop_total();
+			int shader = clamp(p->material_index(), 0, used_shaders.size()-1);
+			bool smooth = p->use_smooth() || use_loop_normals;
 
-		nverts[fi] = n;
+			vi.resize(n); /* resize, not reserve: elements are assigned by index below */
+			for(int i = 0; i < n; i++) {
+				vi[i] = b_mesh.loops[p->loop_start() + i].vertex_index();
+
+				/* split vertices if normal is different
+				 *
+				 * note all vertex attributes must have been set here so we can split
+				 * and copy attributes in split_vertex without remapping later */
+				if(use_loop_normals) {
+					float3 loop_N = get_float3(b_mesh.loops[p->loop_start() + i].normal());
+
+					if(N[vi[i]] != loop_N) {
+						int new_vi = mesh->split_vertex(vi[i]);
+
+						/* refetch N after the split, then set new normal and vertex index */
+						N = attr_N->data_float3();
+						N[new_vi] = loop_N;
+						vi[i] = new_vi;
+					}
+				}
+			}
+
+			/* create subd faces */
+			mesh->add_subd_face(&vi[0], n, shader, smooth);
+		}
 	}
 
 	/* Create all needed attributes.
 	 * The calculate functions will check whether they're needed or not.
 	 */
-	attr_create_vertex_color(scene, mesh, b_mesh, nverts, face_flags);
-	attr_create_uv_map(scene, mesh, b_mesh, nverts, face_flags);
+	attr_create_vertex_color(scene, mesh, b_mesh, nverts, face_flags, subdivision);
+	attr_create_uv_map(scene, mesh, b_mesh, nverts, face_flags, subdivision, subdivide_uvs);
 
 	/* for volume objects, create a matrix to transform from object space to
 	 * mesh texture space. this does not work with deformations but that can
@@ -657,16 +773,44 @@ static void create_subd_mesh(Scene *scene,
                              Mesh *mesh,
                              BL::Object& b_ob,
                              BL::Mesh& b_mesh,
-                             PointerRNA *cmesh,
                              const vector<Shader*>& used_shaders,
                              float dicing_rate,
                              int max_subdivisions)
 {
-	Mesh basemesh;
-	create_mesh(scene, &basemesh, b_mesh, used_shaders);
+	BL::SubsurfModifier subsurf_mod(b_ob.modifiers[b_ob.modifiers.length()-1]);
+	bool subdivide_uvs = subsurf_mod.use_subsurf_uv();
+
+	create_mesh(scene, mesh, b_mesh, used_shaders, true, subdivide_uvs);
+
+	/* export creases: first count the edges that have a non-zero crease */
+	size_t num_creases = 0;
+	BL::Mesh::edges_iterator e;
+
+	for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e) {
+		if(e->crease() != 0.0f) {
+			num_creases++;
+		}
+	}
+
+	mesh->subd_creases.resize(num_creases);
 
-	SubdParams sdparams(mesh, 0, true, false);
-	sdparams.dicing_rate = max(0.1f, RNA_float_get(cmesh, "dicing_rate") * dicing_rate);
+	Mesh::SubdEdgeCrease* crease = mesh->subd_creases.data();
+	for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e) {
+		if(e->crease() != 0.0f) {
+			crease->v[0] = e->vertices()[0];
+			crease->v[1] = e->vertices()[1];
+			crease->crease = e->crease();
+
+			crease++;
+		}
+	}
+
+	/* set subd params */
+	SubdParams sdparams(mesh);
+
+	PointerRNA cobj = RNA_pointer_get(&b_ob.ptr, "cycles");
+
+	sdparams.dicing_rate = max(0.1f, RNA_float_get(&cobj, "dicing_rate") * dicing_rate);
 	sdparams.max_level = max_subdivisions;
 
 	scene->camera->update();
@@ -675,7 +819,7 @@ static void create_subd_mesh(Scene *scene,
 
 	/* tesselate */
 	DiagSplit dsplit(sdparams);
-	basemesh.tessellate(&dsplit);
+	mesh->tessellate(&dsplit);
 }
 
 /* Sync */
@@ -793,8 +937,6 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob,
 	mesh_synced.insert(mesh);
 
 	/* create derived mesh */
-	PointerRNA cmesh = RNA_pointer_get(&b_ob_data.ptr, "cycles");
-
 	array<int> oldtriangle = mesh->triangles;
 	
 	/* compares curve_keys rather than strands in order to handle quick hair
@@ -817,20 +959,41 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob,
 			b_ob.update_from_editmode();
 
 		bool need_undeformed = mesh->need_attribute(scene, ATTR_STD_GENERATED);
-		BL::Mesh b_mesh = object_to_mesh(b_data, b_ob, b_scene, true, !preview, need_undeformed);
+
+		mesh->subdivision_type = Mesh::SUBDIVISION_NONE;
+
+		PointerRNA cobj = RNA_pointer_get(&b_ob.ptr, "cycles");
+
+		if(cobj.data && b_ob.modifiers.length() > 0 && experimental) {
+			BL::Modifier mod = b_ob.modifiers[b_ob.modifiers.length()-1]; /* only the last modifier is considered */
+			bool enabled = preview ? mod.show_viewport() : mod.show_render();
+			/* use_adaptive_subdivision is a boolean property, so read it with the boolean accessor */
+			if(enabled && mod.type() == BL::Modifier::type_SUBSURF && RNA_boolean_get(&cobj, "use_adaptive_subdivision")) {
+				BL::SubsurfModifier subsurf(mod);
+
+				if(subsurf.subdivision_type() == BL::SubsurfModifier::subdivision_type_CATMULL_CLARK) {
+					mesh->subdivision_type = Mesh::SUBDIVISION_CATMULL_CLARK;
+				}
+				else {
+					mesh->subdivision_type = Mesh::SUBDIVISION_LINEAR;
+				}
+			}
+		}
+
+		BL::Mesh b_mesh = object_to_mesh(b_data, b_ob, b_scene, true, !preview, need_undeformed, mesh->subdivision_type);
 
 		if(b_mesh) {
 			if(render_layer.use_surfaces && !hide_tris) {
-				if(cmesh.data && experimental && RNA_enum_get(&cmesh, "subdivision_type"))
-					create_subd_mesh(scene, mesh, b_ob, b_mesh, &cmesh, used_shaders,
+				if(mesh->subdivision_type != Mesh::SUBDIVISION_NONE)
+					create_subd_mesh(scene, mesh, b_ob, b_mesh, used_shaders,
 					                 dicing_rate, max_subdivisions);
 				else
-					create_mesh(scene, mesh, b_mesh, used_shaders);
+					create_mesh(scene, mesh, b_mesh, used_shaders, false);
 
 				create_mesh_volume_attributes(scene, b_ob, mesh, b_scene.frame_current());
 			}
 
-			if(render_layer.use_hair)
+			if(render_layer.use_hair && mesh->subdivision_type == Mesh::SUBDIVISION_NONE)
 				sync_curves(mesh, b_mesh, b_ob, false);
 
 			if(can_free_caches) {
@@ -843,21 +1006,6 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob,
 	}
 	mesh->geometry_flags = requested_geometry_flags;
 
-	/* displacement method */
-	if(cmesh.data) {
-		const int method = get_enum(cmesh,
-		                            "displacement_method",
-		                            Mesh::DISPLACE_NUM_METHODS,
-		                            Mesh::DISPLACE_BUMP);
-
-		if(method == 0 || !experimental)
-			mesh->displacement_method = Mesh::DISPLACE_BUMP;
-		else if(method == 1)
-			mesh->displacement_method = Mesh::DISPLACE_TRUE;
-		else
-			mesh->displacement_method = Mesh::DISPLACE_BOTH;
-	}
-
 	/* fluid motion */
 	sync_mesh_fluid_motion(b_ob, scene, mesh);
 
@@ -957,7 +1105,7 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,
 
 	if(ccl::BKE_object_is_deform_modified(b_ob, b_scene, preview)) {
 		/* get derived mesh */
-		b_mesh = object_to_mesh(b_data, b_ob, b_scene, true, !preview, false);
+		b_mesh = object_to_mesh(b_data, b_ob, b_scene, true, !preview, false, false);
 	}
 
 	if(!b_mesh) {
diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp
index 22a0b3988c8..f305e8e17cc 100644
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -253,8 +253,18 @@ static bool object_boundbox_clip(Scene *scene,
 		                       boundbox[3 * i + 1],
 		                       boundbox[3 * i + 2]);
 		p = transform_point(&tfm, p);
-		p = transform_perspective(&worldtondc, p);
-		if(p.z >= -margin) {
+
+		float4 b = make_float4(p.x, p.y, p.z, 1.0f);
+		float4 c = make_float4(dot(worldtondc.x, b),
+		                       dot(worldtondc.y, b),
+		                       dot(worldtondc.z, b),
+		                       dot(worldtondc.w, b));
+		p = float4_to_float3(c / c.w);
+		if(c.z < 0.0f) {
+			p.x = 1.0f - p.x;
+			p.y = 1.0f - p.y;
+		}
+		if(c.z >= -margin) {
 			all_behind = false;
 		}
 		bb_min = min(bb_min, p);
@@ -319,16 +329,18 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
 			/* object transformation */
 			if(tfm != object->tfm) {
 				VLOG(1) << "Object " << b_ob.name() << " motion detected.";
-				if(motion_time == -1.0f) {
-					object->motion.pre = tfm;
-					object->use_motion = true;
-				}
-				else if(motion_time == 1.0f) {
-					object->motion.post = tfm;
+				if(motion_time == -1.0f || motion_time == 1.0f) {
 					object->use_motion = true;
 				}
 			}
 
+			if(motion_time == -1.0f) {
+				object->motion.pre = tfm;
+			}
+			else if(motion_time == 1.0f) {
+				object->motion.post = tfm;
+			}
+
 			/* mesh deformation */
 			if(object->mesh)
 				sync_mesh_motion(b_ob, object, motion_time);
@@ -385,8 +397,8 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
 		object->name = b_ob.name().c_str();
 		object->pass_id = b_ob.pass_index();
 		object->tfm = tfm;
-		object->motion.pre = tfm;
-		object->motion.post = tfm;
+		object->motion.pre = transform_empty();
+		object->motion.post = transform_empty();
 		object->use_motion = false;
 
 		/* motion blur */
diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp
index 64559804ccb..534bc6cc897 100644
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -64,6 +64,14 @@ static VolumeInterpolation get_volume_interpolation(PointerRNA& ptr)
 	                                     VOLUME_INTERPOLATION_LINEAR);
 }
 
+/* Displacement method stored in the "displacement_method" enum property of ptr. */
+static DisplacementMethod get_displacement_method(PointerRNA& ptr)
+{
+	const int method = get_enum(ptr, "displacement_method",
+	                            DISPLACE_NUM_METHODS, DISPLACE_BUMP);
+	return (DisplacementMethod)method;
+}
+
 static int validate_enum_value(int value, int num_values, int default_value)
 {
 	if(value >= num_values) {
@@ -837,8 +845,10 @@ static ShaderNode *add_node(Scene *scene,
 		}
 	}
 
-	if(node)
+	if(node) {
+		node->name = b_node.name();
 		graph->add(node);
+	}
 
 	return node;
 }
@@ -1180,6 +1190,7 @@ void BlenderSync::sync_materials(bool update_all)
 			shader->heterogeneous_volume = !get_boolean(cmat, "homogeneous_volume");
 			shader->volume_sampling_method = get_volume_sampling(cmat);
 			shader->volume_interpolation_method = get_volume_interpolation(cmat);
+			shader->displacement_method = (experimental) ? get_displacement_method(cmat) : DISPLACE_BUMP;
 
 			shader->set_graph(graph);
 			shader->tag_update(scene);
diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h
index 188d23d0c59..d5dbaba094b 100644
--- a/intern/cycles/blender/blender_util.h
+++ b/intern/cycles/blender/blender_util.h
@@ -45,14 +45,39 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data,
                                       BL::Scene& scene,
                                       bool apply_modifiers,
                                       bool render,
-                                      bool calc_undeformed)
+                                      bool calc_undeformed,
+                                      bool subdivision)
 {
+	bool subsurf_mod_show_render;
+	bool subsurf_mod_show_viewport;
+
+	if(subdivision) {
+		/* Hide the last modifier (presumably the subsurf one) and keep its flags for the restore below. */
+		BL::Modifier subsurf_mod = object.modifiers[object.modifiers.length()-1];
+
+		subsurf_mod_show_render = subsurf_mod.show_render();
+		subsurf_mod_show_viewport = subsurf_mod.show_viewport();
+
+		subsurf_mod.show_render(false);
+		subsurf_mod.show_viewport(false);
+	}
+
 	BL::Mesh me = data.meshes.new_from_object(scene, object, apply_modifiers, (render)? 2: 1, false, calc_undeformed);
+
+	if(subdivision) {
+		BL::Modifier subsurf_mod = object.modifiers[object.modifiers.length()-1];
+
+		subsurf_mod.show_render(subsurf_mod_show_render);
+		subsurf_mod.show_viewport(subsurf_mod_show_viewport);
+	}
+
 	if((bool)me) {
 		if(me.use_auto_smooth()) {
 			me.calc_normals_split();
 		}
-		me.calc_tessface(true);
+		if(!subdivision) {
+			me.calc_tessface(true);
+		}
 	}
 	return me;
 }
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 2d404918a38..a85f34082db 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -249,121 +249,161 @@ public:
 		return DebugFlags().cuda.adaptive_compile;
 	}
 
+	/* Build the NVCC flags shared by every kernel build. They depend only on
+	 * the compiler and the compilation settings, not on the shading model or
+	 * on the md5 of the kernel sources. */
+	string compile_kernel_get_common_cflags(
+	        const DeviceRequestedFeatures& requested_features)
+	{
+		const int toolkit_version = cuewCompilerVersion();
+		const int cpu_bits = system_cpu_bits();
+		const string include_dir = path_get("kernel");
+		string flags = string_printf(
+		        "-m%d "
+		        "--ptxas-options=\"-v\" "
+		        "--use_fast_math "
+		        "-DNVCC "
+		        "-D__KERNEL_CUDA_VERSION__=%d "
+		        "-I\"%s\"",
+		        cpu_bits, toolkit_version, include_dir.c_str());
+		/* Feature-specific build options are only added for adaptive builds. */
+		if(use_adaptive_compilation()) {
+			flags += " " + requested_features.get_build_options();
+		}
+		/* Append any extra flags given through the environment. */
+		const char *env_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
+		if(env_cflags != NULL) {
+			flags += string(" ") + string(env_cflags);
+		}
+#ifdef WITH_CYCLES_DEBUG
+		flags += " -D__KERNEL_DEBUG__";
+#endif
+		return flags;
+	}
+
+	/* Check that nvcc is available and that its CUDA version is supported.
+	 * Returns false, after reporting the reason, when compilation can not work. */
+	bool compile_check_compiler() {
+		const char *nvcc = cuewCompilerPath();
+		if(nvcc == NULL) {
+			cuda_error_message("CUDA nvcc compiler not found. "
+			                   "Install CUDA toolkit in default location.");
+			return false;
+		}
+		const int cuda_version = cuewCompilerVersion();
+		VLOG(1) << "Found nvcc " << nvcc
+		        << ", CUDA version " << cuda_version << ".";
+		/* Version is encoded as major * 10 + minor, e.g. 75 for CUDA 7.5. */
+		const int major = cuda_version / 10, minor = cuda_version % 10;
+		if(cuda_version == 0) {
+			cuda_error_message("CUDA nvcc compiler version could not be parsed.");
+			return false;
+		}
+		if(cuda_version < 75) {
+			printf("Unsupported CUDA version %d.%d detected, "
+			       "you need CUDA 7.5 or newer.\n", major, minor);
+			return false;
+		}
+		else if(cuda_version != 75 && cuda_version != 80) {
+			printf("CUDA version %d.%d detected, build may succeed but only "
+			       "CUDA 7.5 and 8.0 are officially supported.\n", major, minor);
+		}
+		return true;
+	}
+
 	string compile_kernel(const DeviceRequestedFeatures& requested_features)
 	{
 		/* Compute cubin name. */
 		int major, minor;
 		cuDeviceComputeCapability(&major, &minor, cuDevId);
-		string cubin;
-
-		/* Adaptive Compile.
-		 * If enabled, always use that */
-		bool use_adaptive_compile = use_adaptive_compilation();
 
 		/* Attempt to use kernel provided with Blender. */
-		if(!use_adaptive_compile) {
-			cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
-			VLOG(1) << "Testing for pre-compiled kernel " << cubin;
+		if(!use_adaptive_compilation()) {
+			const string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin",
+			                                            major, minor));
+			VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
 			if(path_exists(cubin)) {
-				VLOG(1) << "Using precompiled kernel";
+				VLOG(1) << "Using precompiled kernel.";
 				return cubin;
 			}
 		}
 
+		const string common_cflags =
+		        compile_kernel_get_common_cflags(requested_features);
+
 		/* Try to use locally compiled kernel. */
-		string kernel_path = path_get("kernel");
-		string md5 = path_files_md5_hash(kernel_path);
-
-		string feature_build_options;
-		if(use_adaptive_compile) {
-			feature_build_options = requested_features.get_build_options();
-			string device_md5 = util_md5_string(feature_build_options);
-			cubin = string_printf("cycles_kernel_%s_sm%d%d_%s.cubin",
-		                          device_md5.c_str(),
-		                          major, minor,
-		                          md5.c_str());
-		}
-		else {
-			cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
-		}
+		const string kernel_path = path_get("kernel");
+		const string kernel_md5 = path_files_md5_hash(kernel_path);
+
+		/* Fold the cflags into the md5, so that switching the CUDA toolkit or
+		 * changing any other compiler argument causes the cubin to be re-built.
+		 */
+		const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
 
-		cubin = path_user_get(path_join("cache", cubin));
-		VLOG(1) << "Testing for locally compiled kernel " << cubin;
-		/* If exists already, use it. */
+		const string cubin_file = string_printf("cycles_kernel_sm%d%d_%s.cubin",
+		                                        major, minor,
+		                                        cubin_md5.c_str());
+		const string cubin = path_user_get(path_join("cache", cubin_file));
+		VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
 		if(path_exists(cubin)) {
-			VLOG(1) << "Using locally compiled kernel";
+			VLOG(1) << "Using locally compiled kernel.";
 			return cubin;
 		}
 
 #ifdef _WIN32
 		if(have_precompiled_kernels()) {
-			if(major < 2)
-				cuda_error_message(string_printf("CUDA device requires compute capability 2.0 or up, found %d.%d. Your GPU is not supported.", major, minor));
-			else
-				cuda_error_message(string_printf("CUDA binary kernel for this graphics card compute capability (%d.%d) not found.", major, minor));
+			if(major < 2) {
+				cuda_error_message(string_printf(
+				        "CUDA device requires compute capability 2.0 or up, "
+				        "found %d.%d. Your GPU is not supported.",
+				        major, minor));
+			}
+			else {
+				cuda_error_message(string_printf(
+				        "CUDA binary kernel for this graphics card compute "
+				        "capability (%d.%d) not found.",
+				        major, minor));
+			}
 			return "";
 		}
 #endif
 
-		/* If not, find CUDA compiler. */
-		const char *nvcc = cuewCompilerPath();
-
-		if(nvcc == NULL) {
-			cuda_error_message("CUDA nvcc compiler not found. Install CUDA toolkit in default location.");
-			return "";
-		}
-
-		int cuda_version = cuewCompilerVersion();
-		VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << cuda_version;
-
-		if(cuda_version == 0) {
-			cuda_error_message("CUDA nvcc compiler version could not be parsed.");
-			return "";
-		}
-		if(cuda_version < 60) {
-			printf("Unsupported CUDA version %d.%d detected, you need CUDA 7.5.\n", cuda_version/10, cuda_version%10);
+		/* Compile. */
+		if(!compile_check_compiler()) {
 			return "";
 		}
-		else if(cuda_version != 75)
-			printf("CUDA version %d.%d detected, build may succeed but only CUDA 7.5 is officially supported.\n", cuda_version/10, cuda_version%10);
-
-		/* Compile. */
-		string kernel = path_join(kernel_path, path_join("kernels", path_join("cuda", "kernel.cu")));
-		string include = kernel_path;
-		const int machine = system_cpu_bits();
-
+		const char *nvcc = cuewCompilerPath();
+		const string kernel = path_join(kernel_path,
+		                          path_join("kernels",
+		                                    path_join("cuda", "kernel.cu")));
 		double starttime = time_dt();
 		printf("Compiling CUDA kernel ...\n");
 
 		path_create_directories(cubin);
 
-		string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
-			"-o \"%s\" --ptxas-options=\"-v\" --use_fast_math -I\"%s\" "
-			"-DNVCC -D__KERNEL_CUDA_VERSION__=%d",
-			nvcc, major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version);
-
-		if(use_adaptive_compile)
-			command += " " + feature_build_options;
-
-		const char* extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
-		if(extra_cflags) {
-			command += string(" ") + string(extra_cflags);
-		}
-
-#ifdef WITH_CYCLES_DEBUG
-		command += " -D__KERNEL_DEBUG__";
-#endif
+		string command = string_printf("\"%s\" "
+		                               "-arch=sm_%d%d "
+		                               "--cubin \"%s\" "
+		                               "-o \"%s\" "
+		                               "%s ",
+		                               nvcc,
+		                               major, minor,
+		                               kernel.c_str(),
+		                               cubin.c_str(),
+		                               common_cflags.c_str());
 
 		printf("%s\n", command.c_str());
 
 		if(system(command.c_str()) == -1) {
-			cuda_error_message("Failed to execute compilation command, see console for details.");
+			cuda_error_message("Failed to execute compilation command, "
+			                   "see console for details.");
 			return "";
 		}
 
 		/* Verify if compilation succeeded */
 		if(!path_exists(cubin)) {
-			cuda_error_message("CUDA kernel compilation failed, see console for details.");
+			cuda_error_message("CUDA kernel compilation failed, "
+			                   "see console for details.");
 			return "";
 		}
 
@@ -964,11 +1004,11 @@ public:
 		if(!background) {
 			PixelMem pmem = pixel_mem_map[mem];
 			CUdeviceptr buffer;
-			
+
 			size_t bytes;
 			cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
 			cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
-			
+
 			return buffer;
 		}
 
@@ -1000,9 +1040,9 @@ public:
 				glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW);
 			else
 				glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW);
-			
+
 			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-			
+
 			glGenTextures(1, &pmem.cuTexId);
 			glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
 			if(mem.data_type == TYPE_HALF)
@@ -1012,7 +1052,7 @@ public:
 			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
 			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
 			glBindTexture(GL_TEXTURE_2D, 0);
-			
+
 			CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
 
 			if(result == CUDA_SUCCESS) {
@@ -1114,9 +1154,9 @@ public:
 			else
 				glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset);
 			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-			
+
 			glEnable(GL_TEXTURE_2D);
-			
+
 			if(transparent) {
 				glEnable(GL_BLEND);
 				glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
@@ -1181,7 +1221,7 @@ public:
 
 			if(transparent)
 				glDisable(GL_BLEND);
-			
+
 			glBindTexture(GL_TEXTURE_2D, 0);
 			glDisable(GL_TEXTURE_2D);
 
@@ -1197,12 +1237,12 @@ public:
 	{
 		if(task->type == DeviceTask::PATH_TRACE) {
 			RenderTile tile;
-			
+
 			bool branched = task->integrator_branched;
 
 			/* Upload Bindless Mapping */
 			load_bindless_mapping();
-			
+
 			/* keep rendering tiles until done */
 			while(task->acquire_tile(this, tile)) {
 				int start_sample = tile.start_sample;
@@ -1339,7 +1379,7 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 		fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result));
 		return;
 	}
-	
+
 	vector<DeviceInfo> display_devices;
 
 	for(int num = 0; num < count; num++) {
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 50490f3a20e..5c05aeb5569 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -875,6 +875,7 @@ public:
 
 		if(ciErr != CL_SUCCESS) {
 			opencl_error("OpenCL build failed: errors in console");
+			fprintf(stderr, "Build error: %s\n", clewErrorString(ciErr));
 			return false;
 		}
 
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index bd3969b2889..7bef247d3bd 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -87,6 +87,7 @@ set(SRC_KERNELS_CPU_HEADERS
 )
 
 set(SRC_CLOSURE_HEADERS
+	closure/alloc.h
 	closure/bsdf.h
 	closure/bsdf_ashikhmin_velvet.h
 	closure/bsdf_diffuse.h
@@ -140,6 +141,7 @@ set(SRC_SVM_HEADERS
 	svm/svm_noisetex.h
 	svm/svm_normal.h
 	svm/svm_ramp.h
+	svm/svm_ramp_util.h
 	svm/svm_sepcomb_hsv.h
 	svm/svm_sepcomb_vector.h
 	svm/svm_sky.h
@@ -160,7 +162,9 @@ set(SRC_GEOM_HEADERS
 	geom/geom_motion_curve.h
 	geom/geom_motion_triangle.h
 	geom/geom_object.h
+	geom/geom_patch.h
 	geom/geom_primitive.h
+	geom/geom_subd_triangle.h
 	geom/geom_triangle.h
 	geom/geom_triangle_intersect.h
 	geom/geom_volume.h
@@ -241,12 +245,20 @@ if(WITH_CYCLES_CUDA_BINARIES)
 			set(cuda_debug_flags "")
 		endif()
 
-		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}")
+		set(cuda_nvcc_command ${CUDA_NVCC_EXECUTABLE})
+		set(cuda_nvcc_version ${CUDA_VERSION})
+
+		if(DEFINED CUDA_NVCC8_EXECUTABLE  AND ((${arch} STREQUAL "sm_60") OR (${arch} STREQUAL "sm_61")))
+			set(cuda_nvcc_command ${CUDA_NVCC8_EXECUTABLE})
+			set(cuda_nvcc_version "80")
+		endif()
+
+		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
 		set(cuda_math_flags "--use_fast_math")
 
 		add_custom_command(
 			OUTPUT ${cuda_cubin}
-			COMMAND ${CUDA_NVCC_EXECUTABLE}
+			COMMAND ${cuda_nvcc_command}
 					-arch=${arch}
 					${CUDA_NVCC_FLAGS}
 					-m${CUDA_BITS}
@@ -263,7 +275,6 @@ if(WITH_CYCLES_CUDA_BINARIES)
 					-DCCL_NAMESPACE_BEGIN=
 					-DCCL_NAMESPACE_END=
 					-DNVCC
-
 			DEPENDS ${cuda_sources})
 
 		delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
@@ -271,6 +282,9 @@ if(WITH_CYCLES_CUDA_BINARIES)
 
 		unset(cuda_extra_flags)
 		unset(cuda_debug_flags)
+
+		unset(cuda_nvcc_command)
+		unset(cuda_nvcc_version)
 	endmacro()
 
 	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index b27afaa9869..e9eeff31ecc 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -37,11 +37,16 @@
  *
  */
 
-ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            Intersection *isect_array,
-                                            const uint max_hits,
-                                            uint *num_hits)
+#ifndef __KERNEL_GPU__
+ccl_device
+#else
+ccl_device_inline
+#endif
+bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                 const Ray *ray,
+                                 Intersection *isect_array,
+                                 const uint max_hits,
+                                 uint *num_hits)
 {
 	/* todo:
 	 * - likely and unlikely for if() statements
@@ -254,9 +259,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
 						/* shadow ray early termination */
 						if(hit) {
-							/* Update number of hits now, so we do proper check on max bounces. */
-							(*num_hits)++;
-
 							/* detect if this surface has a shader with transparent shadows */
 
 							/* todo: optimize so primitive visibility flag indicates if
@@ -283,15 +285,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								return true;
 							}
 							/* if maximum number of hits reached, block all light */
-							else if(*num_hits >= max_hits) {
+							else if(*num_hits == max_hits) {
 								return true;
 							}
 
+							/* move on to next entry in intersections array */
+							isect_array++;
+							(*num_hits)++;
 #if BVH_FEATURE(BVH_INSTANCING)
 							num_hits_in_instance++;
 #endif
-							/* Move on to next entry in intersections array */
-							isect_array++;
+
+							isect_array->t = isect_t;
 						}
 
 						prim_addr++;
diff --git a/intern/cycles/kernel/bvh/bvh_subsurface.h b/intern/cycles/kernel/bvh/bvh_subsurface.h
index 18978efcfa3..d9623c94b2e 100644
--- a/intern/cycles/kernel/bvh/bvh_subsurface.h
+++ b/intern/cycles/kernel/bvh/bvh_subsurface.h
@@ -35,12 +35,17 @@
  *
  */
 
-ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            SubsurfaceIntersection *ss_isect,
-                                            int subsurface_object,
-                                            uint *lcg_state,
-                                            int max_hits)
+#ifndef __KERNEL_GPU__
+ccl_device
+#else
+ccl_device_inline
+#endif
+void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                 const Ray *ray,
+                                 SubsurfaceIntersection *ss_isect,
+                                 int subsurface_object,
+                                 uint *lcg_state,
+                                 int max_hits)
 {
 	/* todo:
 	 * - test if pushing distance on the stack helps (for non shadow rays)
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index 68a11b65ad7..b1a52968a26 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -40,16 +40,21 @@
  *
  */
 
-ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            Intersection *isect,
-                                            const uint visibility
+#ifndef __KERNEL_GPU__
+ccl_device
+#else
+ccl_device_inline
+#endif
+bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                 const Ray *ray,
+                                 Intersection *isect,
+                                 const uint visibility
 #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-                                            , uint *lcg_state,
-                                            float difl,
-                                            float extmax
+                                 , uint *lcg_state,
+                                 float difl,
+                                 float extmax
 #endif
-                                            )
+                                 )
 {
 	/* todo:
 	 * - test if pushing distance on the stack helps (for non shadow rays)
diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h
index 03499e94347..107373c17dc 100644
--- a/intern/cycles/kernel/bvh/bvh_volume.h
+++ b/intern/cycles/kernel/bvh/bvh_volume.h
@@ -36,10 +36,15 @@
  *
  */
 
-ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            Intersection *isect,
-                                            const uint visibility)
+#ifndef __KERNEL_GPU__
+ccl_device
+#else
+ccl_device_inline
+#endif
+bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                 const Ray *ray,
+                                 Intersection *isect,
+                                 const uint visibility)
 {
 	/* todo:
 	 * - test if pushing distance on the stack helps (for non shadow rays)
diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h
index d7f6bf86c71..1f6515c9862 100644
--- a/intern/cycles/kernel/bvh/bvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -36,11 +36,16 @@
  *
  */
 
-ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            Intersection *isect_array,
-                                            const uint max_hits,
-                                            const uint visibility)
+#ifndef __KERNEL_GPU__
+ccl_device
+#else
+ccl_device_inline
+#endif
+uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                 const Ray *ray,
+                                 Intersection *isect_array,
+                                 const uint max_hits,
+                                 const uint visibility)
 {
 	/* todo:
 	 * - test if pushing distance on the stack helps (for non shadow rays)
@@ -201,12 +206,14 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								                         object,
 								                         prim_addr);
 								if(hit) {
-									/* Update number of hits now, so we do proper check on max bounces. */
+									/* Move on to next entry in intersections array. */
+									isect_array++;
 									num_hits++;
 #if BVH_FEATURE(BVH_INSTANCING)
 									num_hits_in_instance++;
 #endif
-									if(num_hits >= max_hits) {
+									isect_array->t = isect_t;
+									if(num_hits == max_hits) {
 #if BVH_FEATURE(BVH_INSTANCING)
 #  if BVH_FEATURE(BVH_MOTION)
 										float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
@@ -220,9 +227,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #endif  /* BVH_FEATURE(BVH_INSTANCING) */
 										return num_hits;
 									}
-									/* Move on to next entry in intersections array */
-									isect_array++;
-									isect_array->t = isect_t;
 								}
 							}
 							break;
@@ -247,12 +251,14 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								                                object,
 								                                prim_addr);
 								if(hit) {
-									/* Update number of hits now, so we do proper check on max bounces. */
+									/* Move on to next entry in intersections array. */
+									isect_array++;
 									num_hits++;
 #  if BVH_FEATURE(BVH_INSTANCING)
 									num_hits_in_instance++;
 #  endif
-									if(num_hits >= max_hits) {
+									isect_array->t = isect_t;
+									if(num_hits == max_hits) {
 #  if BVH_FEATURE(BVH_INSTANCING)
 #    if BVH_FEATURE(BVH_MOTION)
 										float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
@@ -266,9 +272,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #  endif  /* BVH_FEATURE(BVH_INSTANCING) */
 										return num_hits;
 									}
-									/* Move on to next entry in intersections array */
-									isect_array++;
-									isect_array->t = isect_t;
 								}
 							}
 							break;
diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
index eb98eaf7455..3a728b388eb 100644
--- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
@@ -337,9 +337,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 						/* Shadow ray early termination. */
 						if(hit) {
-							/* Update number of hits now, so we do proper check on max bounces. */
-							(*num_hits)++;
-
 							/* detect if this surface has a shader with transparent shadows */
 
 							/* todo: optimize so primitive visibility flag indicates if
@@ -366,15 +363,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								return true;
 							}
 							/* if maximum number of hits reached, block all light */
-							else if(*num_hits >= max_hits) {
+							else if(*num_hits == max_hits) {
 								return true;
 							}
 
+							/* move on to next entry in intersections array */
+							isect_array++;
+							(*num_hits)++;
 #if BVH_FEATURE(BVH_INSTANCING)
 							num_hits_in_instance++;
 #endif
-							/* Move on to next entry in intersections array */
-							isect_array++;
+
 							isect_array->t = isect_t;
 						}
 
diff --git a/intern/cycles/kernel/bvh/qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h
index 90cad9d91c0..4d3028b37bf 100644
--- a/intern/cycles/kernel/bvh/qbvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h
@@ -268,12 +268,14 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								/* Intersect ray against primitive. */
 								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, prim_addr);
 								if(hit) {
-									/* Update number of hits now, so we do proper check on max bounces. */
+									/* Move on to next entry in intersections array. */
+									isect_array++;
 									num_hits++;
 #if BVH_FEATURE(BVH_INSTANCING)
 									num_hits_in_instance++;
 #endif
-									if(num_hits >= max_hits) {
+									isect_array->t = isect_t;
+									if(num_hits == max_hits) {
 #if BVH_FEATURE(BVH_INSTANCING)
 #  if BVH_FEATURE(BVH_MOTION)
 										float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
@@ -287,9 +289,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #endif  /* BVH_FEATURE(BVH_INSTANCING) */
 										return num_hits;
 									}
-									/* Move on to next entry in intersections array */
-									isect_array++;
-									isect_array->t = isect_t;
 								}
 							}
 							break;
@@ -307,12 +306,14 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								/* Intersect ray against primitive. */
 								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
 								if(hit) {
-									/* Update number of hits now, so we do proper check on max bounces. */
+									/* Move on to next entry in intersections array. */
+									isect_array++;
 									num_hits++;
 #  if BVH_FEATURE(BVH_INSTANCING)
 									num_hits_in_instance++;
 #  endif
-									if(num_hits >= max_hits) {
+									isect_array->t = isect_t;
+									if(num_hits == max_hits) {
 #  if BVH_FEATURE(BVH_INSTANCING)
 #    if BVH_FEATURE(BVH_MOTION)
 										float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
@@ -326,9 +327,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #  endif  /* BVH_FEATURE(BVH_INSTANCING) */
 										return num_hits;
 									}
-									/* Move on to next entry in intersections array */
-									isect_array++;
-									isect_array->t = isect_t;
 								}
 							}
 							break;
diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h
new file mode 100644
index 00000000000..b7abc1ec507
--- /dev/null
+++ b/intern/cycles/kernel/closure/alloc.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Reserve the next free ShaderClosure slot of sd and set its type and weight.
+ * Returns NULL when regular and extra closures already fill all MAX_CLOSURE
+ * slots. */
+ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType type, float3 weight)
+{
+	kernel_assert(size <= sizeof(ShaderClosure));
+
+	const int used = ccl_fetch(sd, num_closure);
+	const int used_extra = ccl_fetch(sd, num_closure_extra);
+	if(used + used_extra < MAX_CLOSURE) {
+		ShaderClosure *slot = &ccl_fetch(sd, closure)[used];
+		slot->type = type;
+		slot->weight = weight;
+		ccl_fetch(sd, num_closure)++;
+		return slot;
+	}
+	return NULL;
+}
+
+ccl_device ccl_addr_space void *closure_alloc_extra(ShaderData *sd, int size)
+{
+	/* Allocate extra space for closures that need more parameters. Space is
+	 * handed out in chunks of sizeof(ShaderClosure), counted backwards from
+	 * the end of the closure array. Returns NULL when the chunks do not fit.
+	 *
+	 * This lets us keep the same fast array iteration over closures, as we
+	 * found linked list iteration and iteration with skipping to be slower. */
+	int num_extra = ((size + sizeof(ShaderClosure) - 1) / sizeof(ShaderClosure));  /* Round up to whole chunks. */
+	int num_closure = ccl_fetch(sd, num_closure);
+	int num_closure_extra = ccl_fetch(sd, num_closure_extra) + num_extra;
+
+	if(num_closure + num_closure_extra > MAX_CLOSURE) {
+		/* Remove the previous closure; its slot is counted as extra instead. */
+		ccl_fetch(sd, num_closure)--;
+		ccl_fetch(sd, num_closure_extra)++;
+		return NULL;
+	}
+
+	ccl_fetch(sd, num_closure_extra) = num_closure_extra;
+	return (ccl_addr_space void*)(ccl_fetch(sd, closure) + MAX_CLOSURE - num_closure_extra);
+}
+
+/* Allocate a BSDF closure; returns NULL when no slot is free or when the
+ * sample weight (absolute average of weight) is below CLOSURE_WEIGHT_CUTOFF. */
+ccl_device_inline ShaderClosure *bsdf_alloc(ShaderData *sd, int size, float3 weight)
+{
+	ShaderClosure *bsdf = closure_alloc(sd, size, CLOSURE_NONE_ID, weight);
+	if(bsdf == NULL)
+		return NULL;
+	const float bsdf_sample_weight = fabsf(average(weight));
+	bsdf->sample_weight = bsdf_sample_weight;
+	return (bsdf_sample_weight >= CLOSURE_WEIGHT_CUTOFF) ? bsdf : NULL;
+}
+
+#ifdef __OSL__
+/* OSL variant of bsdf_alloc: the new closure is first filled with size bytes
+ * copied from data, then weight and sample weight are set on top of that. */
+ccl_device_inline ShaderClosure *bsdf_alloc_osl(ShaderData *sd, int size, float3 weight, void *data)
+{
+	ShaderClosure *bsdf = closure_alloc(sd, size, CLOSURE_NONE_ID, weight);
+	if(bsdf == NULL)
+		return NULL;
+	/* The weight is assigned again once the bytes from data are in place. */
+	memcpy(bsdf, data, size);
+	bsdf->weight = weight;
+	const float bsdf_sample_weight = fabsf(average(weight));
+	bsdf->sample_weight = bsdf_sample_weight;
+	return (bsdf_sample_weight >= CLOSURE_WEIGHT_CUTOFF) ? bsdf : NULL;
+}
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index f318a61f3a3..1e7fbdb5450 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -36,15 +36,18 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device int bsdf_sample(KernelGlobals *kg, ShaderData *sd, const ShaderClosure *sc, float randu, float randv, float3 *eval, float3 *omega_in, differential3 *domega_in, float *pdf)
+ccl_device_inline int bsdf_sample(KernelGlobals *kg,
+                                  ShaderData *sd,
+                                  const ShaderClosure *sc,
+                                  float randu,
+                                  float randv,
+                                  float3 *eval,
+                                  float3 *omega_in,
+                                  differential3 *domega_in,
+                                  float *pdf)
 {
 	int label;
 
-#ifdef __OSL__
-	if(kg->osl && sc->prim)
-		return OSLShader::bsdf_sample(sd, sc, randu, randv, *eval, *omega_in, *domega_in, *pdf);
-#endif
-
 	switch(sc->type) {
 		case CLOSURE_BSDF_DIFFUSE_ID:
 		case CLOSURE_BSDF_BSSRDF_ID:
@@ -56,14 +59,16 @@ ccl_device int bsdf_sample(KernelGlobals *kg, ShaderData *sd, const ShaderClosur
 			label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
-		/*case CLOSURE_BSDF_PHONG_RAMP_ID:
+#ifdef __OSL__
+		case CLOSURE_BSDF_PHONG_RAMP_ID:
 			label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
 			label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
-			break;*/
+			break;
+#endif
 		case CLOSURE_BSDF_TRANSLUCENT_ID:
 			label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
@@ -139,15 +144,19 @@ ccl_device int bsdf_sample(KernelGlobals *kg, ShaderData *sd, const ShaderClosur
 	return label;
 }
 
-ccl_device float3 bsdf_eval(KernelGlobals *kg, ShaderData *sd, const ShaderClosure *sc, const float3 omega_in, float *pdf)
+#ifndef __KERNEL_CUDA__
+ccl_device
+#else
+ccl_device_inline
+#endif
+float3 bsdf_eval(KernelGlobals *kg,
+                 ShaderData *sd,
+                 const ShaderClosure *sc,
+                 const float3 omega_in,
+                 float *pdf)
 {
 	float3 eval;
 
-#ifdef __OSL__
-	if(kg->osl && sc->prim)
-		return OSLShader::bsdf_eval(sd, sc, omega_in, *pdf);
-#endif
-
 	if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) {
 		switch(sc->type) {
 			case CLOSURE_BSDF_DIFFUSE_ID:
@@ -158,12 +167,14 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, ShaderData *sd, const ShaderClosu
 			case CLOSURE_BSDF_OREN_NAYAR_ID:
 				eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
-			/*case CLOSURE_BSDF_PHONG_RAMP_ID:
+#ifdef __OSL__
+			case CLOSURE_BSDF_PHONG_RAMP_ID:
 				eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
 				eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
-				break;*/
+				break;
+#endif
 			case CLOSURE_BSDF_TRANSLUCENT_ID:
 				eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
@@ -296,15 +307,7 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, ShaderData *sd, const ShaderClosu
 
 ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
 {
-/* ToDo: do we want to blur volume closures? */
-
-#ifdef __OSL__
-	if(kg->osl && sc->prim) {
-		OSLShader::bsdf_blur(sc, roughness);
-		return;
-	}
-#endif
-
+	/* ToDo: do we want to blur volume closures? */
 #ifdef __SVM__
 	switch(sc->type) {
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
@@ -331,5 +334,58 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
 #endif
 }
 
+/* Returns true when the per-closure *_merge() comparison selected by a->type
+ * accepts the pair (a, b). Transparent closures always match; closure types
+ * not listed in the switch never do.
+ *
+ * Only a->type is inspected: presumably the caller has already checked that
+ * both closures have the same type (confirm at the call site). */
+ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
+{
+#ifdef __SVM__
+	switch(a->type) {
+		case CLOSURE_BSDF_TRANSPARENT_ID:
+			return true;
+		case CLOSURE_BSDF_DIFFUSE_ID:
+		case CLOSURE_BSDF_BSSRDF_ID:
+		case CLOSURE_BSDF_TRANSLUCENT_ID:
+			return bsdf_diffuse_merge(a, b);
+		case CLOSURE_BSDF_OREN_NAYAR_ID:
+			return bsdf_oren_nayar_merge(a, b);
+		case CLOSURE_BSDF_REFLECTION_ID:
+		case CLOSURE_BSDF_REFRACTION_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
+		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
+		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
+		case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
+		case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
+		case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
+		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
+		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
+			return bsdf_microfacet_merge(a, b);
+		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
+			return bsdf_ashikhmin_velvet_merge(a, b);
+		case CLOSURE_BSDF_DIFFUSE_TOON_ID:
+		case CLOSURE_BSDF_GLOSSY_TOON_ID:
+			return bsdf_toon_merge(a, b);
+		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
+		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
+			return bsdf_hair_merge(a, b);
+#ifdef __VOLUME__
+		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
+			return volume_henyey_greenstein_merge(a, b);
+#endif
+		default:
+			return false;
+	}
+#else
+	/* No SVM closures compiled in: never merge. Also guarantees a return
+	 * value on this path instead of falling off the end of the function. */
+	return false;
+#endif
+}
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
index 8d7d533d6f8..9929246ae5c 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -31,28 +31,30 @@ Other than that, the implementation directly follows the paper.
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device int bsdf_ashikhmin_shirley_setup(ShaderClosure *sc)
+ccl_device int bsdf_ashikhmin_shirley_setup(MicrofacetBsdf *bsdf)
 {
-	sc->data0 = clamp(sc->data0, 1e-4f, 1.0f);
-	sc->data1 = sc->data0;
+	bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f);
+	bsdf->alpha_y = bsdf->alpha_x;
 
-	sc->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID;
+	bsdf->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID;
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device int bsdf_ashikhmin_shirley_aniso_setup(ShaderClosure *sc)
+ccl_device int bsdf_ashikhmin_shirley_aniso_setup(MicrofacetBsdf *bsdf)
 {
-	sc->data0 = clamp(sc->data0, 1e-4f, 1.0f);
-	sc->data1 = clamp(sc->data1, 1e-4f, 1.0f);
+	bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f);
+	bsdf->alpha_y = clamp(bsdf->alpha_y, 1e-4f, 1.0f);
 
-	sc->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID;
+	bsdf->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID;
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device void bsdf_ashikhmin_shirley_blur(ShaderClosure *sc, float roughness)
 {
-	sc->data0 = fmaxf(roughness, sc->data0); /* clamp roughness */
-	sc->data1 = fmaxf(roughness, sc->data1);
+	MicrofacetBsdf *bsdf = (MicrofacetBsdf*)sc;
+
+	bsdf->alpha_x = fmaxf(roughness, bsdf->alpha_x);
+	bsdf->alpha_y = fmaxf(roughness, bsdf->alpha_y);
 }
 
 ccl_device_inline float bsdf_ashikhmin_shirley_roughness_to_exponent(float roughness)
@@ -60,16 +62,21 @@ ccl_device_inline float bsdf_ashikhmin_shirley_roughness_to_exponent(float rough
 	return 2.0f / (roughness*roughness) - 2.0f;
 }
 
-ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device_inline float3 bsdf_ashikhmin_shirley_eval_reflect(
+        const ShaderClosure *sc,
+        const float3 I,
+        const float3 omega_in,
+        float *pdf)
 {
-	float3 N = sc->N;
+	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
+	float3 N = bsdf->N;
 
 	float NdotI = dot(N, I);           /* in Cycles/OSL convention I is omega_out    */
 	float NdotO = dot(N, omega_in);    /* and consequently we use for O omaga_in ;)  */
 
 	float out = 0.0f;
 
-	if(fmaxf(sc->data0, sc->data1) <= 1e-4f)
+	if(fmaxf(bsdf->alpha_x, bsdf->alpha_y) <= 1e-4f)
 		return make_float3(0.0f, 0.0f, 0.0f);
 
 	if(NdotI > 0.0f && NdotO > 0.0f) {
@@ -82,8 +89,8 @@ ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, c
 		float pump = 1.0f / fmaxf(1e-6f, (HdotI*fmaxf(NdotO, NdotI))); /* pump from original paper (first derivative disc., but cancels the HdotI in the pdf nicely) */
 		/*float pump = 1.0f / fmaxf(1e-4f, ((NdotO + NdotI) * (NdotO*NdotI))); */ /* pump from d-brdf paper */
 
-		float n_x = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data0);
-		float n_y = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data1);
+		float n_x = bsdf_ashikhmin_shirley_roughness_to_exponent(bsdf->alpha_x);
+		float n_y = bsdf_ashikhmin_shirley_roughness_to_exponent(bsdf->alpha_y);
 
 		if(n_x == n_y) {
 			/* isotropic */
@@ -97,12 +104,18 @@ ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, c
 		else {
 			/* anisotropic */
 			float3 X, Y;
-			make_orthonormals_tangent(N, sc->T, &X, &Y);
+			make_orthonormals_tangent(N, bsdf->T, &X, &Y);
 
 			float HdotX = dot(H, X);
 			float HdotY = dot(H, Y);
-			float e = (n_x * HdotX*HdotX + n_y * HdotY*HdotY) / (1.0f - HdotN*HdotN);
-			float lobe = powf(HdotN, e);
+			float lobe;
+			if(HdotN < 1.0f) {
+				float e = (n_x * HdotX*HdotX + n_y * HdotY*HdotY) / (1.0f - HdotN*HdotN);
+				lobe = powf(HdotN, e);
+			}
+			else {
+				lobe = 1.0f;
+			}
 			float norm = sqrtf((n_x + 1.0f)*(n_y + 1.0f)) / (8.0f * M_PI_F);
 			
 			out = NdotO * norm * lobe * pump;
@@ -128,13 +141,14 @@ ccl_device_inline void bsdf_ashikhmin_shirley_sample_first_quadrant(float n_x, f
 
 ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-	float3 N = sc->N;
+	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
+	float3 N = bsdf->N;
 
 	float NdotI = dot(N, I);
 	if(NdotI > 0.0f) {
 
-		float n_x = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data0);
-		float n_y = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data1);
+		float n_x = bsdf_ashikhmin_shirley_roughness_to_exponent(bsdf->alpha_x);
+		float n_y = bsdf_ashikhmin_shirley_roughness_to_exponent(bsdf->alpha_y);
 
 		/* get x,y basis on the surface for anisotropy */
 		float3 X, Y;
@@ -142,7 +156,7 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 		if(n_x == n_y)
 			make_orthonormals(N, &X, &Y);
 		else
-			make_orthonormals_tangent(N, sc->T, &X, &Y);
+			make_orthonormals_tangent(N, bsdf->T, &X, &Y);
 
 		/* sample spherical coords for h in tangent space */
 		float phi;
@@ -193,7 +207,7 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 		/* reflect I on H to get omega_in */
 		*omega_in = -I + (2.0f * HdotI) * H;
 
-		if(fmaxf(sc->data0, sc->data1) <= 1e-4f) {
+		if(fmaxf(bsdf->alpha_x, bsdf->alpha_y) <= 1e-4f) {
 			/* Some high number for MIS. */
 			*pdf = 1e6f;
 			*eval = make_float3(1e6f, 1e6f, 1e6f);
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index f1a26650078..7e0f5a7ec75 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -35,20 +35,38 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device int bsdf_ashikhmin_velvet_setup(ShaderClosure *sc)
+typedef ccl_addr_space struct VelvetBsdf { /* parameters of the Ashikhmin velvet closure */
+	SHADER_CLOSURE_BASE; /* common closure header (provides `type`) */
+
+	float sigma; /* input; setup uses max(sigma, 0.01), merge compares it */
+	float invsigma2; /* 1 / sigma^2, written by bsdf_ashikhmin_velvet_setup() */
+	float3 N; /* normal read by the eval/sample functions */
+} VelvetBsdf;
+
+ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf)
 {
-	float sigma = fmaxf(sc->data0, 0.01f);
-	sc->data0 = 1.0f/(sigma * sigma); /* m_invsigma2 */
+	float sigma = fmaxf(bsdf->sigma, 0.01f);
+	bsdf->invsigma2 = 1.0f/(sigma * sigma);
 	
-	sc->type = CLOSURE_BSDF_ASHIKHMIN_VELVET_ID;
+	bsdf->type = CLOSURE_BSDF_ASHIKHMIN_VELVET_ID;
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
+ccl_device bool bsdf_ashikhmin_velvet_merge(const ShaderClosure *a, const ShaderClosure *b)
+{
+	const VelvetBsdf *lhs = (const VelvetBsdf*)a;
+	const VelvetBsdf *rhs = (const VelvetBsdf*)b;
+
+	/* Mergeable only when sigma matches and the normals compare equal. */
+	return (lhs->sigma == rhs->sigma) && isequal_float3(lhs->N, rhs->N);
+}
+
 ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	float m_invsigma2 = sc->data0;
-	float3 N = sc->N;
+	const VelvetBsdf *bsdf = (const VelvetBsdf*)sc;
+	float m_invsigma2 = bsdf->invsigma2;
+	float3 N = bsdf->N;
 
 	float cosNO = dot(N, I);
 	float cosNI = dot(N, omega_in);
@@ -90,8 +108,9 @@ ccl_device float3 bsdf_ashikhmin_velvet_eval_transmit(const ShaderClosure *sc, c
 
 ccl_device int bsdf_ashikhmin_velvet_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-	float m_invsigma2 = sc->data0;
-	float3 N = sc->N;
+	const VelvetBsdf *bsdf = (const VelvetBsdf*)sc;
+	float m_invsigma2 = bsdf->invsigma2;
+	float3 N = bsdf->N;
 
 	// we are viewing the surface from above - send a ray out with uniform
 	// distribution over the hemisphere
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index 4b29bb096d1..dcd187f9305 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -35,17 +35,31 @@
 
 CCL_NAMESPACE_BEGIN
 
+typedef ccl_addr_space struct DiffuseBsdf { /* used by both the diffuse and translucent closures */
+	SHADER_CLOSURE_BASE; /* common closure header (provides `type`) */
+	float3 N; /* normal used for the cosine-weighted eval and sampling */
+} DiffuseBsdf;
+
 /* DIFFUSE */
 
-ccl_device int bsdf_diffuse_setup(ShaderClosure *sc)
+ccl_device int bsdf_diffuse_setup(DiffuseBsdf *bsdf)
 {
-	sc->type = CLOSURE_BSDF_DIFFUSE_ID;
+	bsdf->type = CLOSURE_BSDF_DIFFUSE_ID;
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
+ccl_device bool bsdf_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b)
+{
+	/* The normal is the only parameter, so equal normals means mergeable. */
+	const float3 normal_a = ((const DiffuseBsdf*)a)->N;
+	const float3 normal_b = ((const DiffuseBsdf*)b)->N;
+	return isequal_float3(normal_a, normal_b);
+}
+
 ccl_device float3 bsdf_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	float3 N = sc->N;
+	const DiffuseBsdf *bsdf = (const DiffuseBsdf*)sc;
+	float3 N = bsdf->N;
 
 	float cos_pi = fmaxf(dot(N, omega_in), 0.0f) * M_1_PI_F;
 	*pdf = cos_pi;
@@ -59,7 +73,8 @@ ccl_device float3 bsdf_diffuse_eval_transmit(const ShaderClosure *sc, const floa
 
 ccl_device int bsdf_diffuse_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-	float3 N = sc->N;
+	const DiffuseBsdf *bsdf = (const DiffuseBsdf*)sc;
+	float3 N = bsdf->N;
 
 	// distribution over the hemisphere
 	sample_cos_hemisphere(N, randu, randv, omega_in, pdf);
@@ -80,9 +95,9 @@ ccl_device int bsdf_diffuse_sample(const ShaderClosure *sc, float3 Ng, float3 I,
 
 /* TRANSLUCENT */
 
-ccl_device int bsdf_translucent_setup(ShaderClosure *sc)
+ccl_device int bsdf_translucent_setup(DiffuseBsdf *bsdf)
 {
-	sc->type = CLOSURE_BSDF_TRANSLUCENT_ID;
+	bsdf->type = CLOSURE_BSDF_TRANSLUCENT_ID;
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
@@ -93,7 +108,8 @@ ccl_device float3 bsdf_translucent_eval_reflect(const ShaderClosure *sc, const f
 
 ccl_device float3 bsdf_translucent_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	float3 N = sc->N;
+	const DiffuseBsdf *bsdf = (const DiffuseBsdf*)sc;
+	float3 N = bsdf->N;
 
 	float cos_pi = fmaxf(-dot(N, omega_in), 0.0f) * M_1_PI_F;
 	*pdf = cos_pi;
@@ -102,7 +118,8 @@ ccl_device float3 bsdf_translucent_eval_transmit(const ShaderClosure *sc, const
 
 ccl_device int bsdf_translucent_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-	float3 N = sc->N;
+	const DiffuseBsdf *bsdf = (const DiffuseBsdf*)sc;
+	float3 N = bsdf->N;
 
 	// we are viewing the surface from the right side - send a ray out with cosine
 	// distribution over the hemisphere
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index e0287e7655a..2d982a95fe4 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -35,7 +35,16 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device float3 bsdf_diffuse_ramp_get_color(const ShaderClosure *sc, const float3 colors[8], float pos)
+#ifdef __OSL__
+
+typedef ccl_addr_space struct DiffuseRampBsdf { /* OSL-only diffuse ramp closure */
+	SHADER_CLOSURE_BASE; /* common closure header (provides `type`) */
+
+	float3 N; /* normal used by eval_reflect and sample */
+	float3 *colors; /* ramp table, passed to bsdf_diffuse_ramp_get_color() as float3[8] */
+} DiffuseRampBsdf;
+
+ccl_device float3 bsdf_diffuse_ramp_get_color(const float3 colors[8], float pos)
 {
 	int MAXCOLORS = 8;
 	
@@ -49,11 +58,9 @@ ccl_device float3 bsdf_diffuse_ramp_get_color(const ShaderClosure *sc, const flo
 	return colors[ipos] * (1.0f - offset) + colors[ipos+1] * offset;
 }
 
-ccl_device int bsdf_diffuse_ramp_setup(ShaderClosure *sc)
+ccl_device int bsdf_diffuse_ramp_setup(DiffuseRampBsdf *bsdf)
 {
-	sc->type = CLOSURE_BSDF_DIFFUSE_RAMP_ID;
-	sc->data0 = 0.0f;
-	sc->data1 = 0.0f;
+	bsdf->type = CLOSURE_BSDF_DIFFUSE_RAMP_ID;
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
@@ -61,29 +68,31 @@ ccl_device void bsdf_diffuse_ramp_blur(ShaderClosure *sc, float roughness)
 {
 }
 
-ccl_device float3 bsdf_diffuse_ramp_eval_reflect(const ShaderClosure *sc, const float3 colors[8], const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_diffuse_ramp_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	float3 N = sc->N;
+	const DiffuseRampBsdf *bsdf = (const DiffuseRampBsdf*)sc;
+	float3 N = bsdf->N;
 
 	float cos_pi = fmaxf(dot(N, omega_in), 0.0f);
 	*pdf = cos_pi * M_1_PI_F;
-	return bsdf_diffuse_ramp_get_color(sc, colors, cos_pi) * M_1_PI_F;
+	return bsdf_diffuse_ramp_get_color(bsdf->colors, cos_pi) * M_1_PI_F;
 }
 
-ccl_device float3 bsdf_diffuse_ramp_eval_transmit(const ShaderClosure *sc, const float3 colors[8], const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_diffuse_ramp_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-ccl_device int bsdf_diffuse_ramp_sample(const ShaderClosure *sc, const float3 colors[8], float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_diffuse_ramp_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-	float3 N = sc->N;
+	const DiffuseRampBsdf *bsdf = (const DiffuseRampBsdf*)sc;
+	float3 N = bsdf->N;
 
 	// distribution over the hemisphere
 	sample_cos_hemisphere(N, randu, randv, omega_in, pdf);
 
 	if(dot(Ng, *omega_in) > 0.0f) {
-		*eval = bsdf_diffuse_ramp_get_color(sc, colors, *pdf * M_PI_F) * M_1_PI_F;
+		*eval = bsdf_diffuse_ramp_get_color(bsdf->colors, *pdf * M_PI_F) * M_1_PI_F;
 #ifdef __RAY_DIFFERENTIALS__
 		*domega_in_dx = (2 * dot(N, dIdx)) * N - dIdx;
 		*domega_in_dy = (2 * dot(N, dIdy)) * N - dIdy;
@@ -95,6 +104,8 @@ ccl_device int bsdf_diffuse_ramp_sample(const ShaderClosure *sc, const float3 co
 	return LABEL_REFLECT|LABEL_DIFFUSE;
 }
 
+#endif /* __OSL__ */
+
 CCL_NAMESPACE_END
 
 #endif /* __BSDF_DIFFUSE_RAMP_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h
index 1e81617a7d3..bede5f45e7e 100644
--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -35,29 +35,49 @@
 
 CCL_NAMESPACE_BEGIN
 
+typedef ccl_addr_space struct HairBsdf { /* shared by hair reflection and transmission */
+	SHADER_CLOSURE_BASE; /* common closure header (provides `type`) */
 
-ccl_device int bsdf_hair_reflection_setup(ShaderClosure *sc)
+	float3 T; /* read as Tg: the axis the eval/sample local frame is built around */
+	float roughness1; /* clamped to [0.001, 1] by the setup functions */
+	float roughness2; /* clamped to [0.001, 1] by the setup functions */
+	float offset; /* read by eval/sample; units not visible here - TODO confirm */
+} HairBsdf;
+
+ccl_device int bsdf_hair_reflection_setup(HairBsdf *bsdf)
 {
-	sc->type = CLOSURE_BSDF_HAIR_REFLECTION_ID;
-	sc->data0 = clamp(sc->data0, 0.001f, 1.0f);
-	sc->data1 = clamp(sc->data1, 0.001f, 1.0f);
+	bsdf->type = CLOSURE_BSDF_HAIR_REFLECTION_ID;
+	bsdf->roughness1 = clamp(bsdf->roughness1, 0.001f, 1.0f);
+	bsdf->roughness2 = clamp(bsdf->roughness2, 0.001f, 1.0f);
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device int bsdf_hair_transmission_setup(ShaderClosure *sc)
+ccl_device int bsdf_hair_transmission_setup(HairBsdf *bsdf)
 {
-	sc->type = CLOSURE_BSDF_HAIR_TRANSMISSION_ID;
-	sc->data0 = clamp(sc->data0, 0.001f, 1.0f);
-	sc->data1 = clamp(sc->data1, 0.001f, 1.0f);
+	bsdf->type = CLOSURE_BSDF_HAIR_TRANSMISSION_ID;
+	bsdf->roughness1 = clamp(bsdf->roughness1, 0.001f, 1.0f);
+	bsdf->roughness2 = clamp(bsdf->roughness2, 0.001f, 1.0f);
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
+ccl_device bool bsdf_hair_merge(const ShaderClosure *a, const ShaderClosure *b)
+{
+	/* Hair closures merge only if T, both roughnesses and offset all match. */
+	const HairBsdf *lhs = (const HairBsdf*)a;
+	const HairBsdf *rhs = (const HairBsdf*)b;
+	const bool same_scalars = (lhs->roughness1 == rhs->roughness1) &&
+	                          (lhs->roughness2 == rhs->roughness2) &&
+	                          (lhs->offset == rhs->offset);
+	return same_scalars && isequal_float3(lhs->T, rhs->T);
+}
+
 ccl_device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	float offset = sc->data2;
-	float3 Tg = sc->T;
-	float roughness1 = sc->data0;
-	float roughness2 = sc->data1;
+	const HairBsdf *bsdf = (const HairBsdf*)sc;
+	float offset = bsdf->offset;
+	float3 Tg = bsdf->T;
+	float roughness1 = bsdf->roughness1;
+	float roughness2 = bsdf->roughness2;
 
 	float Iz = dot(Tg, I);
 	float3 locy = normalize(I - Tg * Iz);
@@ -107,10 +127,11 @@ ccl_device float3 bsdf_hair_reflection_eval_transmit(const ShaderClosure *sc, co
 
 ccl_device float3 bsdf_hair_transmission_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	float offset = sc->data2;
-	float3 Tg = sc->T;
-	float roughness1 = sc->data0;
-	float roughness2 = sc->data1;
+	const HairBsdf *bsdf = (const HairBsdf*)sc;
+	float offset = bsdf->offset;
+	float3 Tg = bsdf->T;
+	float roughness1 = bsdf->roughness1;
+	float roughness2 = bsdf->roughness2;
 	float Iz = dot(Tg, I);
 	float3 locy = normalize(I - Tg * Iz);
 
@@ -148,10 +169,11 @@ ccl_device float3 bsdf_hair_transmission_eval_transmit(const ShaderClosure *sc,
 
 ccl_device int bsdf_hair_reflection_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-	float offset = sc->data2;
-	float3 Tg = sc->T;
-	float roughness1 = sc->data0;
-	float roughness2 = sc->data1;
+	const HairBsdf *bsdf = (const HairBsdf*)sc;
+	float offset = bsdf->offset;
+	float3 Tg = bsdf->T;
+	float roughness1 = bsdf->roughness1;
+	float roughness2 = bsdf->roughness2;
 	float Iz = dot(Tg, I);
 	float3 locy = normalize(I - Tg * Iz);
 	float3 locx = cross(locy, Tg);
@@ -198,10 +220,11 @@ ccl_device int bsdf_hair_reflection_sample(const ShaderClosure *sc, float3 Ng, f
 
 ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-	float offset = sc->data2;
-	float3 Tg = sc->T;
-	float roughness1 = sc->data0;
-	float roughness2 = sc->data1;
+	const HairBsdf *bsdf = (const HairBsdf*)sc;
+	float offset = bsdf->offset;
+	float3 Tg = bsdf->T;
+	float roughness1 = bsdf->roughness1;
+	float roughness2 = bsdf->roughness2;
 	float Iz = dot(Tg, I);
 	float3 locy = normalize(I - Tg * Iz);
 	float3 locx = cross(locy, Tg);
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index 7bf7c2806d4..9da73f66da0 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -35,6 +35,19 @@
 
 CCL_NAMESPACE_BEGIN
 
+typedef ccl_addr_space struct MicrofacetExtra { /* optional side data pointed to by MicrofacetBsdf::extra */
+	float3 color; /* compared by bsdf_microfacet_merge() when both closures have extra data */
+} MicrofacetExtra;
+
+typedef ccl_addr_space struct MicrofacetBsdf { /* shared by the GGX, Beckmann and Ashikhmin-Shirley closures */
+	SHADER_CLOSURE_BASE; /* common closure header (provides `type`) */
+
+	float alpha_x, alpha_y, ior; /* alphas are clamped by the setup functions; ior is read as m_eta */
+	MicrofacetExtra *extra; /* may be NULL: bsdf_microfacet_merge() tests it before dereferencing */
+	float3 T; /* tangent handed to make_orthonormals_tangent() on the anisotropic path */
+	float3 N; /* normal read by the eval/sample functions */
+} MicrofacetBsdf;
+
 /* Beckmann and GGX microfacet importance sampling. */
 
 ccl_device_inline void microfacet_beckmann_sample_slopes(
@@ -233,48 +246,66 @@ ccl_device_inline float3 microfacet_sample_stretched(
  * Anisotropy is only supported for reflection currently, but adding it for
  * transmission is just a matter of copying code from reflection if needed. */
 
-ccl_device int bsdf_microfacet_ggx_setup(ShaderClosure *sc)
+ccl_device int bsdf_microfacet_ggx_setup(MicrofacetBsdf *bsdf)
 {
-	sc->data0 = saturate(sc->data0); /* alpha_x */
-	sc->data1 = sc->data0; /* alpha_y */
+	bsdf->alpha_x = saturate(bsdf->alpha_x);
+	bsdf->alpha_y = bsdf->alpha_x;
 	
-	sc->type = CLOSURE_BSDF_MICROFACET_GGX_ID;
+	bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ID;
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device int bsdf_microfacet_ggx_aniso_setup(ShaderClosure *sc)
+ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosure *b)
+{
+	/* Mergeable when all parameters match; `extra` must be absent in both
+	 * closures, or present in both with the same color. */
+	const MicrofacetBsdf *lhs = (const MicrofacetBsdf*)a;
+	const MicrofacetBsdf *rhs = (const MicrofacetBsdf*)b;
+	if(!(lhs->alpha_x == rhs->alpha_x && lhs->alpha_y == rhs->alpha_y && lhs->ior == rhs->ior))
+		return false;
+	if(!(isequal_float3(lhs->N, rhs->N) && isequal_float3(lhs->T, rhs->T)))
+		return false;
+	if(!lhs->extra || !rhs->extra)
+		return !lhs->extra && !rhs->extra;
+	return isequal_float3(lhs->extra->color, rhs->extra->color);
+}
+
+ccl_device int bsdf_microfacet_ggx_aniso_setup(MicrofacetBsdf *bsdf)
 {
-	sc->data0 = saturate(sc->data0); /* alpha_x */
-	sc->data1 = saturate(sc->data1); /* alpha_y */
+	bsdf->alpha_x = saturate(bsdf->alpha_x);
+	bsdf->alpha_y = saturate(bsdf->alpha_y);
 	
-	sc->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID;
+	bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID;
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device int bsdf_microfacet_ggx_refraction_setup(ShaderClosure *sc)
+ccl_device int bsdf_microfacet_ggx_refraction_setup(MicrofacetBsdf *bsdf)
 {
-	sc->data0 = saturate(sc->data0); /* alpha_x */
-	sc->data1 = sc->data0; /* alpha_y */
+	bsdf->alpha_x = saturate(bsdf->alpha_x);
+	bsdf->alpha_y = bsdf->alpha_x;
 
-	sc->type = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
+	bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device void bsdf_microfacet_ggx_blur(ShaderClosure *sc, float roughness)
 {
-	sc->data0 = fmaxf(roughness, sc->data0); /* alpha_x */
-	sc->data1 = fmaxf(roughness, sc->data1); /* alpha_y */
+	MicrofacetBsdf *bsdf = (MicrofacetBsdf*)sc;
+
+	bsdf->alpha_x = fmaxf(roughness, bsdf->alpha_x);
+	bsdf->alpha_y = fmaxf(roughness, bsdf->alpha_y);
 }
 
 ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	float alpha_x = sc->data0;
-	float alpha_y = sc->data1;
-	bool m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
-	float3 N = sc->N;
+	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
+	float alpha_x = bsdf->alpha_x;
+	float alpha_y = bsdf->alpha_y;
+	bool m_refractive = bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
+	float3 N = bsdf->N;
 
 	if(m_refractive || alpha_x*alpha_y <= 1e-7f)
 		return make_float3(0.0f, 0.0f, 0.0f);
@@ -305,7 +336,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
 		else {
 			/* anisotropic */
 			float3 X, Y, Z = N;
-			make_orthonormals_tangent(Z, sc->T, &X, &Y);
+			make_orthonormals_tangent(Z, bsdf->T, &X, &Y);
 
 			/* distribution */
 			float3 local_m = make_float3(dot(X, m), dot(Y, m), dot(Z, m));
@@ -361,11 +392,12 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
 
 ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	float alpha_x = sc->data0;
-	float alpha_y = sc->data1;
-	float m_eta = sc->data2;
-	bool m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
-	float3 N = sc->N;
+	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
+	float alpha_x = bsdf->alpha_x;
+	float alpha_y = bsdf->alpha_y;
+	float m_eta = bsdf->ior;
+	bool m_refractive = bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
+	float3 N = bsdf->N;
 
 	if(!m_refractive || alpha_x*alpha_y <= 1e-7f)
 		return make_float3(0.0f, 0.0f, 0.0f);
@@ -415,10 +447,11 @@ ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, con
 
 ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-	float alpha_x = sc->data0;
-	float alpha_y = sc->data1;
-	bool m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
-	float3 N = sc->N;
+	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
+	float alpha_x = bsdf->alpha_x;
+	float alpha_y = bsdf->alpha_y;
+	bool m_refractive = bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
+	float3 N = bsdf->N;
 
 	float cosNO = dot(N, I);
 	if(cosNO > 0) {
@@ -427,7 +460,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 		if(alpha_x == alpha_y)
 			make_orthonormals(Z, &X, &Y);
 		else
-			make_orthonormals_tangent(Z, sc->T, &X, &Y);
+			make_orthonormals_tangent(Z, bsdf->T, &X, &Y);
 
 		/* importance sampling with distribution of visible normals. vectors are
 		 * transformed to local space before and after */
@@ -522,7 +555,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 #ifdef __RAY_DIFFERENTIALS__
 			float3 dRdx, dRdy, dTdx, dTdy;
 #endif
-			float m_eta = sc->data2, fresnel;
+			float m_eta = bsdf->ior, fresnel;
 			bool inside;
 
 			fresnel = fresnel_dielectric(m_eta, m, I, &R, &T,
@@ -582,37 +615,39 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
  * Microfacet Models for Refraction through Rough Surfaces
  * B. Walter, S. R. Marschner, H. Li, K. E. Torrance, EGSR 2007 */
 
-ccl_device int bsdf_microfacet_beckmann_setup(ShaderClosure *sc)
+ccl_device int bsdf_microfacet_beckmann_setup(MicrofacetBsdf *bsdf)
 {
-	sc->data0 = saturate(sc->data0); /* alpha_x */
-	sc->data1 = sc->data0; /* alpha_y */
+	bsdf->alpha_x = saturate(bsdf->alpha_x);
+	bsdf->alpha_y = bsdf->alpha_x;
 
-	sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ID;
+	bsdf->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ID;
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device int bsdf_microfacet_beckmann_aniso_setup(ShaderClosure *sc)
+ccl_device int bsdf_microfacet_beckmann_aniso_setup(MicrofacetBsdf *bsdf)
 {
-	sc->data0 = saturate(sc->data0); /* alpha_x */
-	sc->data1 = saturate(sc->data1); /* alpha_y */
+	bsdf->alpha_x = saturate(bsdf->alpha_x);
+	bsdf->alpha_y = saturate(bsdf->alpha_y);
 
-	sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID;
+	bsdf->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID;
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device int bsdf_microfacet_beckmann_refraction_setup(ShaderClosure *sc)
+ccl_device int bsdf_microfacet_beckmann_refraction_setup(MicrofacetBsdf *bsdf)
 {
-	sc->data0 = saturate(sc->data0); /* alpha_x */
-	sc->data1 = sc->data0; /* alpha_y */
+	bsdf->alpha_x = saturate(bsdf->alpha_x);
+	bsdf->alpha_y = bsdf->alpha_x;
 
-	sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
+	bsdf->type = CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device void bsdf_microfacet_beckmann_blur(ShaderClosure *sc, float roughness)
 {
-	sc->data0 = fmaxf(roughness, sc->data0); /* alpha_x */
-	sc->data1 = fmaxf(roughness, sc->data1); /* alpha_y */
+	MicrofacetBsdf *bsdf = (MicrofacetBsdf*)sc;
+
+	bsdf->alpha_x = fmaxf(roughness, bsdf->alpha_x);
+	bsdf->alpha_y = fmaxf(roughness, bsdf->alpha_y);
 }
 
 ccl_device_inline float bsdf_beckmann_G1(float alpha, float cos_n)
@@ -647,10 +682,11 @@ ccl_device_inline float bsdf_beckmann_aniso_G1(float alpha_x, float alpha_y, flo
 
 ccl_device float3 bsdf_microfacet_beckmann_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	float alpha_x = sc->data0;
-	float alpha_y = sc->data1;
-	bool m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
-	float3 N = sc->N;
+	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
+	float alpha_x = bsdf->alpha_x;
+	float alpha_y = bsdf->alpha_y;
+	bool m_refractive = bsdf->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
+	float3 N = bsdf->N;
 
 	if(m_refractive || alpha_x*alpha_y <= 1e-7f)
 		return make_float3(0.0f, 0.0f, 0.0f);
@@ -682,7 +718,7 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_reflect(const ShaderClosure *sc,
 		else {
 			/* anisotropic */
 			float3 X, Y, Z = N;
-			make_orthonormals_tangent(Z, sc->T, &X, &Y);
+			make_orthonormals_tangent(Z, bsdf->T, &X, &Y);
 
 			/* distribution */
 			float3 local_m = make_float3(dot(X, m), dot(Y, m), dot(Z, m));
@@ -722,11 +758,12 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_reflect(const ShaderClosure *sc,
 
 ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	float alpha_x = sc->data0;
-	float alpha_y = sc->data1;
-	float m_eta = sc->data2;
-	bool m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
-	float3 N = sc->N;
+	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
+	float alpha_x = bsdf->alpha_x;
+	float alpha_y = bsdf->alpha_y;
+	float m_eta = bsdf->ior;
+	bool m_refractive = bsdf->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
+	float3 N = bsdf->N;
 
 	if(!m_refractive || alpha_x*alpha_y <= 1e-7f)
 		return make_float3(0.0f, 0.0f, 0.0f);
@@ -773,10 +810,11 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc
 
 ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-	float alpha_x = sc->data0;
-	float alpha_y = sc->data1;
-	bool m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
-	float3 N = sc->N;
+	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
+	float alpha_x = bsdf->alpha_x;
+	float alpha_y = bsdf->alpha_y;
+	bool m_refractive = bsdf->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
+	float3 N = bsdf->N;
 
 	float cosNO = dot(N, I);
 	if(cosNO > 0) {
@@ -785,7 +823,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 		if(alpha_x == alpha_y)
 			make_orthonormals(Z, &X, &Y);
 		else
-			make_orthonormals_tangent(Z, sc->T, &X, &Y);
+			make_orthonormals_tangent(Z, bsdf->T, &X, &Y);
 
 		/* importance sampling with distribution of visible normals. vectors are
 		 * transformed to local space before and after */
@@ -872,7 +910,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 #ifdef __RAY_DIFFERENTIALS__
 			float3 dRdx, dRdy, dTdx, dTdy;
 #endif
-			float m_eta = sc->data2, fresnel;
+			float m_eta = bsdf->ior, fresnel;
 			bool inside;
 
 			fresnel = fresnel_dielectric(m_eta, m, I, &R, &T,
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index 51b12fe4e45..df848c3d179 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -42,7 +42,7 @@ ccl_device_inline float D_ggx_aniso(const float3 wm, const float2 alpha)
 /* Sample slope distribution (based on page 14 of the supplemental implementation). */
 ccl_device_inline float2 mf_sampleP22_11(const float cosI, const float2 randU)
 {
-	if(cosI > 0.9999f) {
+	if(cosI > 0.9999f || cosI < 1e-6f) {
 		const float r = sqrtf(randU.x / (1.0f - randU.x));
 		const float phi = M_2PI_F * randU.y;
 		return make_float2(r*cosf(phi), r*sinf(phi));
@@ -117,7 +117,7 @@ ccl_device_inline float3 mf_eval_phase_glossy(const float3 w, const float lambda
 	if(dotW_WH < 0.0f)
 		return make_float3(0.0f, 0.0f, 0.0f);
 
-	float phase = max(0.0f, dotW_WH) * 0.25f / (pArea * dotW_WH);
+	float phase = max(0.0f, dotW_WH) * 0.25f / max(pArea * dotW_WH, 1e-7f);
 	if(alpha.x == alpha.y)
 		phase *= D_ggx(wh, alpha.x);
 	else
@@ -200,9 +200,9 @@ ccl_device_inline float mf_lambda(const float3 w, const float2 alpha)
 	if(w.z > 0.9999f)
 		return 0.0f;
 	else if(w.z < -0.9999f)
-		return -1.0f;
+		return -0.9999f;
 
-	const float inv_wz2 = 1.0f / (w.z*w.z);
+	const float inv_wz2 = 1.0f / max(w.z*w.z, 1e-7f);
 	const float2 wa = make_float2(w.x, w.y)*alpha;
 	float v = sqrtf(1.0f + dot(wa, wa) * inv_wz2);
 	if(w.z <= 0.0f)
@@ -271,7 +271,10 @@ ccl_device_inline float mf_ggx_albedo(float r)
 
 ccl_device_inline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha)
 {
-	return 0.25f * D_ggx(normalize(wi+wo), alpha) / ((1.0f + mf_lambda(wi, make_float2(alpha, alpha))) * wi.z) + (1.0f - mf_ggx_albedo(alpha)) * wo.z;
+	float D = D_ggx(normalize(wi+wo), alpha); /* distribution term evaluated at the normalized sum of wi and wo */
+	float lambda = mf_lambda(wi, make_float2(alpha, alpha)); /* the single alpha is passed for both axes */
+	float albedo = mf_ggx_albedo(alpha); /* enters the second term as (1 - albedo) * wo.z */
+	return 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f) + (1.0f - albedo) * wo.z; /* divisor is floored at 1e-7f; the removed one-liner divided by it unguarded */
 }
 
 ccl_device_inline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha)
@@ -325,40 +328,42 @@ ccl_device_inline float mf_glass_pdf(const float3 wi, const float3 wo, const flo
 
 ccl_device void bsdf_microfacet_multi_ggx_blur(ShaderClosure *sc, float roughness)
 {
-	sc->data0 = fmaxf(roughness, sc->data0); /* alpha_x */
-	sc->data1 = fmaxf(roughness, sc->data1); /* alpha_y */
+	MicrofacetBsdf *bsdf = (MicrofacetBsdf*)sc; /* unchecked cast: assumes sc really is a MicrofacetBsdf */
+
+	bsdf->alpha_x = fmaxf(roughness, bsdf->alpha_x); /* roughness acts as a lower bound; an already larger alpha is kept */
+	bsdf->alpha_y = fmaxf(roughness, bsdf->alpha_y); /* same lower bound applied to the second axis */
 }
 
 /* === Closure implementations === */
 
 /* Multiscattering GGX Glossy closure */
 
-ccl_device int bsdf_microfacet_multi_ggx_common_setup(ShaderClosure *sc)
+ccl_device int bsdf_microfacet_multi_ggx_common_setup(MicrofacetBsdf *bsdf) /* shared tail of the multi-GGX glossy setups: clamps inputs, tags the type, returns flags */
 {
-	sc->data0 = clamp(sc->data0, 1e-4f, 1.0f); /* alpha */
-	sc->data1 = clamp(sc->data1, 1e-4f, 1.0f);
-	sc->custom1 = saturate(sc->custom1); /* color */
-	sc->custom2 = saturate(sc->custom2);
-	sc->custom3 = saturate(sc->custom3);
+	bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f); /* replaces sc->data0; kept within [1e-4, 1] */
+	bsdf->alpha_y = clamp(bsdf->alpha_y, 1e-4f, 1.0f); /* replaces sc->data1 */
+	bsdf->extra->color.x = saturate(bsdf->extra->color.x); /* color moved from custom1..3 into bsdf->extra; NOTE(review): extra is dereferenced unchecked - confirm callers allocate it first */
+	bsdf->extra->color.y = saturate(bsdf->extra->color.y);
+	bsdf->extra->color.z = saturate(bsdf->extra->color.z);
 
-	sc->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
+	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
 
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG|SD_BSDF_HAS_CUSTOM;
+	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; /* SD_BSDF_HAS_CUSTOM is no longer part of the returned flags */
 }
 
-ccl_device int bsdf_microfacet_multi_ggx_aniso_setup(ShaderClosure *sc)
+ccl_device int bsdf_microfacet_multi_ggx_aniso_setup(MicrofacetBsdf *bsdf) /* anisotropic entry point: fixes up the tangent, then defers to the common setup */
 {
-	if(is_zero(sc->T))
-		sc->T = make_float3(1.0f, 0.0f, 0.0f);
+	if(is_zero(bsdf->T)) /* when is_zero() reports the tangent as zero it is replaced by (1, 0, 0) */
+		bsdf->T = make_float3(1.0f, 0.0f, 0.0f);
 
-	return bsdf_microfacet_multi_ggx_common_setup(sc);
+	return bsdf_microfacet_multi_ggx_common_setup(bsdf); /* the flags returned are those of the common setup */
 }
 
-ccl_device int bsdf_microfacet_multi_ggx_setup(ShaderClosure *sc)
+ccl_device int bsdf_microfacet_multi_ggx_setup(MicrofacetBsdf *bsdf) /* isotropic entry point: equalizes the alphas, then defers to the common setup */
 {
-	sc->data1 = sc->data0;
+	bsdf->alpha_y = bsdf->alpha_x; /* equal alphas make the is_aniso test (alpha_x != alpha_y) in eval/sample false */
 
-	return bsdf_microfacet_multi_ggx_common_setup(sc);
+	return bsdf_microfacet_multi_ggx_common_setup(bsdf); /* the flags returned are those of the common setup */
 }
 
 ccl_device float3 bsdf_microfacet_multi_ggx_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) {
@@ -367,11 +372,12 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_transmit(const ShaderClosure *s
 }
 
 ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) {
-	bool is_aniso = (sc->data0 != sc->data1);
+	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
+	bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y);
 	float3 X, Y, Z;
-	Z = sc->N;
+	Z = bsdf->N;
 	if(is_aniso)
-		make_orthonormals_tangent(Z, sc->T, &X, &Y);
+		make_orthonormals_tangent(Z, bsdf->T, &X, &Y);
 	else
 		make_orthonormals(Z, &X, &Y);
 
@@ -379,94 +385,115 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc
 	float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z));
 
 	if(is_aniso)
-		*pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(sc->data0, sc->data1));
+		*pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y));
 	else
-		*pdf = mf_ggx_pdf(localI, localO, sc->data0);
-	return mf_eval_glossy(localI, localO, true, make_float3(sc->custom1, sc->custom2, sc->custom3), sc->data0, sc->data1, lcg_state, NULL, NULL);
+		*pdf = mf_ggx_pdf(localI, localO, bsdf->alpha_x);
+	return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL);
 }
 
 ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state)
 {
-	bool is_aniso = (sc->data0 != sc->data1);
+	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc; /* unchecked cast: assumes sc really is a MicrofacetBsdf */
+	bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y); /* differing alphas select the tangent-based frame and the aniso pdf below */
 	float3 X, Y, Z;
-	Z = sc->N;
+	Z = bsdf->N; /* the closure normal is the local Z axis */
 	if(is_aniso)
-		make_orthonormals_tangent(Z, sc->T, &X, &Y);
+		make_orthonormals_tangent(Z, bsdf->T, &X, &Y);
 	else
 		make_orthonormals(Z, &X, &Y);
 
 	float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z));
 	float3 localO;
 
-	*eval = mf_sample_glossy(localI, &localO, make_float3(sc->custom1, sc->custom2, sc->custom3), sc->data0, sc->data1, lcg_state, NULL, NULL);
+	*eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL); /* color now comes from bsdf->extra instead of custom1..3; localO is filled in by the call */
 	if(is_aniso)
-		*pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(sc->data0, sc->data1));
+		*pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y));
 	else
-		*pdf = mf_ggx_pdf(localI, localO, sc->data0);
+		*pdf = mf_ggx_pdf(localI, localO, bsdf->alpha_x);
 	*eval *= *pdf;
 
 	*omega_in = X*localO.x + Y*localO.y + Z*localO.z;
+#ifdef __RAY_DIFFERENTIALS__
+	*domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx; /* newly written output: dIdx mirrored about Z; does not depend on the sampled localO */
+	*domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy; /* same mirroring for the y differential */
+#endif
 	return LABEL_REFLECT|LABEL_GLOSSY;
 }
 
 /* Multiscattering GGX Glass closure */
 
-ccl_device int bsdf_microfacet_multi_ggx_glass_setup(ShaderClosure *sc)
+ccl_device int bsdf_microfacet_multi_ggx_glass_setup(MicrofacetBsdf *bsdf) /* multi-GGX glass setup: clamps inputs, tags the type, returns flags */
 {
-	sc->data0 = clamp(sc->data0, 1e-4f, 1.0f); /* alpha */
-	sc->data1 = sc->data0;
-	sc->data2 = max(0.0f, sc->data2); /* ior */
-	sc->custom1 = saturate(sc->custom1); /* color */
-	sc->custom2 = saturate(sc->custom2);
-	sc->custom3 = saturate(sc->custom3);
+	bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f); /* replaces sc->data0; kept within [1e-4, 1] */
+	bsdf->alpha_y = bsdf->alpha_x; /* replaces sc->data1; glass always uses a single alpha */
+	bsdf->ior = max(0.0f, bsdf->ior); /* replaces sc->data2; negative values become 0 */
+	bsdf->extra->color.x = saturate(bsdf->extra->color.x); /* color moved from custom1..3 into bsdf->extra; NOTE(review): extra is dereferenced unchecked - confirm callers allocate it first */
+	bsdf->extra->color.y = saturate(bsdf->extra->color.y);
+	bsdf->extra->color.z = saturate(bsdf->extra->color.z);
 
-	sc->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID;
+	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID;
 
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG|SD_BSDF_HAS_CUSTOM;
+	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; /* SD_BSDF_HAS_CUSTOM is no longer part of the returned flags */
 }
 
 ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) {
+	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc; /* unchecked cast: assumes sc really is a MicrofacetBsdf */
 	float3 X, Y, Z;
-	Z = sc->N;
+	Z = bsdf->N; /* the closure normal is the local Z axis */
 	make_orthonormals(Z, &X, &Y);
 
 	float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z));
 	float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z));
 
-	*pdf = mf_glass_pdf(localI, localO, sc->data0, sc->data2);
-	return mf_eval_glass(localI, localO, false, make_float3(sc->custom1, sc->custom2, sc->custom3), sc->data0, sc->data1, lcg_state, sc->data2);
+	*pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior); /* alpha_x and ior replace data0 and data2 */
+	return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior); /* third argument is false here and true in the reflect variant; color now comes from bsdf->extra */
 }
 
 ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) {
+	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc; /* unchecked cast: assumes sc really is a MicrofacetBsdf */
 	float3 X, Y, Z;
-	Z = sc->N;
+	Z = bsdf->N; /* the closure normal is the local Z axis */
 	make_orthonormals(Z, &X, &Y);
 
 	float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z));
 	float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z));
 
-	*pdf = mf_glass_pdf(localI, localO, sc->data0, sc->data2);
-	return mf_eval_glass(localI, localO, true, make_float3(sc->custom1, sc->custom2, sc->custom3), sc->data0, sc->data1, lcg_state, sc->data2);
+	*pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior); /* alpha_x and ior replace data0 and data2 */
+	return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior); /* third argument is true here and false in the transmit variant; color now comes from bsdf->extra */
 }
 
 ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state)
 {
+	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc; /* unchecked cast: assumes sc really is a MicrofacetBsdf */
 	float3 X, Y, Z;
-	Z = sc->N;
+	Z = bsdf->N; /* the closure normal is the local Z axis */
 	make_orthonormals(Z, &X, &Y);
 
 	float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z));
 	float3 localO;
 
-	*eval = mf_sample_glass(localI, &localO, make_float3(sc->custom1, sc->custom2, sc->custom3), sc->data0, sc->data1, lcg_state, sc->data2);
-	*pdf = mf_glass_pdf(localI, localO, sc->data0, sc->data2);
+	*eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior); /* color now comes from bsdf->extra; localO is filled in by the call */
+	*pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior);
 	*eval *= *pdf;
 
 	*omega_in = X*localO.x + Y*localO.y + Z*localO.z;
-	if(localO.z*localI.z > 0.0f)
+	if(localO.z*localI.z > 0.0f) { /* both local z components share a sign: labelled as a reflection */
+#ifdef __RAY_DIFFERENTIALS__
+		*domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx; /* newly written output: dIdx mirrored about Z */
+		*domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy; /* same mirroring for the y differential */
+#endif
 		return LABEL_REFLECT|LABEL_GLOSSY;
-	else
+	}
+	else { /* otherwise the sample is labelled as a transmission */
+#ifdef __RAY_DIFFERENTIALS__
+		float cosI = dot(Z, I);
+		float dnp = max(sqrtf(1.0f - (bsdf->ior * bsdf->ior * (1.0f - cosI*cosI))), 1e-7f); /* floored at 1e-7f because it is a divisor below; NOTE(review): the sqrtf argument goes negative when ior*ior*(1 - cosI*cosI) > 1 - confirm max() still yields 1e-7f for that NaN on every backend */
+		*domega_in_dx = -(bsdf->ior * dIdx) + ((bsdf->ior - bsdf->ior * bsdf->ior * cosI / dnp) * dot(dIdx, Z)) * Z; /* newly written output, built from ior, cosI and the Z axis only */
+		*domega_in_dy = -(bsdf->ior * dIdy) + ((bsdf->ior - bsdf->ior * bsdf->ior * cosI / dnp) * dot(dIdy, Z)) * Z; /* same expression with dIdy */
+#endif
+
 		return LABEL_TRANSMIT|LABEL_GLOSSY;
+	}
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
index afd4a8da62a..6ebe2f6a751 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
@@ -25,11 +25,18 @@
  * energy is used. In combination with MIS, that is enough to produce an unbiased result, although
  * the balance heuristic isn't necessarily optimal anymore.
  */
-ccl_device float3 MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi, float3 wo, const bool wo_outside, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint* lcg_state
+ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
+        float3 wi,
+        float3 wo,
+        const bool wo_outside,
+        const float3 color,
+        const float alpha_x,
+        const float alpha_y,
+         ccl_addr_space uint *lcg_state
 #ifdef MF_MULTI_GLASS
-	, const float eta
+        , const float eta
 #elif defined(MF_MULTI_GLOSSY)
-	, float3 *n, float3 *k
+        , float3 *n, float3 *k
 #endif
 )
 {
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index 61b7cb11b02..cb342a026ef 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -19,39 +19,59 @@
 
 CCL_NAMESPACE_BEGIN
 
+typedef ccl_addr_space struct OrenNayarBsdf { /* typed view of a ShaderClosure for the Oren-Nayar functions in this header */
+	SHADER_CLOSURE_BASE;
+
+	float3 N; /* normal used by the eval hemisphere test and by the sampler */
+	float roughness; /* input; read as sigma by bsdf_oren_nayar_setup() and compared by bsdf_oren_nayar_merge() */
+	float a; /* written by setup: 1 / (pi + ((3*pi - 4) / 6) * sigma) */
+	float b; /* written by setup: sigma times the same factor */
+} OrenNayarBsdf;
+
 ccl_device float3 bsdf_oren_nayar_get_intensity(const ShaderClosure *sc, float3 n, float3 v, float3 l)
 {
+	const OrenNayarBsdf *bsdf = (const OrenNayarBsdf*)sc; /* unchecked cast: assumes sc really is an OrenNayarBsdf */
 	float nl = max(dot(n, l), 0.0f);
 	float nv = max(dot(n, v), 0.0f);
 	float t = dot(l, v) - nl * nv;
 
 	if(t > 0.0f)
 		t /= max(nl, nv) + FLT_MIN;
-	float is = nl * (sc->data0 + sc->data1 * t);
+	float is = nl * (bsdf->a + bsdf->b * t); /* a and b replace data0/data1; both are written by bsdf_oren_nayar_setup() */
 	return make_float3(is, is, is);
 }
 
-ccl_device int bsdf_oren_nayar_setup(ShaderClosure *sc)
+ccl_device int bsdf_oren_nayar_setup(OrenNayarBsdf *bsdf) /* derives the a/b coefficients from roughness, tags the type, returns flags */
 {
-	float sigma = sc->data0;
+	float sigma = bsdf->roughness; /* the input field itself is never written back, so it keeps the caller's unsaturated value */
 
-	sc->type = CLOSURE_BSDF_OREN_NAYAR_ID;
+	bsdf->type = CLOSURE_BSDF_OREN_NAYAR_ID;
 
 	sigma = saturate(sigma);
 
 	float div = 1.0f / (M_PI_F + ((3.0f * M_PI_F - 4.0f) / 6.0f) * sigma);
 
-	sc->data0 = 1.0f * div;
-	sc->data1 = sigma * div;
+	bsdf->a = 1.0f * div; /* separate output field; the removed code overwrote data0, i.e. the roughness input */
+	bsdf->b = sigma * div; /* replaces sc->data1 */
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
+ccl_device bool bsdf_oren_nayar_merge(const ShaderClosure *a, const ShaderClosure *b) /* true when a and b agree on N and roughness */
+{
+	const OrenNayarBsdf *bsdf_a = (const OrenNayarBsdf*)a; /* unchecked casts: both arguments are assumed to be OrenNayarBsdf */
+	const OrenNayarBsdf *bsdf_b = (const OrenNayarBsdf*)b;
+
+	return (isequal_float3(bsdf_a->N, bsdf_b->N)) &&
+	       (bsdf_a->roughness == bsdf_b->roughness); /* exact float equality; the derived a/b fields are not compared */
+}
+
 ccl_device float3 bsdf_oren_nayar_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	if(dot(sc->N, omega_in) > 0.0f) {
+	const OrenNayarBsdf *bsdf = (const OrenNayarBsdf*)sc;
+	if(dot(bsdf->N, omega_in) > 0.0f) {
 		*pdf = 0.5f * M_1_PI_F;
-		return bsdf_oren_nayar_get_intensity(sc, sc->N, I, omega_in);
+		return bsdf_oren_nayar_get_intensity(sc, bsdf->N, I, omega_in);
 	}
 	else {
 		*pdf = 0.0f;
@@ -66,15 +86,16 @@ ccl_device float3 bsdf_oren_nayar_eval_transmit(const ShaderClosure *sc, const f
 
 ccl_device int bsdf_oren_nayar_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-	sample_uniform_hemisphere(sc->N, randu, randv, omega_in, pdf);
+	const OrenNayarBsdf *bsdf = (const OrenNayarBsdf*)sc;
+	sample_uniform_hemisphere(bsdf->N, randu, randv, omega_in, pdf);
 
 	if(dot(Ng, *omega_in) > 0.0f) {
-		*eval = bsdf_oren_nayar_get_intensity(sc, sc->N, I, *omega_in);
+		*eval = bsdf_oren_nayar_get_intensity(sc, bsdf->N, I, *omega_in);
 
 #ifdef __RAY_DIFFERENTIALS__
 		// TODO: find a better approximation for the bounce
-		*domega_in_dx = (2.0f * dot(sc->N, dIdx)) * sc->N - dIdx;
-		*domega_in_dy = (2.0f * dot(sc->N, dIdy)) * sc->N - dIdy;
+		*domega_in_dx = (2.0f * dot(bsdf->N, dIdx)) * bsdf->N - dIdx;
+		*domega_in_dy = (2.0f * dot(bsdf->N, dIdy)) * bsdf->N - dIdy;
 #endif
 	}
 	else {
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index 1ab15eee954..e152a8780db 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -35,7 +35,17 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device float3 bsdf_phong_ramp_get_color(const ShaderClosure *sc, const float3 colors[8], float pos)
+#ifdef __OSL__
+
+typedef ccl_addr_space struct PhongRampBsdf { /* typed view of a ShaderClosure for the phong ramp functions in this header */
+	SHADER_CLOSURE_BASE;
+
+	float3 N; /* normal used for the cosine tests and for reflecting I */
+	float exponent; /* clamped to >= 0 by bsdf_phong_ramp_setup() */
+	float3 *colors; /* handed to bsdf_phong_ramp_get_color(), whose parameter is declared float3[8]; NOTE(review): nothing in this header allocates it - confirm ownership with the caller */
+} PhongRampBsdf;
+
+ccl_device float3 bsdf_phong_ramp_get_color(const float3 colors[8], float pos)
 {
 	int MAXCOLORS = 8;
 	
@@ -49,57 +59,54 @@ ccl_device float3 bsdf_phong_ramp_get_color(const ShaderClosure *sc, const float
 	return colors[ipos] * (1.0f - offset) + colors[ipos+1] * offset;
 }
 
-ccl_device int bsdf_phong_ramp_setup(ShaderClosure *sc)
+ccl_device int bsdf_phong_ramp_setup(PhongRampBsdf *bsdf) /* tags the closure type, clamps the exponent, returns flags */
 {
-	sc->type = CLOSURE_BSDF_PHONG_RAMP_ID;
-	sc->data0 = max(sc->data0, 0.0f);
-	sc->data1 = 0.0f;
+	bsdf->type = CLOSURE_BSDF_PHONG_RAMP_ID;
+	bsdf->exponent = max(bsdf->exponent, 0.0f); /* replaces sc->data0; negative exponents become 0. The old data1 = 0 reset has no counterpart */
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device void bsdf_phong_ramp_blur(ShaderClosure *sc, float roughness)
-{
-}
-
-ccl_device float3 bsdf_phong_ramp_eval_reflect(const ShaderClosure *sc, const float3 colors[8], const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_phong_ramp_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) /* the colors[8] parameter is gone; the ramp is read from the closure instead */
 {
-	float m_exponent = sc->data0;
-	float cosNI = dot(sc->N, omega_in);
-	float cosNO = dot(sc->N, I);
+	const PhongRampBsdf *bsdf = (const PhongRampBsdf*)sc; /* unchecked cast: assumes sc really is a PhongRampBsdf */
+	float m_exponent = bsdf->exponent; /* replaces sc->data0 */
+	float cosNI = dot(bsdf->N, omega_in);
+	float cosNO = dot(bsdf->N, I);
 	
 	if(cosNI > 0 && cosNO > 0) {
 		// reflect the view vector
-		float3 R = (2 * cosNO) * sc->N - I;
+		float3 R = (2 * cosNO) * bsdf->N - I;
 		float cosRI = dot(R, omega_in);
 		if(cosRI > 0) {
 			float cosp = powf(cosRI, m_exponent);
 			float common = 0.5f * M_1_PI_F * cosp;
 			float out = cosNI * (m_exponent + 2) * common;
 			*pdf = (m_exponent + 1) * common;
-			return bsdf_phong_ramp_get_color(sc, colors, cosp) * out;
+			return bsdf_phong_ramp_get_color(bsdf->colors, cosp) * out; /* ramp lookup keyed by cosp; on every other path the function returns zero without writing *pdf */
 		}
 	}
 	
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-ccl_device float3 bsdf_phong_ramp_eval_transmit(const ShaderClosure *sc, const float3 colors[8], const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_phong_ramp_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) /* colors[8] parameter dropped; always returns zero and leaves *pdf unwritten */
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc, const float3 colors[8], float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-	float cosNO = dot(sc->N, I);
-	float m_exponent = sc->data0;
+	const PhongRampBsdf *bsdf = (const PhongRampBsdf*)sc;
+	float cosNO = dot(bsdf->N, I);
+	float m_exponent = bsdf->exponent;
 	
 	if(cosNO > 0) {
 		// reflect the view vector
-		float3 R = (2 * cosNO) * sc->N - I;
+		float3 R = (2 * cosNO) * bsdf->N - I;
 
 #ifdef __RAY_DIFFERENTIALS__
-		*domega_in_dx = (2 * dot(sc->N, dIdx)) * sc->N - dIdx;
-		*domega_in_dy = (2 * dot(sc->N, dIdy)) * sc->N - dIdy;
+		*domega_in_dx = (2 * dot(bsdf->N, dIdx)) * bsdf->N - dIdx;
+		*domega_in_dy = (2 * dot(bsdf->N, dIdy)) * bsdf->N - dIdy;
 #endif
 		
 		float3 T, B;
@@ -114,7 +121,7 @@ ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc, const float3 colo
 		if(dot(Ng, *omega_in) > 0.0f)
 		{
 			// common terms for pdf and eval
-			float cosNI = dot(sc->N, *omega_in);
+			float cosNI = dot(bsdf->N, *omega_in);
 			// make sure the direction we chose is still in the right hemisphere
 			if(cosNI > 0)
 			{
@@ -122,13 +129,14 @@ ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc, const float3 colo
 				float common = 0.5f * M_1_PI_F * cosp;
 				*pdf = (m_exponent + 1) * common;
 				float out = cosNI * (m_exponent + 2) * common;
-				*eval = bsdf_phong_ramp_get_color(sc, colors, cosp) * out;
+				*eval = bsdf_phong_ramp_get_color(bsdf->colors, cosp) * out;
 			}
 		}
 	}
 	return LABEL_REFLECT|LABEL_GLOSSY;
 }
 
+#endif /* __OSL__ */
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/closure/bsdf_reflection.h b/intern/cycles/kernel/closure/bsdf_reflection.h
index 303f4c9ce34..1d21614ecee 100644
--- a/intern/cycles/kernel/closure/bsdf_reflection.h
+++ b/intern/cycles/kernel/closure/bsdf_reflection.h
@@ -37,9 +37,9 @@ CCL_NAMESPACE_BEGIN
 
 /* REFLECTION */
 
-ccl_device int bsdf_reflection_setup(ShaderClosure *sc)
+ccl_device int bsdf_reflection_setup(MicrofacetBsdf *bsdf) /* takes the MicrofacetBsdf type rather than a dedicated struct; only the type tag is written */
 {
-	sc->type = CLOSURE_BSDF_REFLECTION_ID;
+	bsdf->type = CLOSURE_BSDF_REFLECTION_ID; /* the returned flags are SD_BSDF alone, without SD_BSDF_HAS_EVAL */
 	return SD_BSDF;
 }
 
@@ -55,7 +55,8 @@ ccl_device float3 bsdf_reflection_eval_transmit(const ShaderClosure *sc, const f
 
 ccl_device int bsdf_reflection_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-	float3 N = sc->N;
+	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
+	float3 N = bsdf->N;
 
 	// only one direction is possible
 	float cosNO = dot(N, I);
diff --git a/intern/cycles/kernel/closure/bsdf_refraction.h b/intern/cycles/kernel/closure/bsdf_refraction.h
index c78a4b67134..050a4e76fa9 100644
--- a/intern/cycles/kernel/closure/bsdf_refraction.h
+++ b/intern/cycles/kernel/closure/bsdf_refraction.h
@@ -37,9 +37,9 @@ CCL_NAMESPACE_BEGIN
 
 /* REFRACTION */
 
-ccl_device int bsdf_refraction_setup(ShaderClosure *sc)
+ccl_device int bsdf_refraction_setup(MicrofacetBsdf *bsdf) /* takes the MicrofacetBsdf type rather than a dedicated struct; bsdf->ior is read later by bsdf_refraction_sample() */
 {
-	sc->type = CLOSURE_BSDF_REFRACTION_ID;
+	bsdf->type = CLOSURE_BSDF_REFRACTION_ID; /* the returned flags are SD_BSDF alone, without SD_BSDF_HAS_EVAL */
 	return SD_BSDF;
 }
 
@@ -55,8 +55,9 @@ ccl_device float3 bsdf_refraction_eval_transmit(const ShaderClosure *sc, const f
 
 ccl_device int bsdf_refraction_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-	float m_eta = sc->data0;
-	float3 N = sc->N;
+	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
+	float m_eta = bsdf->ior;
+	float3 N = bsdf->N;
 
 	float3 R, T;
 #ifdef __RAY_DIFFERENTIALS__
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index e5b6ab93a64..28e775bcbc8 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -35,17 +35,35 @@
 
 CCL_NAMESPACE_BEGIN
 
+typedef ccl_addr_space struct ToonBsdf { /* typed view of a ShaderClosure for the diffuse and glossy toon functions in this header */
+	SHADER_CLOSURE_BASE;
+
+	float3 N; /* normal used for the cosine terms and for reflecting I */
+	float size; /* saturated by the setup functions; multiplied by M_PI_2_F to give max_angle */
+	float smooth; /* saturated by the setup functions; multiplied by M_PI_2_F to give the smooth angle */
+} ToonBsdf;
+
 /* DIFFUSE TOON */
 
-ccl_device int bsdf_diffuse_toon_setup(ShaderClosure *sc)
+ccl_device int bsdf_diffuse_toon_setup(ToonBsdf *bsdf) /* tags the closure as diffuse toon, saturates its two inputs, returns flags */
 {
-	sc->type = CLOSURE_BSDF_DIFFUSE_TOON_ID;
-	sc->data0 = saturate(sc->data0);
-	sc->data1 = saturate(sc->data1);
+	bsdf->type = CLOSURE_BSDF_DIFFUSE_TOON_ID;
+	bsdf->size = saturate(bsdf->size); /* replaces sc->data0 */
+	bsdf->smooth = saturate(bsdf->smooth); /* replaces sc->data1 */
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
+ccl_device bool bsdf_toon_merge(const ShaderClosure *a, const ShaderClosure *b) /* true when a and b agree on N, size and smooth */
+{
+	const ToonBsdf *bsdf_a = (const ToonBsdf*)a; /* unchecked casts: both arguments are assumed to be ToonBsdf */
+	const ToonBsdf *bsdf_b = (const ToonBsdf*)b;
+
+	return (isequal_float3(bsdf_a->N, bsdf_b->N)) &&
+	       (bsdf_a->size == bsdf_b->size) &&
+	       (bsdf_a->smooth == bsdf_b->smooth); /* exact float equality on both scalar fields */
+}
+
 ccl_device float3 bsdf_toon_get_intensity(float max_angle, float smooth, float angle)
 {
 	float is;
@@ -67,9 +85,10 @@ ccl_device float bsdf_toon_get_sample_angle(float max_angle, float smooth)
 
 ccl_device float3 bsdf_diffuse_toon_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	float max_angle = sc->data0*M_PI_2_F;
-	float smooth = sc->data1*M_PI_2_F;
-	float angle = safe_acosf(fmaxf(dot(sc->N, omega_in), 0.0f));
+	const ToonBsdf *bsdf = (const ToonBsdf*)sc;
+	float max_angle = bsdf->size*M_PI_2_F;
+	float smooth = bsdf->smooth*M_PI_2_F;
+	float angle = safe_acosf(fmaxf(dot(bsdf->N, omega_in), 0.0f));
 
 	float3 eval = bsdf_toon_get_intensity(max_angle, smooth, angle);
 	
@@ -90,21 +109,22 @@ ccl_device float3 bsdf_diffuse_toon_eval_transmit(const ShaderClosure *sc, const
 
 ccl_device int bsdf_diffuse_toon_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-	float max_angle = sc->data0*M_PI_2_F;
-	float smooth = sc->data1*M_PI_2_F;
+	const ToonBsdf *bsdf = (const ToonBsdf*)sc;
+	float max_angle = bsdf->size*M_PI_2_F;
+	float smooth = bsdf->smooth*M_PI_2_F;
 	float sample_angle = bsdf_toon_get_sample_angle(max_angle, smooth);
 	float angle = sample_angle*randu;
 
 	if(sample_angle > 0.0f) {
-		sample_uniform_cone(sc->N, sample_angle, randu, randv, omega_in, pdf);
+		sample_uniform_cone(bsdf->N, sample_angle, randu, randv, omega_in, pdf);
 
 		if(dot(Ng, *omega_in) > 0.0f) {
 			*eval = *pdf * bsdf_toon_get_intensity(max_angle, smooth, angle);
 
 #ifdef __RAY_DIFFERENTIALS__
 			// TODO: find a better approximation for the bounce
-			*domega_in_dx = (2.0f * dot(sc->N, dIdx)) * sc->N - dIdx;
-			*domega_in_dy = (2.0f * dot(sc->N, dIdy)) * sc->N - dIdy;
+			*domega_in_dx = (2.0f * dot(bsdf->N, dIdx)) * bsdf->N - dIdx;
+			*domega_in_dy = (2.0f * dot(bsdf->N, dIdy)) * bsdf->N - dIdy;
 #endif
 		}
 		else
@@ -117,25 +137,26 @@ ccl_device int bsdf_diffuse_toon_sample(const ShaderClosure *sc, float3 Ng, floa
 
 /* GLOSSY TOON */
 
-ccl_device int bsdf_glossy_toon_setup(ShaderClosure *sc)
+ccl_device int bsdf_glossy_toon_setup(ToonBsdf *bsdf) /* tags the closure as glossy toon, saturates its two inputs, returns flags */
 {
-	sc->type = CLOSURE_BSDF_GLOSSY_TOON_ID;
-	sc->data0 = saturate(sc->data0);
-	sc->data1 = saturate(sc->data1);
+	bsdf->type = CLOSURE_BSDF_GLOSSY_TOON_ID;
+	bsdf->size = saturate(bsdf->size); /* replaces sc->data0 */
+	bsdf->smooth = saturate(bsdf->smooth); /* replaces sc->data1 */
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device float3 bsdf_glossy_toon_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	float max_angle = sc->data0*M_PI_2_F;
-	float smooth = sc->data1*M_PI_2_F;
-	float cosNI = dot(sc->N, omega_in);
-	float cosNO = dot(sc->N, I);
+	const ToonBsdf *bsdf = (const ToonBsdf*)sc;
+	float max_angle = bsdf->size*M_PI_2_F;
+	float smooth = bsdf->smooth*M_PI_2_F;
+	float cosNI = dot(bsdf->N, omega_in);
+	float cosNO = dot(bsdf->N, I);
 	
 	if(cosNI > 0 && cosNO > 0) {
 		/* reflect the view vector */
-		float3 R = (2 * cosNO) * sc->N - I;
+		float3 R = (2 * cosNO) * bsdf->N - I;
 		float cosRI = dot(R, omega_in);
 
 		float angle = safe_acosf(fmaxf(cosRI, 0.0f));
@@ -157,13 +178,14 @@ ccl_device float3 bsdf_glossy_toon_eval_transmit(const ShaderClosure *sc, const
 
 ccl_device int bsdf_glossy_toon_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-	float max_angle = sc->data0*M_PI_2_F;
-	float smooth = sc->data1*M_PI_2_F;
-	float cosNO = dot(sc->N, I);
+	const ToonBsdf *bsdf = (const ToonBsdf*)sc;
+	float max_angle = bsdf->size*M_PI_2_F;
+	float smooth = bsdf->smooth*M_PI_2_F;
+	float cosNO = dot(bsdf->N, I);
 	
 	if(cosNO > 0) {
 		/* reflect the view vector */
-		float3 R = (2 * cosNO) * sc->N - I;
+		float3 R = (2 * cosNO) * bsdf->N - I;
 
 		float sample_angle = bsdf_toon_get_sample_angle(max_angle, smooth);
 		float angle = sample_angle*randu;
@@ -171,15 +193,15 @@ ccl_device int bsdf_glossy_toon_sample(const ShaderClosure *sc, float3 Ng, float
 		sample_uniform_cone(R, sample_angle, randu, randv, omega_in, pdf);
 
 		if(dot(Ng, *omega_in) > 0.0f) {
-			float cosNI = dot(sc->N, *omega_in);
+			float cosNI = dot(bsdf->N, *omega_in);
 
 			/* make sure the direction we chose is still in the right hemisphere */
 			if(cosNI > 0) {
 				*eval = *pdf * bsdf_toon_get_intensity(max_angle, smooth, angle);
 
 #ifdef __RAY_DIFFERENTIALS__
-				*domega_in_dx = (2 * dot(sc->N, dIdx)) * sc->N - dIdx;
-				*domega_in_dy = (2 * dot(sc->N, dIdy)) * sc->N - dIdy;
+				*domega_in_dx = (2 * dot(bsdf->N, dIdx)) * bsdf->N - dIdx;
+				*domega_in_dy = (2 * dot(bsdf->N, dIdy)) * bsdf->N - dIdy;
 #endif
 			}
 			else
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index c24720cefbe..35c95768b69 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -19,6 +19,17 @@
 
 CCL_NAMESPACE_BEGIN
 
+typedef ccl_addr_space struct Bssrdf {
+	SHADER_CLOSURE_BASE;
+	/* BSSRDF parameters; the profile functions in this file cast ShaderClosure to this type */
+	float radius;       /* bssrdf_setup() reverts to a diffuse BSDF when below BSSRDF_MIN_RADIUS */
+	float sharpness;    /* read by the cubic profile; saturated in bssrdf_setup() */
+	float d;            /* written by bssrdf_burley_setup(), read by the burley profile */
+	float texture_blur; /* saturated in bssrdf_setup() */
+	float albedo;       /* surface albedo, read by bssrdf_burley_setup() */
+	float3 N;           /* copied into the diffuse closure by the small-radius fallback */
+} Bssrdf;
+
 /* Planar Truncated Gaussian
  *
  * Note how this is different from the typical gaussian, this one integrates
@@ -28,11 +39,12 @@ CCL_NAMESPACE_BEGIN
 /* paper suggests 1/12.46 which is much too small, suspect it's *12.46 */
 #define GAUSS_TRUNCATE 12.46f
 
-ccl_device float bssrdf_gaussian_eval(ShaderClosure *sc, float r)
+ccl_device float bssrdf_gaussian_eval(const ShaderClosure *sc, float r)
 {
 	/* integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) from 0 to Rm
 	 * = 1 - exp(-Rm*Rm/(2*v)) */
-	const float v = sc->data0*sc->data0*(0.25f*0.25f);
+	const Bssrdf *bssrdf = (const Bssrdf*)sc;
+	const float v = bssrdf->radius*bssrdf->radius*(0.25f*0.25f);
 	const float Rm = sqrtf(v*GAUSS_TRUNCATE);
 
 	if(r >= Rm)
@@ -41,7 +53,7 @@ ccl_device float bssrdf_gaussian_eval(ShaderClosure *sc, float r)
 	return expf(-r*r/(2.0f*v))/(2.0f*M_PI_F*v);
 }
 
-ccl_device float bssrdf_gaussian_pdf(ShaderClosure *sc, float r)
+ccl_device float bssrdf_gaussian_pdf(const ShaderClosure *sc, float r)
 {
 	/* 1.0 - expf(-Rm*Rm/(2*v)) simplified */
 	const float area_truncated = 1.0f - expf(-0.5f*GAUSS_TRUNCATE);
@@ -49,12 +61,12 @@ ccl_device float bssrdf_gaussian_pdf(ShaderClosure *sc, float r)
 	return bssrdf_gaussian_eval(sc, r) * (1.0f/(area_truncated));
 }
 
-ccl_device void bssrdf_gaussian_sample(ShaderClosure *sc, float xi, float *r, float *h)
+ccl_device void bssrdf_gaussian_sample(const ShaderClosure *sc, float xi, float *r, float *h)
 {
 	/* xi = integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) = -exp(-r^2/(2*v))
 	 * r = sqrt(-2*v*logf(xi)) */
-
-	const float v = sc->data0*sc->data0*(0.25f*0.25f);
+	const Bssrdf *bssrdf = (const Bssrdf*)sc;
+	const float v = bssrdf->radius*bssrdf->radius*(0.25f*0.25f);
 	const float Rm = sqrtf(v*GAUSS_TRUNCATE);
 
 	/* 1.0 - expf(-Rm*Rm/(2*v)) simplified */
@@ -75,12 +87,13 @@ ccl_device void bssrdf_gaussian_sample(ShaderClosure *sc, float xi, float *r, fl
  * far as I can tell has no closed form solution. So we get an iterative solution
  * instead with newton-raphson. */
 
-ccl_device float bssrdf_cubic_eval(ShaderClosure *sc, float r)
+ccl_device float bssrdf_cubic_eval(const ShaderClosure *sc, float r)
 {
-	const float sharpness = sc->T.x;
+	const Bssrdf *bssrdf = (const Bssrdf*)sc;
+	const float sharpness = bssrdf->sharpness;
 
 	if(sharpness == 0.0f) {
-		const float Rm = sc->data0;
+		const float Rm = bssrdf->radius;
 
 		if(r >= Rm)
 			return 0.0f;
@@ -94,7 +107,7 @@ ccl_device float bssrdf_cubic_eval(ShaderClosure *sc, float r)
 
 	}
 	else {
-		float Rm = sc->data0*(1.0f + sharpness);
+		float Rm = bssrdf->radius*(1.0f + sharpness);
 
 		if(r >= Rm)
 			return 0.0f;
@@ -122,13 +135,13 @@ ccl_device float bssrdf_cubic_eval(ShaderClosure *sc, float r)
 	}
 }
 
-ccl_device float bssrdf_cubic_pdf(ShaderClosure *sc, float r)
+ccl_device float bssrdf_cubic_pdf(const ShaderClosure *sc, float r)
 {
 	return bssrdf_cubic_eval(sc, r);
 }
 
 /* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */
-ccl_device float bssrdf_cubic_quintic_root_find(float xi)
+ccl_device_inline float bssrdf_cubic_quintic_root_find(float xi)
 {
 	/* newton-raphson iteration, usually succeeds in 2-4 iterations, except
 	 * outside 0.02 ... 0.98 where it can go up to 10, so overall performance
@@ -155,12 +168,13 @@ ccl_device float bssrdf_cubic_quintic_root_find(float xi)
 	return x;
 }
 
-ccl_device void bssrdf_cubic_sample(ShaderClosure *sc, float xi, float *r, float *h)
+ccl_device void bssrdf_cubic_sample(const ShaderClosure *sc, float xi, float *r, float *h)
 {
-	float Rm = sc->data0;
+	const Bssrdf *bssrdf = (const Bssrdf*)sc;
+	const float sharpness = bssrdf->sharpness;
+	float Rm = bssrdf->radius;
 	float r_ = bssrdf_cubic_quintic_root_find(xi);
 
-	const float sharpness = sc->T.x;
 	if(sharpness != 0.0f) {
 		r_ = powf(r_, 1.0f + sharpness);
 		Rm *= (1.0f + sharpness);
@@ -198,21 +212,22 @@ ccl_device_inline float bssrdf_burley_compatible_mfp(float r)
 	return 0.25f * M_1_PI_F * r;
 }
 
-ccl_device void bssrdf_burley_setup(ShaderClosure *sc)
+ccl_device void bssrdf_burley_setup(Bssrdf *bssrdf)
 {
 	/* Mean free path length. */
-	const float l = bssrdf_burley_compatible_mfp(sc->data0);
+	const float l = bssrdf_burley_compatible_mfp(bssrdf->radius);
 	/* Surface albedo. */
-	const float A = sc->data2;
+	const float A = bssrdf->albedo;
 	const float s = bssrdf_burley_fitting(A);
 	const float d = l / s;
 
-	sc->custom1 = d;
+	bssrdf->d = d;
 }
 
-ccl_device float bssrdf_burley_eval(ShaderClosure *sc, float r)
+ccl_device float bssrdf_burley_eval(const ShaderClosure *sc, float r)
 {
-	const float d = sc->custom1;
+	const Bssrdf *bssrdf = (const Bssrdf*)sc;
+	const float d = bssrdf->d;
 	const float Rm = BURLEY_TRUNCATE * d;
 
 	if(r >= Rm)
@@ -231,7 +246,7 @@ ccl_device float bssrdf_burley_eval(ShaderClosure *sc, float r)
 	return (exp_r_d + exp_r_3_d) / (4.0f*d);
 }
 
-ccl_device float bssrdf_burley_pdf(ShaderClosure *sc, float r)
+ccl_device float bssrdf_burley_pdf(const ShaderClosure *sc, float r)
 {
 	return bssrdf_burley_eval(sc, r) * (1.0f/BURLEY_TRUNCATE_CDF);
 }
@@ -240,7 +255,7 @@ ccl_device float bssrdf_burley_pdf(ShaderClosure *sc, float r)
  * Returns scaled radius, meaning the result is to be scaled up by d.
  * Since there's no closed form solution we do Newton-Raphson method to find it.
  */
-ccl_device float bssrdf_burley_root_find(float xi)
+ccl_device_inline float bssrdf_burley_root_find(float xi)
 {
 	const float tolerance = 1e-6f;
 	const int max_iteration_count = 10;
@@ -276,12 +291,13 @@ ccl_device float bssrdf_burley_root_find(float xi)
 	return r;
 }
 
-ccl_device void bssrdf_burley_sample(ShaderClosure *sc,
+ccl_device void bssrdf_burley_sample(const ShaderClosure *sc,
                                      float xi,
                                      float *r,
                                      float *h)
 {
-	const float d = sc->custom1;
+	const Bssrdf *bssrdf = (const Bssrdf*)sc;
+	const float d = bssrdf->d;
 	const float Rm = BURLEY_TRUNCATE * d;
 	const float r_ = bssrdf_burley_root_find(xi * BURLEY_TRUNCATE_CDF) * d;
 
@@ -295,26 +311,29 @@ ccl_device void bssrdf_burley_sample(ShaderClosure *sc,
  *
  * Samples distributed over disk with no falloff, for reference. */
 
-ccl_device float bssrdf_none_eval(ShaderClosure *sc, float r)
+ccl_device float bssrdf_none_eval(const ShaderClosure *sc, float r)
 {
-	const float Rm = sc->data0;
+	const Bssrdf *bssrdf = (const Bssrdf*)sc;
+	const float Rm = bssrdf->radius;
 	return (r < Rm)? 1.0f: 0.0f;
 }
 
-ccl_device float bssrdf_none_pdf(ShaderClosure *sc, float r)
+ccl_device float bssrdf_none_pdf(const ShaderClosure *sc, float r)
 {
 	/* integrate (2*pi*r)/(pi*Rm*Rm) from 0 to Rm = 1 */
-	const float Rm = sc->data0;
+	const Bssrdf *bssrdf = (const Bssrdf*)sc;
+	const float Rm = bssrdf->radius;
 	const float area = (M_PI_F*Rm*Rm);
 
 	return bssrdf_none_eval(sc, r) / area;
 }
 
-ccl_device void bssrdf_none_sample(ShaderClosure *sc, float xi, float *r, float *h)
+ccl_device void bssrdf_none_sample(const ShaderClosure *sc, float xi, float *r, float *h)
 {
 	/* xi = integrate (2*pi*r)/(pi*Rm*Rm) = r^2/Rm^2
 	 * r = sqrt(xi)*Rm */
-	const float Rm = sc->data0;
+	const Bssrdf *bssrdf = (const Bssrdf*)sc;
+	const float Rm = bssrdf->radius;
 	const float r_ = sqrtf(xi)*Rm;
 
 	*r = r_;
@@ -325,30 +344,42 @@ ccl_device void bssrdf_none_sample(ShaderClosure *sc, float xi, float *r, float
 
 /* Generic */
 
-ccl_device int bssrdf_setup(ShaderClosure *sc, ClosureType type)
+ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight)
+{
+	Bssrdf *closure = (Bssrdf*)closure_alloc(sd, sizeof(Bssrdf), CLOSURE_NONE_ID, weight);
+	if(closure == NULL) {
+		return NULL;
+	}
+	/* the sample weight is stored even when the closure is rejected below */
+	const float abs_weight = fabsf(average(weight));
+	closure->sample_weight = abs_weight;
+	return (abs_weight >= CLOSURE_WEIGHT_CUTOFF) ? closure : NULL;
+}
+
+ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type)
 {
-	if(sc->data0 < BSSRDF_MIN_RADIUS) {
+	if(bssrdf->radius < BSSRDF_MIN_RADIUS) {
 		/* revert to diffuse BSDF if radius too small */
-		sc->data0 = 0.0f;
-		sc->data1 = 0.0f;
-		int flag = bsdf_diffuse_setup(sc);
-		sc->type = CLOSURE_BSDF_BSSRDF_ID;
+		DiffuseBsdf *bsdf = (DiffuseBsdf*)bssrdf; /* view the same closure storage as a diffuse BSDF */
+		bsdf->N = bssrdf->N; /* carry the normal over to the diffuse view */
+		int flag = bsdf_diffuse_setup(bsdf);
+		bsdf->type = CLOSURE_BSDF_BSSRDF_ID; /* assigned after bsdf_diffuse_setup(), so this is the final type */
 		return flag;
 	}
 	else {
-		sc->data1 = saturate(sc->data1); /* texture blur */
-		sc->T.x = saturate(sc->T.x); /* sharpness */
-		sc->type = type;
+		bssrdf->texture_blur = saturate(bssrdf->texture_blur);
+		bssrdf->sharpness = saturate(bssrdf->sharpness);
+		bssrdf->type = type;
 
 		if(type == CLOSURE_BSSRDF_BURLEY_ID) {
-			bssrdf_burley_setup(sc);
+			bssrdf_burley_setup(bssrdf); /* fills in bssrdf->d from radius and albedo */
 		}
 
 		return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF;
 	}
 }
 
-ccl_device void bssrdf_sample(ShaderClosure *sc, float xi, float *r, float *h)
+ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float *h)
 {
 	if(sc->type == CLOSURE_BSSRDF_CUBIC_ID)
 		bssrdf_cubic_sample(sc, xi, r, h);
@@ -358,7 +389,7 @@ ccl_device void bssrdf_sample(ShaderClosure *sc, float xi, float *r, float *h)
 		bssrdf_burley_sample(sc, xi, r, h);
 }
 
-ccl_device float bssrdf_pdf(ShaderClosure *sc, float r)
+ccl_device_inline float bssrdf_pdf(const ShaderClosure *sc, float r)
 {
 	if(sc->type == CLOSURE_BSSRDF_CUBIC_ID)
 		return bssrdf_cubic_pdf(sc, r);
diff --git a/intern/cycles/kernel/closure/volume.h b/intern/cycles/kernel/closure/volume.h
index 4d71ba50ec3..01e67c7c2fd 100644
--- a/intern/cycles/kernel/closure/volume.h
+++ b/intern/cycles/kernel/closure/volume.h
@@ -19,6 +19,12 @@
 
 CCL_NAMESPACE_BEGIN
 
+typedef ccl_addr_space struct HenyeyGreensteinVolume {
+	SHADER_CLOSURE_BASE;
+
+	float g; /* anisotropy; volume_henyey_greenstein_setup() clamps its magnitude to at most 1 - 1e-3 */
+} HenyeyGreensteinVolume;
+
 /* HENYEY-GREENSTEIN CLOSURE */
 
 /* Given cosine between rays, return probability density that a photon bounces
@@ -29,19 +35,28 @@ ccl_device float single_peaked_henyey_greenstein(float cos_theta, float g)
 	return ((1.0f - g * g) / safe_powf(1.0f + g * g - 2.0f * g * cos_theta, 1.5f)) * (M_1_PI_F * 0.25f);
 };
 
-ccl_device int volume_henyey_greenstein_setup(ShaderClosure *sc)
+ccl_device int volume_henyey_greenstein_setup(HenyeyGreensteinVolume *volume)
 {
-	sc->type = CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID;
+	volume->type = CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID;
 	
 	/* clamp anisotropy to avoid delta function */
-	sc->data0 = signf(sc->data0) * min(fabsf(sc->data0), 1.0f - 1e-3f);
+	volume->g = signf(volume->g) * min(fabsf(volume->g), 1.0f - 1e-3f);
 
 	return SD_SCATTER;
 }
 
+ccl_device bool volume_henyey_greenstein_merge(const ShaderClosure *a, const ShaderClosure *b)
+{
+	/* true only when both closures store exactly the same anisotropy g */
+	const float g_a = ((const HenyeyGreensteinVolume*)a)->g;
+	const float g_b = ((const HenyeyGreensteinVolume*)b)->g;
+	return (g_a == g_b);
+}
+
 ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderClosure *sc, const float3 I, float3 omega_in, float *pdf)
 {
-	float g = sc->data0;
+	const HenyeyGreensteinVolume *volume = (const HenyeyGreensteinVolume*)sc;
+	float g = volume->g;
 
 	/* note that I points towards the viewer */
 	if(fabsf(g) < 1e-3f) {
@@ -58,7 +73,8 @@ ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderClosure *sc, c
 ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc, float3 I, float3 dIdx, float3 dIdy, float randu, float randv,
 	float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-	float g = sc->data0;
+	const HenyeyGreensteinVolume *volume = (const HenyeyGreensteinVolume*)sc;
+	float g = volume->g;
 	float cos_phi, sin_phi, cos_theta;
 
 	/* match pdf for small g */
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index d2c7edb11ea..11548324e18 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -17,7 +17,9 @@
 
 #include "geom_attribute.h"
 #include "geom_object.h"
+#include "geom_patch.h"
 #include "geom_triangle.h"
+#include "geom_subd_triangle.h"
 #include "geom_triangle_intersect.h"
 #include "geom_motion_triangle.h"
 #include "geom_motion_curve.h"
diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h
index c7364e9edac..8604d30ad34 100644
--- a/intern/cycles/kernel/geom/geom_attribute.h
+++ b/intern/cycles/kernel/geom/geom_attribute.h
@@ -25,47 +25,76 @@ CCL_NAMESPACE_BEGIN
  * Lookup of attributes is different between OSL and SVM, as OSL is ustring
  * based while for SVM we use integer ids. */
 
+ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd);
+
+ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd)
+{
+	/* Choose the ATTR_PRIM_* slot: curves first (only when __HAIR__ is
+	 * defined), then subdivision patches, otherwise plain triangles. */
+#ifdef __HAIR__
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+		return ATTR_PRIM_CURVE;
+	}
+#endif
+	/* ~0 marks a primitive that does not belong to a subdivision patch */
+	if(subd_triangle_patch(kg, sd) != ~0) {
+		return ATTR_PRIM_SUBD;
+	}
+	return ATTR_PRIM_TRIANGLE;
+}
+
+ccl_device_inline AttributeDescriptor attribute_not_found()
+{
+	const AttributeDescriptor missing = {ATTR_ELEMENT_NONE, (NodeAttributeType)0, 0, ATTR_STD_NOT_FOUND};
+	return missing; /* sentinel handed back by find_attribute() on every failed lookup */
+}
+
 /* Find attribute based on ID */
 
-ccl_device_inline int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeElement *elem)
+ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id)
 {
-	if(ccl_fetch(sd, object) == PRIM_NONE)
-		return (int)ATTR_STD_NOT_FOUND;
+	if(ccl_fetch(sd, object) == OBJECT_NONE) { /* object ids are compared against OBJECT_NONE, not PRIM_NONE */
+		return attribute_not_found();
+	}
 
 	/* for SVM, find attribute by unique id */
 	uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride;
-#ifdef __HAIR__
-	attr_offset = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
-#endif
+	attr_offset += attribute_primitive_type(kg, sd); /* pick the curve/subd/triangle slot; entries are ATTR_PRIM_TYPES apart */
 	uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 	
 	while(attr_map.x != id) {
 		if(UNLIKELY(attr_map.x == ATTR_STD_NONE)) {
-			return ATTR_STD_NOT_FOUND;
+			return attribute_not_found(); /* reached the end marker without a match */
 		}
 		attr_offset += ATTR_PRIM_TYPES;
 		attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 	}
 
-	*elem = (AttributeElement)attr_map.y;
+	AttributeDescriptor desc;
+	desc.element = (AttributeElement)attr_map.y;
 	
-	if(ccl_fetch(sd, prim) == PRIM_NONE && (AttributeElement)attr_map.y != ATTR_ELEMENT_MESH)
-		return ATTR_STD_NOT_FOUND;
+	if(ccl_fetch(sd, prim) == PRIM_NONE && desc.element != ATTR_ELEMENT_MESH) { /* without a primitive only mesh-level data is usable */
+		return attribute_not_found();
+	}
 
 	/* return result */
-	return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z;
+	desc.offset = (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z;
+	desc.type = (NodeAttributeType)(attr_map.w & 0xff); /* low 8 bits of w */
+	desc.flags = (AttributeFlag)(attr_map.w >> 8); /* remaining upper bits of w */
+
+	return desc;
 }
 
 /* Transform matrix attribute on meshes */
 
-ccl_device Transform primitive_attribute_matrix(KernelGlobals *kg, const ShaderData *sd, int offset)
+ccl_device Transform primitive_attribute_matrix(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc)
 {
 	Transform tfm;
 
-	tfm.x = kernel_tex_fetch(__attributes_float3, offset + 0);
-	tfm.y = kernel_tex_fetch(__attributes_float3, offset + 1);
-	tfm.z = kernel_tex_fetch(__attributes_float3, offset + 2);
-	tfm.w = kernel_tex_fetch(__attributes_float3, offset + 3);
+	tfm.x = kernel_tex_fetch(__attributes_float3, desc.offset + 0);
+	tfm.y = kernel_tex_fetch(__attributes_float3, desc.offset + 1);
+	tfm.z = kernel_tex_fetch(__attributes_float3, desc.offset + 2);
+	tfm.w = kernel_tex_fetch(__attributes_float3, desc.offset + 3);
 
 	return tfm;
 }
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index 292e1bfca0e..aa9cd295452 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -24,23 +24,23 @@ CCL_NAMESPACE_BEGIN
 
 /* Reading attributes on various curve elements */
 
-ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
+ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
 {
-	if(elem == ATTR_ELEMENT_CURVE) {
+	if(desc.element == ATTR_ELEMENT_CURVE) {
 #ifdef __RAY_DIFFERENTIALS__
 		if(dx) *dx = 0.0f;
 		if(dy) *dy = 0.0f;
 #endif
 
-		return kernel_tex_fetch(__attributes_float, offset + ccl_fetch(sd, prim));
+		return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim));
 	}
-	else if(elem == ATTR_ELEMENT_CURVE_KEY || elem == ATTR_ELEMENT_CURVE_KEY_MOTION) {
+	else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) {
 		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
 		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 		int k1 = k0 + 1;
 
-		float f0 = kernel_tex_fetch(__attributes_float, offset + k0);
-		float f1 = kernel_tex_fetch(__attributes_float, offset + k1);
+		float f0 = kernel_tex_fetch(__attributes_float, desc.offset + k0);
+		float f1 = kernel_tex_fetch(__attributes_float, desc.offset + k1);
 
 #ifdef __RAY_DIFFERENTIALS__
 		if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
@@ -59,9 +59,9 @@ ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 	}
 }
 
-ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
+ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy)
 {
-	if(elem == ATTR_ELEMENT_CURVE) {
+	if(desc.element == ATTR_ELEMENT_CURVE) {
 		/* idea: we can't derive any useful differentials here, but for tiled
 		 * mipmap image caching it would be useful to avoid reading the highest
 		 * detail level always. maybe a derivative based on the hair density
@@ -71,15 +71,15 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + ccl_fetch(sd, prim)));
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim)));
 	}
-	else if(elem == ATTR_ELEMENT_CURVE_KEY || elem == ATTR_ELEMENT_CURVE_KEY_MOTION) {
+	else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) {
 		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
 		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 		int k1 = k0 + 1;
 
-		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + k0));
-		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + k1));
+		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k0));
+		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k1));
 
 #ifdef __RAY_DIFFERENTIALS__
 		if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h
index 2fb8e219884..dabba3fb1f0 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -387,6 +387,12 @@ ccl_device_inline void motion_triangle_intersect_subsurface(
 	float t, u, v;
 
 	if(ray_triangle_intersect_uv(P, dir, tmax, verts[2], verts[0], verts[1], &u, &v, &t)) {
+		for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) {
+			if(ss_isect->hits[i].t == t) {
+				return;
+			}
+		}
+
 		ss_isect->num_hits++;
 
 		int hit;
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index c0d15a95954..883c5dc100d 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -292,6 +292,18 @@ ccl_device_inline void object_motion_info(KernelGlobals *kg, int object, int *nu
 		*numverts = __float_as_int(f.w);
 }
 
+/* Offset to an objects patch map */
+
+ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object)
+{
+	if(object == OBJECT_NONE) {
+		return 0;
+	}
+	/* the offset is stored bit-cast in the x component of object row 11 */
+	const float4 row = kernel_tex_fetch(__objects, object*OBJECT_SIZE + 11);
+	return __float_as_uint(row.x);
+}
+
 /* Pass ID for shader */
 
 ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
diff --git a/intern/cycles/kernel/geom/geom_patch.h b/intern/cycles/kernel/geom/geom_patch.h
new file mode 100644
index 00000000000..6a0ff5a4a04
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_patch.h
@@ -0,0 +1,343 @@
+/*
+ * Based on code from OpenSubdiv released under this license:
+ *
+ * Copyright 2013 Pixar
+ *
+ * Licensed under the Apache License, Version 2.0 (the "Apache License")
+ * with the following modification; you may not use this file except in
+ * compliance with the Apache License and the following modification to it:
+ * Section 6. Trademarks. is deleted and replaced with:
+ *
+ * 6. Trademarks. This License does not grant permission to use the trade
+ *   names, trademarks, service marks, or product names of the Licensor
+ *   and its affiliates, except as required to comply with Section 4(c) of
+ *   the License and to reproduce the content of the NOTICE file.
+ *
+ * You may obtain a copy of the Apache License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Apache License with the above modification is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the Apache License for the specific
+ * language governing permissions and limitations under the Apache License.
+ *
+ */
+
+CCL_NAMESPACE_BEGIN
+
+typedef struct PatchHandle {
+	int array_index, patch_index, vert_index; /* patch_map_find_patch() sets array_index to -1 when no patch is found */
+} PatchHandle;
+
+ccl_device_inline int patch_map_resolve_quadrant(float median, float *u, float *v)
+{
+	/* Classify (u, v) against median on both axes and make each coordinate
+	 * that is not below median relative to it.
+	 *
+	 * Returned quadrant:
+	 *   0: u below median, v below median
+	 *   1: u below median, v not below
+	 *   2: u not below,    v not below
+	 *   3: u not below,    v below median */
+	const bool u_below = (*u < median);
+	const bool v_below = (*v < median);
+	if(!u_below) {
+		*u -= median;
+	}
+	if(!v_below) {
+		*v -= median;
+	}
+
+	if(u_below) {
+		return v_below ? 0 : 1;
+	}
+	else {
+		return v_below ? 3 : 2;
+	}
+}
+
+/* retrieve PatchHandle from patch coords */
+
+ccl_device_inline PatchHandle patch_map_find_patch(KernelGlobals *kg, int object, int patch, float u, float v)
+{
+	/* Walk the quadrant tree stored in __patches, starting at the node derived
+	 * from the object's patch map offset and the patch index, down to the leaf
+	 * containing (u, v). Returns that leaf's handle, or a handle with
+	 * array_index == -1 when (u, v) lies in a hole or no leaf is reached.
+	 * All fields start at -1 so failure paths never return indeterminate values. */
+	PatchHandle handle = {-1, -1, -1};
+
+	kernel_assert((u >= 0.0f) && (u <= 1.0f) && (v >= 0.0f) && (v <= 1.0f));
+
+	int node = (object_patch_map_offset(kg, object) + patch)/2;
+	float median = 0.5f;
+
+	for(int depth = 0; depth < 0xff; depth++) {
+		/* u and v are rewritten relative to the chosen quadrant */
+		int quadrant = patch_map_resolve_quadrant(median, &u, &v);
+		kernel_assert(quadrant >= 0);
+
+		uint child = kernel_tex_fetch(__patches, node + quadrant);
+
+		/* is the quadrant a hole? */
+		if(!(child & PATCH_MAP_NODE_IS_SET)) {
+			return handle;
+		}
+
+		uint index = child & PATCH_MAP_NODE_INDEX_MASK;
+
+		if(child & PATCH_MAP_NODE_IS_LEAF) {
+			handle.array_index = kernel_tex_fetch(__patches, index + 0);
+			handle.patch_index = kernel_tex_fetch(__patches, index + 1);
+			handle.vert_index = kernel_tex_fetch(__patches, index + 2);
+			return handle;
+		}
+
+		/* descend: each level halves the quadrant size */
+		node = index;
+		median *= 0.5f;
+	}
+
+	/* no leaf found */
+	kernel_assert(0);
+	return handle;
+}
+
+ccl_device_inline void patch_eval_bspline_weights(float t, float *point, float *deriv)
+{
+	/* Fills point[0..3] with the four uniform cubic B-spline basis values at
+	 * t, and deriv[0..3] with their first derivatives with respect to t. */
+	const float sixth = 1.0f / 6.0f;
+	const float tt = t * t;
+	const float ttt = t * tt;
+	/* basis values */
+	point[0] = sixth * (1.0f - 3.0f*(t - tt) - ttt);
+	point[1] = sixth * (4.0f - 6.0f*tt + 3.0f*ttt);
+	point[2] = sixth * (1.0f + 3.0f*(t + tt - ttt));
+	point[3] = sixth * ttt;
+
+	/* first derivatives */
+	deriv[0] = -0.5f*tt + t - 0.5f;
+	deriv[1] =  1.5f*tt - 2.0f*t;
+	deriv[2] = -1.5f*tt + t + 0.5f;
+	deriv[3] =  0.5f*tt;
+}
+
+ccl_device_inline void patch_eval_adjust_boundary_weights(uint bits, float *s, float *t)
+{
+	/* Bits 8..11 of the patch param flag boundary edges. For every flagged
+	 * edge the outermost weight is added twice to its neighbour, subtracted
+	 * once from the weight after that, and then cleared. */
+	const int edges = (int)((bits >> 8) & 0xf);
+
+	if(edges & 1) {
+		t[1] += 2*t[0];
+		t[2] -= t[0];
+		t[0] = 0;
+	}
+	if(edges & 2) {
+		s[2] += 2*s[3];
+		s[1] -= s[3];
+		s[3] = 0;
+	}
+	if(edges & 4) {
+		t[2] += 2*t[3];
+		t[1] -= t[3];
+		t[3] = 0;
+	}
+	if(edges & 8) {
+		s[1] += 2*s[0];
+		s[2] -= s[0];
+		s[0] = 0;
+	}
+}
+
+ccl_device_inline int patch_eval_depth(uint patch_bits)
+{
+	return (int)(patch_bits & 0xf); /* depth is stored in the low 4 bits of the patch param */
+}
+
+ccl_device_inline float patch_eval_param_fraction(uint patch_bits)
+{
+	/* Returns 1 / 2^depth, or 1 / 2^(depth - 1) when the non-quad-root flag
+	 * (bit 4 of the patch param) is set. patch_eval_normalize_coords() uses
+	 * the result as the size of one sub-patch cell. */
+	const bool non_quad_root = (patch_bits >> 4) & 0x1;
+	const int depth = patch_eval_depth(patch_bits);
+
+	/* a non-quad root counts one level less */
+	const int level = non_quad_root ? (depth - 1) : depth;
+	return 1.0f / (float)(1 << level);
+}
+
+ccl_device_inline void patch_eval_normalize_coords(uint patch_bits, float *u, float *v)
+{
+	/* Rescale (u, v) relative to the sub-patch whose 10-bit cell
+	 * coordinates are packed in bits 22..31 (u) and 12..21 (v). */
+	const float cell_size = patch_eval_param_fraction(patch_bits);
+	const int cell_u = (patch_bits >> 22) & 0x3ff;
+	const int cell_v = (patch_bits >> 12) & 0x3ff;
+
+	/* top left corner */
+	const float corner_u = (float)cell_u*cell_size;
+	const float corner_v = (float)cell_v*cell_size;
+
+	*u = (*u - corner_u) / cell_size;
+	*v = (*v - corner_v) / cell_size;
+}
+
+/* retrieve patch control indices */
+
+ccl_device_inline int patch_eval_indices(KernelGlobals *kg, const PatchHandle *handle, int channel,
+                                         int indices[PATCH_MAX_CONTROL_VERTS])
+{
+	/* XXX: regular patches only, so the count is always 16; channel is unused */
+	const int num_verts = 16;
+	const int first = kernel_tex_fetch(__patches, handle->array_index + 2) + handle->vert_index;
+
+	for(int k = 0; k < num_verts; k++) {
+		indices[k] = kernel_tex_fetch(__patches, first + k);
+	}
+	return num_verts;
+}
+
+/* evaluate patch basis functions */
+
+ccl_device_inline void patch_eval_basis(KernelGlobals *kg, const PatchHandle *handle, float u, float v,
+                                float weights[PATCH_MAX_CONTROL_VERTS],
+                                float weights_du[PATCH_MAX_CONTROL_VERTS],
+                                float weights_dv[PATCH_MAX_CONTROL_VERTS])
+{
+	/* Writes 16 tensor-product B-spline weights for (u, v) together with
+	 * their u and v derivative weights, stored row-major (index 4*row + col).
+	 * XXX: regular patches only for now. */
+	const uint param = kernel_tex_fetch(__patches, handle->patch_index + 1); /* read patch param */
+
+	/* derivative scale: 2^depth, halved when the non-quad-root bit is set */
+	float deriv_scale = 1 << patch_eval_depth(param);
+	if((param >> 4) & 0x1) {
+		deriv_scale *= 0.5f;
+	}
+
+	/* make (u, v) relative to this sub-patch */
+	patch_eval_normalize_coords(param, &u, &v);
+
+	float s[4], t[4], ds[4], dt[4];
+	patch_eval_bspline_weights(u, s, ds);
+	patch_eval_bspline_weights(v, t, dt);
+
+	patch_eval_adjust_boundary_weights(param, s, t);
+	patch_eval_adjust_boundary_weights(param, ds, dt);
+
+	for(int i = 0; i < 16; i++) {
+		const int row = i >> 2, col = i & 3;
+		weights[i] = s[col] * t[row];
+		weights_du[i] = ds[col] * t[row] * deriv_scale;
+		weights_dv[i] = s[col] * dt[row] * deriv_scale;
+	}
+}
+
+/* generic function for evaluating indices and weights from patch coords */
+
+ccl_device_inline int patch_eval_control_verts(KernelGlobals *kg, int object, int patch, float u, float v, int channel,
+                                        int indices[PATCH_MAX_CONTROL_VERTS],
+                                        float weights[PATCH_MAX_CONTROL_VERTS],
+                                        float weights_du[PATCH_MAX_CONTROL_VERTS],
+                                        float weights_dv[PATCH_MAX_CONTROL_VERTS])
+{
+	/* Locate the sub-patch under (u, v), then gather its basis weights and
+	 * control vertex indices; returns the number of control vertices. */
+	PatchHandle found = patch_map_find_patch(kg, object, patch, u, v);
+	kernel_assert(found.array_index >= 0);
+
+	patch_eval_basis(kg, &found, u, v, weights, weights_du, weights_dv);
+	return patch_eval_indices(kg, &found, channel, indices);
+}
+
+/* functions for evaluating attributes on patches */
+
+ccl_device float patch_eval_float(KernelGlobals *kg, const ShaderData *sd, int offset,
+                                  int patch, float u, float v, int channel,
+                                  float *du, float* dv)
+{
+	/* Weighted sum of the float attribute at offset over the patch's control
+	 * vertices at (u, v); du and dv, when non-NULL, receive the derivative sums. */
+	int indices[PATCH_MAX_CONTROL_VERTS];
+	float weights[PATCH_MAX_CONTROL_VERTS];
+	float weights_du[PATCH_MAX_CONTROL_VERTS];
+	float weights_dv[PATCH_MAX_CONTROL_VERTS];
+	const int count = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel,
+	                                           indices, weights, weights_du, weights_dv);
+
+	float result = 0.0f;
+	if(du) *du = 0.0f;
+	if(dv) *dv = 0.0f;
+
+	for(int k = 0; k < count; k++) {
+		/* named so that it does not shadow the v parameter */
+		const float value = kernel_tex_fetch(__attributes_float, offset + indices[k]);
+		result += value * weights[k];
+		if(du) *du += value * weights_du[k];
+		if(dv) *dv += value * weights_dv[k];
+	}
+	return result;
+}
+
+ccl_device float3 patch_eval_float3(KernelGlobals *kg, const ShaderData *sd, int offset,
+                                    int patch, float u, float v, int channel,
+                                    float3 *du, float3 *dv)
+{
+	/* Weighted sum of the float3 attribute at offset over the patch's control
+	 * vertices at (u, v); du and dv, when non-NULL, receive the derivative sums. */
+	int indices[PATCH_MAX_CONTROL_VERTS];
+	float weights[PATCH_MAX_CONTROL_VERTS];
+	float weights_du[PATCH_MAX_CONTROL_VERTS];
+	float weights_dv[PATCH_MAX_CONTROL_VERTS];
+	const int count = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel,
+	                                           indices, weights, weights_du, weights_dv);
+
+	float3 result = make_float3(0.0f, 0.0f, 0.0f);
+	if(du) *du = make_float3(0.0f, 0.0f, 0.0f);
+	if(dv) *dv = make_float3(0.0f, 0.0f, 0.0f);
+
+	for(int k = 0; k < count; k++) {
+		/* named so that it does not shadow the v parameter */
+		const float3 value = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + indices[k]));
+		result += value * weights[k];
+		if(du) *du += value * weights_du[k];
+		if(dv) *dv += value * weights_dv[k];
+	}
+	return result;
+}
+
+ccl_device float3 patch_eval_uchar4(KernelGlobals *kg, const ShaderData *sd, int offset,
+                                    int patch, float u, float v, int channel,
+                                    float3 *du, float3 *dv)
+{
+	/* Weighted sum of the uchar4 attribute at offset, converted to float3 per
+	 * control vertex; du and dv, when non-NULL, receive the derivative sums. */
+	int indices[PATCH_MAX_CONTROL_VERTS];
+	float weights[PATCH_MAX_CONTROL_VERTS];
+	float weights_du[PATCH_MAX_CONTROL_VERTS];
+	float weights_dv[PATCH_MAX_CONTROL_VERTS];
+	const int count = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel,
+	                                           indices, weights, weights_du, weights_dv);
+
+	float3 result = make_float3(0.0f, 0.0f, 0.0f);
+	if(du) *du = make_float3(0.0f, 0.0f, 0.0f);
+	if(dv) *dv = make_float3(0.0f, 0.0f, 0.0f);
+
+	for(int k = 0; k < count; k++) {
+		/* named so that it does not shadow the v parameter */
+		const float3 value = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, offset + indices[k]));
+		result += value * weights[k];
+		if(du) *du += value * weights_du[k];
+		if(dv) *dv += value * weights_dv[k];
+	}
+	return result;
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index b1b1e919e00..4384c2093e9 100644
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -23,19 +23,25 @@ CCL_NAMESPACE_BEGIN
 
 /* Generic primitive attribute reading functions */
 
-ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
+ccl_device_inline float primitive_attribute_float(KernelGlobals *kg,
+                                                  const ShaderData *sd,
+                                                  const AttributeDescriptor desc,
+                                                  float *dx, float *dy)
 {
 	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
-		return triangle_attribute_float(kg, sd, elem, offset, dx, dy);
+		if(subd_triangle_patch(kg, sd) == ~0)
+			return triangle_attribute_float(kg, sd, desc, dx, dy);
+		else
+			return subd_triangle_attribute_float(kg, sd, desc, dx, dy);
 	}
 #ifdef __HAIR__
 	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
-		return curve_attribute_float(kg, sd, elem, offset, dx, dy);
+		return curve_attribute_float(kg, sd, desc, dx, dy);
 	}
 #endif
 #ifdef __VOLUME__
-	else if(ccl_fetch(sd, object) != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
-		return volume_attribute_float(kg, sd, elem, offset, dx, dy);
+	else if(ccl_fetch(sd, object) != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) {
+		return volume_attribute_float(kg, sd, desc, dx, dy);
 	}
 #endif
 	else {
@@ -45,19 +51,25 @@ ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *
 	}
 }
 
-ccl_device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
+ccl_device_inline float3 primitive_attribute_float3(KernelGlobals *kg,
+                                                    const ShaderData *sd,
+                                                    const AttributeDescriptor desc,
+                                                    float3 *dx, float3 *dy)
 {
 	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
-		return triangle_attribute_float3(kg, sd, elem, offset, dx, dy);
+		if(subd_triangle_patch(kg, sd) == ~0)
+			return triangle_attribute_float3(kg, sd, desc, dx, dy);
+		else
+			return subd_triangle_attribute_float3(kg, sd, desc, dx, dy);
 	}
 #ifdef __HAIR__
 	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
-		return curve_attribute_float3(kg, sd, elem, offset, dx, dy);
+		return curve_attribute_float3(kg, sd, desc, dx, dy);
 	}
 #endif
 #ifdef __VOLUME__
-	else if(ccl_fetch(sd, object) != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
-		return volume_attribute_float3(kg, sd, elem, offset, dx, dy);
+	else if(ccl_fetch(sd, object) != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) {
+		return volume_attribute_float3(kg, sd, desc, dx, dy);
 	}
 #endif
 	else {
@@ -69,15 +81,14 @@ ccl_device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData
 
 /* Default UV coordinate */
 
-ccl_device float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
+ccl_device_inline float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
 {
-	AttributeElement elem_uv;
-	int offset_uv = find_attribute(kg, sd, ATTR_STD_UV, &elem_uv);
+	const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_UV);
 
-	if(offset_uv == ATTR_STD_NOT_FOUND)
+	if(desc.offset == ATTR_STD_NOT_FOUND)
 		return make_float3(0.0f, 0.0f, 0.0f);
 
-	float3 uv = primitive_attribute_float3(kg, sd, elem_uv, offset_uv, NULL, NULL);
+	float3 uv = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 	uv.z = 1.0f;
 	return uv;
 }
@@ -87,15 +98,14 @@ ccl_device float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
 ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, int *face_id)
 {
 	/* storing ptex data as attributes is not memory efficient but simple for tests */
-	AttributeElement elem_face_id, elem_uv;
-	int offset_face_id = find_attribute(kg, sd, ATTR_STD_PTEX_FACE_ID, &elem_face_id);
-	int offset_uv = find_attribute(kg, sd, ATTR_STD_PTEX_UV, &elem_uv);
+	const AttributeDescriptor desc_face_id = find_attribute(kg, sd, ATTR_STD_PTEX_FACE_ID);
+	const AttributeDescriptor desc_uv = find_attribute(kg, sd, ATTR_STD_PTEX_UV);
 
-	if(offset_face_id == ATTR_STD_NOT_FOUND || offset_uv == ATTR_STD_NOT_FOUND)
+	if(desc_face_id.offset == ATTR_STD_NOT_FOUND || desc_uv.offset == ATTR_STD_NOT_FOUND)
 		return false;
 
-	float3 uv3 = primitive_attribute_float3(kg, sd, elem_uv, offset_uv, NULL, NULL);
-	float face_id_f = primitive_attribute_float(kg, sd, elem_face_id, offset_face_id, NULL, NULL);
+	float3 uv3 = primitive_attribute_float3(kg, sd, desc_uv, NULL, NULL);
+	float face_id_f = primitive_attribute_float(kg, sd, desc_face_id, NULL, NULL);
 
 	*uv = make_float2(uv3.x, uv3.y);
 	*face_id = (int)face_id_f;
@@ -117,11 +127,10 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 #endif
 
 	/* try to create spherical tangent from generated coordinates */
-	AttributeElement attr_elem;
-	int attr_offset = find_attribute(kg, sd, ATTR_STD_GENERATED, &attr_elem);
+	const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_GENERATED);
 
-	if(attr_offset != ATTR_STD_NOT_FOUND) {
-		float3 data = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL);
+	if(desc.offset != ATTR_STD_NOT_FOUND) {
+		float3 data = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 		data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f);
 		object_normal_transform(kg, sd, &data);
 		return cross(ccl_fetch(sd, N), normalize(cross(data, ccl_fetch(sd, N))));
@@ -138,7 +147,7 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 
 /* Motion vector for motion pass */
 
-ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
+ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 {
 	/* center position */
 	float3 center;
@@ -158,19 +167,18 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 	float3 motion_pre = center, motion_post = center;
 
 	/* deformation motion */
-	AttributeElement elem;
-	int offset = find_attribute(kg, sd, ATTR_STD_MOTION_VERTEX_POSITION, &elem);
+	AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_MOTION_VERTEX_POSITION);
 
-	if(offset != ATTR_STD_NOT_FOUND) {
+	if(desc.offset != ATTR_STD_NOT_FOUND) {
 		/* get motion info */
 		int numverts, numkeys;
 		object_motion_info(kg, ccl_fetch(sd, object), NULL, &numverts, &numkeys);
 
 		/* lookup attributes */
-		int offset_next = (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? offset + numverts: offset + numkeys;
+		motion_pre = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 
-		motion_pre = primitive_attribute_float3(kg, sd, elem, offset, NULL, NULL);
-		motion_post = primitive_attribute_float3(kg, sd, elem, offset_next, NULL, NULL);
+		desc.offset += (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys;
+		motion_post = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 
 #ifdef __HAIR__
 		if(is_curve_primitive && (ccl_fetch(sd, flag) & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h
new file mode 100644
index 00000000000..fccacf435f9
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_subd_triangle.h
@@ -0,0 +1,349 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Functions for retrieving attributes on triangles produced from subdivision meshes */
+
+CCL_NAMESPACE_BEGIN
+
+/* Patch index for triangle from __tri_patch; ~0 (i.e. -1) if sd has no primitive or it is not a subdivision triangle */
+ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd)
+{
+	if(ccl_fetch(sd, prim) == PRIM_NONE) return ~0;
+	return kernel_tex_fetch(__tri_patch, ccl_fetch(sd, prim));
+}
+
+/* UV coords of triangle within patch */
+
+ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg, const ShaderData *sd, float2 uv[3])
+{
+	/* __tri_patch_uv is looked up per vertex, so go through the triangle's vertex indices. */
+	const uint4 vert = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+	uv[0] = kernel_tex_fetch(__tri_patch_uv, vert.x);
+	uv[1] = kernel_tex_fetch(__tri_patch_uv, vert.y);
+	uv[2] = kernel_tex_fetch(__tri_patch_uv, vert.z);
+}
+
+/* Vertex indices of patch */
+
+ccl_device_inline uint4 subd_triangle_patch_indices(KernelGlobals *kg, int patch)
+{
+	/* The four vertex indices are the first four __patches entries of the patch. */
+	uint4 verts;
+	verts.x = kernel_tex_fetch(__patches, patch+0);
+	verts.y = kernel_tex_fetch(__patches, patch+1);
+	verts.z = kernel_tex_fetch(__patches, patch+2);
+	verts.w = kernel_tex_fetch(__patches, patch+3);
+
+	return verts;
+}
+
+/* Originating face for patch, read from the __patches entry at offset 4 of the patch */
+ccl_device_inline uint subd_triangle_patch_face(KernelGlobals *kg, int patch)
+{
+	const int face_slot = patch+4;
+	return kernel_tex_fetch(__patches, face_slot);
+}
+
+/* Number of corners on originating face: low 16 bits of the __patches entry at offset 5 */
+ccl_device_inline uint subd_triangle_patch_num_corners(KernelGlobals *kg, int patch)
+{
+	const uint packed = kernel_tex_fetch(__patches, patch+5);
+	return packed & 0xffff;
+}
+
+/* Indices of the four corners that are used by the patch */
+
+ccl_device_inline void subd_triangle_patch_corners(KernelGlobals *kg, int patch, int corners[4])
+{
+	/* Entries of the patch's __patches record read here:
+	 *   patch+5: corner count in the low 16 bits, corner number c in the high 16 bits
+	 *   patch+6: index of the first corner
+	 *   patch+7: index written to corners[2] for non-quad faces */
+	const uint packed = kernel_tex_fetch(__patches, patch+5);
+	const uint first_corner = kernel_tex_fetch(__patches, patch+6);
+	const uint ngon_corner = kernel_tex_fetch(__patches, patch+7);
+	const int num_corners = packed & 0xffff;
+
+	if(num_corners != 4) {
+		/* ngon: corner c, the following and preceding corners (mod() presumably wraps
+		 * them around the face - confirm), and the corner stored at patch+7 */
+		const int c = packed >> 16;
+		corners[0] = first_corner + c;
+		corners[1] = first_corner + mod(c+1, num_corners);
+		corners[2] = ngon_corner;
+		corners[3] = first_corner + mod(c-1, num_corners);
+		return;
+	}
+
+	/* quad: four consecutive corners */
+	corners[0] = first_corner;
+	corners[1] = first_corner+1;
+	corners[2] = first_corner+2;
+	corners[3] = first_corner+3;
+}
+
+/* Reading attributes on various subdivision triangle elements */
+
+ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
+{
+	int patch = subd_triangle_patch(kg, sd);
+	/* ATTR_SUBDIVIDED attributes go through patch_eval_float(); other elements are read from __attributes_float. */
+	if(desc.flags & ATTR_SUBDIVIDED) {
+		float2 uv[3];
+		subd_triangle_patch_uv(kg, sd, uv);
+		/* Patch uv offsets of triangle vertices 0 and 1 relative to vertex 2. */
+		float2 dpdu = uv[0] - uv[2];
+		float2 dpdv = uv[1] - uv[2];
+
+		/* p is [s, t]: the patch uv of the shading point, from the triangle's u, v weights */
+		float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2];
+		/* Attribute value at p, plus the two derivative outputs of patch_eval_float() (used as da/ds, da/dt). */
+		float a, dads, dadt;
+		a = patch_eval_float(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt);
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx || dy) {
+			float dsdu = dpdu.x;
+			float dtdu = dpdu.y;
+			float dsdv = dpdv.x;
+			float dtdv = dpdv.y;
+			/* Chain rule: da/dx = da/ds*ds/dx + da/dt*dt/dx with ds/dx = ds/du*du/dx + ds/dv*dv/dx (same for y). */
+			if(dx) {
+				float dudx = ccl_fetch(sd, du).dx;
+				float dvdx = ccl_fetch(sd, dv).dx;
+
+				float dsdx = dsdu*dudx + dsdv*dvdx;
+				float dtdx = dtdu*dudx + dtdv*dvdx;
+
+				*dx = dads*dsdx + dadt*dtdx;
+			}
+			if(dy) {
+				float dudy = ccl_fetch(sd, du).dy;
+				float dvdy = ccl_fetch(sd, dv).dy;
+
+				float dsdy = dsdu*dudy + dsdv*dvdy;
+				float dtdy = dtdu*dudy + dtdv*dvdy;
+
+				*dy = dads*dsdy + dadt*dtdy;
+			}
+		}
+#endif
+
+		return a;
+	}
+	else if(desc.element == ATTR_ELEMENT_FACE) {
+		if(dx) *dx = 0.0f;
+		if(dy) *dy = 0.0f;
+		/* One value per originating face of the patch, hence the zero derivatives above. */
+		return kernel_tex_fetch(__attributes_float, desc.offset + subd_triangle_patch_face(kg, patch));
+	}
+	else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
+		float2 uv[3];
+		subd_triangle_patch_uv(kg, sd, uv);
+		/* Attribute values at the four vertex indices of the patch. */
+		uint4 v = subd_triangle_patch_indices(kg, patch);
+
+		float f0 = kernel_tex_fetch(__attributes_float, desc.offset + v.x);
+		float f1 = kernel_tex_fetch(__attributes_float, desc.offset + v.y);
+		float f2 = kernel_tex_fetch(__attributes_float, desc.offset + v.z);
+		float f3 = kernel_tex_fetch(__attributes_float, desc.offset + v.w);
+		/* Non-quad face: move f1 and f3 halfway towards f0. NOTE(review): presumably the patch ends at edge midpoints - confirm. */
+		if(subd_triangle_patch_num_corners(kg, patch) != 4) {
+			f1 = (f1+f0)*0.5f;
+			f3 = (f3+f0)*0.5f;
+		}
+		/* Bilinear blend of f0..f3 at the patch uv of each of the three triangle vertices. */
+		float a = mix(mix(f0, f1, uv[0].x), mix(f3, f2, uv[0].x), uv[0].y);
+		float b = mix(mix(f0, f1, uv[1].x), mix(f3, f2, uv[1].x), uv[1].y);
+		float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
+		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+#endif
+		/* Weighted sum of the three triangle vertex values with weights u, v and 1-u-v. */
+		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+	}
+	else if(desc.element == ATTR_ELEMENT_CORNER) {
+		float2 uv[3];
+		subd_triangle_patch_uv(kg, sd, uv);
+		/* Attribute values at the four corner indices used by the patch. */
+		int corners[4];
+		subd_triangle_patch_corners(kg, patch, corners);
+
+		float f0 = kernel_tex_fetch(__attributes_float, corners[0] + desc.offset);
+		float f1 = kernel_tex_fetch(__attributes_float, corners[1] + desc.offset);
+		float f2 = kernel_tex_fetch(__attributes_float, corners[2] + desc.offset);
+		float f3 = kernel_tex_fetch(__attributes_float, corners[3] + desc.offset);
+		/* Non-quad face: move f1 and f3 halfway towards f0. NOTE(review): presumably the patch ends at edge midpoints - confirm. */
+		if(subd_triangle_patch_num_corners(kg, patch) != 4) {
+			f1 = (f1+f0)*0.5f;
+			f3 = (f3+f0)*0.5f;
+		}
+		/* Bilinear blend of f0..f3 at the patch uv of each of the three triangle vertices. */
+		float a = mix(mix(f0, f1, uv[0].x), mix(f3, f2, uv[0].x), uv[0].y);
+		float b = mix(mix(f0, f1, uv[1].x), mix(f3, f2, uv[1].x), uv[1].y);
+		float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
+		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+#endif
+		/* Weighted sum of the three triangle vertex values with weights u, v and 1-u-v. */
+		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+	}
+	else {
+		if(dx) *dx = 0.0f;
+		if(dy) *dy = 0.0f;
+		/* Any other element type is not handled here: zero value, zero derivatives. */
+		return 0.0f;
+	}
+}
+
+ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy)
+{
+	int patch = subd_triangle_patch(kg, sd);
+	/* ATTR_SUBDIVIDED attributes go through patch_eval_*(); other elements are read from the attribute tables. */
+	if(desc.flags & ATTR_SUBDIVIDED) {
+		float2 uv[3];
+		subd_triangle_patch_uv(kg, sd, uv);
+		/* Patch uv offsets of triangle vertices 0 and 1 relative to vertex 2. */
+		float2 dpdu = uv[0] - uv[2];
+		float2 dpdv = uv[1] - uv[2];
+
+		/* p is [s, t]: the patch uv of the shading point, from the triangle's u, v weights */
+		float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2];
+
+		float3 a, dads, dadt;
+		/* Byte colors use the uchar4 evaluator, everything else the float3 one. */
+		if(desc.element == ATTR_ELEMENT_CORNER_BYTE) {
+			a = patch_eval_uchar4(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt);
+		}
+		else {
+			a = patch_eval_float3(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt);
+		}
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx || dy) {
+			float dsdu = dpdu.x;
+			float dtdu = dpdu.y;
+			float dsdv = dpdv.x;
+			float dtdv = dpdv.y;
+			/* Chain rule: da/dx = da/ds*ds/dx + da/dt*dt/dx with ds/dx = ds/du*du/dx + ds/dv*dv/dx (same for y). */
+			if(dx) {
+				float dudx = ccl_fetch(sd, du).dx;
+				float dvdx = ccl_fetch(sd, dv).dx;
+
+				float dsdx = dsdu*dudx + dsdv*dvdx;
+				float dtdx = dtdu*dudx + dtdv*dvdx;
+
+				*dx = dads*dsdx + dadt*dtdx;
+			}
+			if(dy) {
+				float dudy = ccl_fetch(sd, du).dy;
+				float dvdy = ccl_fetch(sd, dv).dy;
+
+				float dsdy = dsdu*dudy + dsdv*dvdy;
+				float dtdy = dtdu*dudy + dtdv*dvdy;
+
+				*dy = dads*dsdy + dadt*dtdy;
+			}
+		}
+#endif
+
+		return a;
+	}
+	else if(desc.element == ATTR_ELEMENT_FACE) {
+		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
+		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
+		/* One value per originating face of the patch, hence the zero derivatives above. */
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + subd_triangle_patch_face(kg, patch)));
+	}
+	else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
+		float2 uv[3];
+		subd_triangle_patch_uv(kg, sd, uv);
+		/* Attribute values at the four vertex indices of the patch. */
+		uint4 v = subd_triangle_patch_indices(kg, patch);
+
+		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.x));
+		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.y));
+		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.z));
+		float3 f3 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.w));
+		/* Non-quad face: move f1 and f3 halfway towards f0. NOTE(review): presumably the patch ends at edge midpoints - confirm. */
+		if(subd_triangle_patch_num_corners(kg, patch) != 4) {
+			f1 = (f1+f0)*0.5f;
+			f3 = (f3+f0)*0.5f;
+		}
+		/* Bilinear blend of f0..f3 at the patch uv of each of the three triangle vertices. */
+		float3 a = mix(mix(f0, f1, uv[0].x), mix(f3, f2, uv[0].x), uv[0].y);
+		float3 b = mix(mix(f0, f1, uv[1].x), mix(f3, f2, uv[1].x), uv[1].y);
+		float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
+		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+#endif
+		/* Weighted sum of the three triangle vertex values with weights u, v and 1-u-v. */
+		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+	}
+	else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) {
+		float2 uv[3];
+		subd_triangle_patch_uv(kg, sd, uv);
+		/* Attribute values at the four corner indices used by the patch. */
+		int corners[4];
+		subd_triangle_patch_corners(kg, patch, corners);
+
+		float3 f0, f1, f2, f3;
+		/* Float corners come from __attributes_float3, byte colors from __attributes_uchar4 (converted to float). */
+		if(desc.element == ATTR_ELEMENT_CORNER) {
+			f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset));
+			f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset));
+			f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset));
+			f3 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset));
+		}
+		else {
+			f0 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, corners[0] + desc.offset));
+			f1 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, corners[1] + desc.offset));
+			f2 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, corners[2] + desc.offset));
+			f3 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, corners[3] + desc.offset));
+		}
+		/* Non-quad face: move f1 and f3 halfway towards f0. NOTE(review): presumably the patch ends at edge midpoints - confirm. */
+		if(subd_triangle_patch_num_corners(kg, patch) != 4) {
+			f1 = (f1+f0)*0.5f;
+			f3 = (f3+f0)*0.5f;
+		}
+		/* Bilinear blend of f0..f3 at the patch uv of each of the three triangle vertices. */
+		float3 a = mix(mix(f0, f1, uv[0].x), mix(f3, f2, uv[0].x), uv[0].y);
+		float3 b = mix(mix(f0, f1, uv[1].x), mix(f3, f2, uv[1].x), uv[1].y);
+		float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
+		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+#endif
+		/* Weighted sum of the three triangle vertex values with weights u, v and 1-u-v. */
+		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+	}
+	else {
+		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
+		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
+		/* Any other element type is not handled here: zero value, zero derivatives. */
+		return make_float3(0.0f, 0.0f, 0.0f);
+	}
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index 0c2351e1d1b..d3289d6572c 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -105,20 +105,20 @@ ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, ccl_addr_spa
 
 /* Reading attributes on various triangle elements */
 
-ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
+ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
 {
-	if(elem == ATTR_ELEMENT_FACE) {
+	if(desc.element == ATTR_ELEMENT_FACE) {
 		if(dx) *dx = 0.0f;
 		if(dy) *dy = 0.0f;
 
-		return kernel_tex_fetch(__attributes_float, offset + ccl_fetch(sd, prim));
+		return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim));
 	}
-	else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
+	else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
 		uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
 
-		float f0 = kernel_tex_fetch(__attributes_float, offset + tri_vindex.x);
-		float f1 = kernel_tex_fetch(__attributes_float, offset + tri_vindex.y);
-		float f2 = kernel_tex_fetch(__attributes_float, offset + tri_vindex.z);
+		float f0 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.x);
+		float f1 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.y);
+		float f2 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.z);
 
 #ifdef __RAY_DIFFERENTIALS__
 		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
@@ -127,8 +127,8 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s
 
 		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
 	}
-	else if(elem == ATTR_ELEMENT_CORNER) {
-		int tri = offset + ccl_fetch(sd, prim)*3;
+	else if(desc.element == ATTR_ELEMENT_CORNER) {
+		int tri = desc.offset + ccl_fetch(sd, prim)*3;
 		float f0 = kernel_tex_fetch(__attributes_float, tri + 0);
 		float f1 = kernel_tex_fetch(__attributes_float, tri + 1);
 		float f2 = kernel_tex_fetch(__attributes_float, tri + 2);
@@ -148,20 +148,20 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s
 	}
 }
 
-ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
+ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy)
 {
-	if(elem == ATTR_ELEMENT_FACE) {
+	if(desc.element == ATTR_ELEMENT_FACE) {
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + ccl_fetch(sd, prim)));
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim)));
 	}
-	else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
+	else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
 		uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
 
-		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x));
-		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y));
-		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z));
+		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x));
+		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y));
+		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z));
 
 #ifdef __RAY_DIFFERENTIALS__
 		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
@@ -170,11 +170,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 
 		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
 	}
-	else if(elem == ATTR_ELEMENT_CORNER || elem == ATTR_ELEMENT_CORNER_BYTE) {
-		int tri = offset + ccl_fetch(sd, prim)*3;
+	else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) {
+		int tri = desc.offset + ccl_fetch(sd, prim)*3;
 		float3 f0, f1, f2;
 
-		if(elem == ATTR_ELEMENT_CORNER) {
+		if(desc.element == ATTR_ELEMENT_CORNER) {
 			f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0));
 			f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1));
 			f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2));
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index caa6c9d9a5b..dd5328220ab 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -342,9 +342,16 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
 	float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
 	float3 qvec = cross(tvec, edge1);
 	float3 pvec = cross(D, edge2);
-	float rt = dot(edge2, qvec) / dot(edge1, pvec);
-
-	P = P + D*rt;
+	float det = dot(edge1, pvec);
+	if(det != 0.0f) {
+		/* A zero determinant means the ray lies in the plane of the
+		 * triangle, which is possible in theory due to the watertight
+		 * nature of the triangle intersection. In that case we skip the
+		 * refinement and keep P unchanged instead of dividing by zero.
+		 */
+		float rt = dot(edge2, qvec) / det;
+		P = P + D*rt;
+	}
 
 	if(isect->object != OBJECT_NONE) {
 #  ifdef __OBJECT_MOTION__
@@ -400,9 +407,16 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
 	float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
 	float3 qvec = cross(tvec, edge1);
 	float3 pvec = cross(D, edge2);
-	float rt = dot(edge2, qvec) / dot(edge1, pvec);
-
-	P = P + D*rt;
+	float det = dot(edge1, pvec);
+	if(det != 0.0f) {
+		/* A zero determinant means the ray lies in the plane of the
+		 * triangle, which is possible in theory due to the watertight
+		 * nature of the triangle intersection. In that case we skip the
+		 * refinement and keep P unchanged instead of dividing by zero.
+		 */
+		float rt = dot(edge2, qvec) / det;
+		P = P + D*rt;
+	}
 #endif  /* __INTERSECTION_REFINE__ */
 
 	if(isect->object != OBJECT_NONE) {
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 2044aafc877..efe540a8518 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -44,40 +44,41 @@ ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z)
 }
 #endif  /* __KERNEL_GPU__ */
 
-ccl_device float3 volume_normalized_position(KernelGlobals *kg, const ShaderData *sd, float3 P)
+ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg,
+                                                    const ShaderData *sd,
+                                                    float3 P)
 {
 	/* todo: optimize this so it's just a single matrix multiplication when
 	 * possible (not motion blur), or perhaps even just translation + scale */
-	AttributeElement attr_elem;
-	int attr_offset = find_attribute(kg, sd, ATTR_STD_GENERATED_TRANSFORM, &attr_elem);
+	const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_GENERATED_TRANSFORM);
 
 	object_inverse_position_transform(kg, sd, &P);
 
-	if(attr_offset != ATTR_STD_NOT_FOUND) {
-		Transform tfm = primitive_attribute_matrix(kg, sd, attr_offset);
+	if(desc.offset != ATTR_STD_NOT_FOUND) {
+		Transform tfm = primitive_attribute_matrix(kg, sd, desc);
 		P = transform_point(&tfm, P);
 	}
 
 	return P;
 }
 
-ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int id, float *dx, float *dy)
+ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
 {
 	float3 P = volume_normalized_position(kg, sd, sd->P);
 #ifdef __KERNEL_GPU__
 #  if __CUDA_ARCH__ >= 300
-	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
+	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset);
 	float f = kernel_tex_image_interp_3d_float(tex, P.x, P.y, P.z);
 	float4 r = make_float4(f, f, f, 1.0);
 #  else
-	float4 r = volume_image_texture_3d(id, P.x, P.y, P.z);
+	float4 r = volume_image_texture_3d(desc.offset, P.x, P.y, P.z);
 #  endif
 #else
 	float4 r;
 	if(sd->flag & SD_VOLUME_CUBIC)
-		r = kernel_tex_image_interp_3d_ex(id, P.x, P.y, P.z, INTERPOLATION_CUBIC);
+		r = kernel_tex_image_interp_3d_ex(desc.offset, P.x, P.y, P.z, INTERPOLATION_CUBIC);
 	else
-		r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
+		r = kernel_tex_image_interp_3d(desc.offset, P.x, P.y, P.z);
 #endif
 
 	if(dx) *dx = 0.0f;
@@ -86,22 +87,22 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 	return average(float4_to_float3(r));
 }
 
-ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int id, float3 *dx, float3 *dy)
+ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy)
 {
 	float3 P = volume_normalized_position(kg, sd, sd->P);
 #ifdef __KERNEL_GPU__
 #  if __CUDA_ARCH__ >= 300
-	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
+	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset);
 	float4 r = kernel_tex_image_interp_3d_float4(tex, P.x, P.y, P.z);
 #  else
-	float4 r = volume_image_texture_3d(id, P.x, P.y, P.z);
+	float4 r = volume_image_texture_3d(desc.offset, P.x, P.y, P.z);
 #  endif
 #else
 	float4 r;
 	if(sd->flag & SD_VOLUME_CUBIC)
-		r = kernel_tex_image_interp_3d_ex(id, P.x, P.y, P.z, INTERPOLATION_CUBIC);
+		r = kernel_tex_image_interp_3d_ex(desc.offset, P.x, P.y, P.z, INTERPOLATION_CUBIC);
 	else
-		r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
+		r = kernel_tex_image_interp_3d(desc.offset, P.x, P.y, P.z);
 #endif
 
 	if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index 9ee0b09529e..bfbf73df54f 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -18,8 +18,12 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __BAKING__
 
-ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, RNG rng,
-                                   int pass_filter, int sample)
+ccl_device_inline void compute_light_pass(KernelGlobals *kg,
+                                          ShaderData *sd,
+                                          PathRadiance *L,
+                                          RNG rng,
+                                          int pass_filter,
+                                          int sample)
 {
 	/* initialize master radiance accumulator */
 	kernel_assert(kernel_data.film.use_light_pass);
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index f6c103d59dd..88514de514c 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -211,7 +211,10 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl
 
 /* Panorama Camera */
 
-ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
+ccl_device_inline void camera_sample_panorama(KernelGlobals *kg,
+                                              float raster_x, float raster_y,
+                                              float lens_u, float lens_v,
+                                              ccl_addr_space Ray *ray)
 {
 	Transform rastertocamera = kernel_data.cam.rastertocamera;
 	float3 Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f));
@@ -303,8 +306,12 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float
 
 /* Common */
 
-ccl_device void camera_sample(KernelGlobals *kg, int x, int y, float filter_u, float filter_v,
-	float lens_u, float lens_v, float time, ccl_addr_space Ray *ray)
+ccl_device_inline void camera_sample(KernelGlobals *kg,
+                                     int x, int y,
+                                     float filter_u, float filter_v,
+                                     float lens_u, float lens_v,
+                                     float time,
+                                     ccl_addr_space Ray *ray)
 {
 	/* pixel filter */
 	int filter_table_offset = kernel_data.film.filter_table_offset;
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 08f6f457805..063220b542e 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -36,13 +36,18 @@
 /* Qualifier wrappers for different names on different devices */
 
 #define ccl_device  __device__ __inline__
-#define ccl_device_inline  __device__ __inline__
+#if (__KERNEL_CUDA_VERSION__ == 80) && (__CUDA_ARCH__ < 500)
+#  define ccl_device_inline  __device__ __forceinline__
+#else
+#  define ccl_device_inline  __device__ __inline__
+#endif
 #define ccl_device_noinline  __device__ __noinline__
 #define ccl_global
 #define ccl_constant
 #define ccl_may_alias
 #define ccl_addr_space
 #define ccl_restrict __restrict__
+#define ccl_align(n) __align__(n)
 
 /* No assert supported for CUDA */
 
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index 8505cb85576..2ae89dde7c4 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -40,6 +40,7 @@
 #define ccl_local __local
 #define ccl_private __private
 #define ccl_restrict restrict
+#define ccl_align(n) __attribute__((aligned(n)))
 
 #ifdef __SPLIT_KERNEL__
 #  define ccl_addr_space __global
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index 93c4bd3f7d5..1e829eaa1fa 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -44,11 +44,11 @@ typedef struct LightSample {
  *
  * Note: light_p is modified when sample_coord is true.
  */
-ccl_device float area_light_sample(float3 P,
-                                   float3 *light_p,
-                                   float3 axisu, float3 axisv,
-                                   float randu, float randv,
-                                   bool sample_coord)
+ccl_device_inline float area_light_sample(float3 P,
+                                          float3 *light_p,
+                                          float3 axisu, float3 axisv,
+                                          float randu, float randv,
+                                          bool sample_coord)
 {
 	/* In our name system we're using P for the center,
 	 * which is o in the paper.
@@ -268,11 +268,11 @@ ccl_device_inline bool background_portal_data_fetch_and_check_side(KernelGlobals
 	return false;
 }
 
-ccl_device float background_portal_pdf(KernelGlobals *kg,
-                                       float3 P,
-                                       float3 direction,
-                                       int ignore_portal,
-                                       bool *is_possible)
+ccl_device_inline float background_portal_pdf(KernelGlobals *kg,
+                                              float3 P,
+                                              float3 direction,
+                                              int ignore_portal,
+                                              bool *is_possible)
 {
 	float portal_pdf = 0.0f;
 
@@ -367,7 +367,10 @@ ccl_device float3 background_portal_sample(KernelGlobals *kg,
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-ccl_device float3 background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf)
+ccl_device_inline float3 background_light_sample(KernelGlobals *kg,
+                                                 float3 P,
+                                                 float randu, float randv,
+                                                 float *pdf)
 {
 	/* Probability of sampling portals instead of the map. */
 	float portal_sampling_pdf = kernel_data.integrator.portal_pdf;
@@ -507,8 +510,11 @@ ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3
 	return t*t/cos_pi;
 }
 
-ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
-	float randu, float randv, float3 P, LightSample *ls)
+ccl_device_inline void lamp_light_sample(KernelGlobals *kg,
+                                         int lamp,
+                                         float randu, float randv,
+                                         float3 P,
+                                         LightSample *ls)
 {
 	float4 data0 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 0);
 	float4 data1 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 1);
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index d5b31037723..903be4f09a0 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -435,8 +435,12 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
 }
 
 #ifdef __SUBSURFACE__
-
-ccl_device bool kernel_path_subsurface_scatter(
+#  ifndef __KERNEL_CUDA__
+ccl_device
+#  else
+ccl_device_inline
+#  endif
+bool kernel_path_subsurface_scatter(
         KernelGlobals *kg,
         ShaderData *sd,
         ShaderData *emission_sd,
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index 56516967d8f..64f1468eacf 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -18,13 +18,13 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __BRANCHED_PATH__
 
-ccl_device void kernel_branched_path_ao(KernelGlobals *kg,
-                                        ShaderData *sd,
-                                        ShaderData *emission_sd,
-                                        PathRadiance *L,
-                                        PathState *state,
-                                        RNG *rng,
-                                        float3 throughput)
+ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
+                                               ShaderData *sd,
+                                               ShaderData *emission_sd,
+                                               PathRadiance *L,
+                                               PathState *state,
+                                               RNG *rng,
+                                               float3 throughput)
 {
 	int num_samples = kernel_data.integrator.ao_samples;
 	float num_samples_inv = 1.0f/num_samples;
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index 74b1ae0ca32..250b8e92a45 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -222,8 +222,13 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_
 #endif
 
 /* path tracing: bounce off or through surface to with new direction stored in ray */
-ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, ccl_addr_space RNG *rng,
-	ShaderData *sd, ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, PathRadiance *L, ccl_addr_space Ray *ray)
+ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
+                                           ccl_addr_space RNG *rng,
+                                           ShaderData *sd,
+                                           ccl_addr_space float3 *throughput,
+                                           ccl_addr_space PathState *state,
+                                           PathRadiance *L,
+                                           ccl_addr_space Ray *ray)
 {
 	/* no BSDF? we can stop here */
 	if(ccl_fetch(sd, flag) & SD_BSDF) {
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index e45522a4641..5fd4f2fad4c 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -18,8 +18,14 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __VOLUME_SCATTER__
 
-ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, ShaderData *emission_sd, float3 throughput, PathState *state, PathRadiance *L)
+ccl_device_inline void kernel_path_volume_connect_light(
+        KernelGlobals *kg,
+        RNG *rng,
+        ShaderData *sd,
+        ShaderData *emission_sd,
+        float3 throughput,
+        PathState *state,
+        PathRadiance *L)
 {
 #ifdef __EMISSION__
 	if(!kernel_data.integrator.use_direct_light)
diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h
index 8be6742699a..3437d83ed7d 100644
--- a/intern/cycles/kernel/kernel_projection.h
+++ b/intern/cycles/kernel/kernel_projection.h
@@ -130,7 +130,10 @@ ccl_device float2 direction_to_fisheye_equisolid(float3 dir, float lens, float w
 	return make_float2(u, v);
 }
 
-ccl_device float3 fisheye_equisolid_to_direction(float u, float v, float lens, float fov, float width, float height)
+ccl_device_inline float3 fisheye_equisolid_to_direction(float u, float v,
+                                                        float lens,
+                                                        float fov,
+                                                        float width, float height)
 {
 	u = (u - 0.5f) * width;
 	v = (v - 0.5f) * height;
@@ -189,7 +192,7 @@ ccl_device float2 direction_to_mirrorball(float3 dir)
 	return make_float2(u, v);
 }
 
-ccl_device float3 panorama_to_direction(KernelGlobals *kg, float u, float v)
+ccl_device_inline float3 panorama_to_direction(KernelGlobals *kg, float u, float v)
 {
 	switch(kernel_data.cam.panorama_type) {
 		case PANORAMA_EQUIRECTANGULAR:
@@ -205,7 +208,7 @@ ccl_device float3 panorama_to_direction(KernelGlobals *kg, float u, float v)
 	}
 }
 
-ccl_device float2 direction_to_panorama(KernelGlobals *kg, float3 dir)
+ccl_device_inline float2 direction_to_panorama(KernelGlobals *kg, float3 dir)
 {
 	switch(kernel_data.cam.panorama_type) {
 		case PANORAMA_EQUIRECTANGULAR:
@@ -221,9 +224,9 @@ ccl_device float2 direction_to_panorama(KernelGlobals *kg, float3 dir)
 	}
 }
 
-ccl_device float3 spherical_stereo_position(KernelGlobals *kg,
-                                            float3 dir,
-                                            float3 pos)
+ccl_device_inline float3 spherical_stereo_position(KernelGlobals *kg,
+                                                   float3 dir,
+                                                   float3 pos)
 {
 	float interocular_offset = kernel_data.cam.interocular_offset;
 
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 765baa2a5ba..079bea30bdd 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -24,6 +24,7 @@
  *
  */
 
+#include "closure/alloc.h"
 #include "closure/bsdf_util.h"
 #include "closure/bsdf.h"
 #include "closure/emissive.h"
@@ -148,8 +149,16 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
 /* ShaderData setup from BSSRDF scatter */
 
 #ifdef __SUBSURFACE__
-ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderData *sd,
-	const Intersection *isect, const Ray *ray)
+#  ifndef __KERNEL_CUDA__
+ccl_device
+#  else
+ccl_device_inline
+#  endif
+void shader_setup_from_subsurface(
+        KernelGlobals *kg,
+        ShaderData *sd,
+        const Intersection *isect,
+        const Ray *ray)
 {
 	bool backfacing = sd->flag & SD_BACKFACING;
 
@@ -225,14 +234,14 @@ ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderDat
 
 /* ShaderData setup from position sampled on mesh */
 
-ccl_device void shader_setup_from_sample(KernelGlobals *kg,
-                                         ShaderData *sd,
-                                         const float3 P,
-                                         const float3 Ng,
-                                         const float3 I,
-                                         int shader, int object, int prim,
-                                         float u, float v, float t,
-                                         float time)
+ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
+                                                ShaderData *sd,
+                                                const float3 P,
+                                                const float3 Ng,
+                                                const float3 I,
+                                                int shader, int object, int prim,
+                                                float u, float v, float t,
+                                                float time)
 {
 	/* vectors */
 	ccl_fetch(sd, P) = P;
@@ -444,7 +453,7 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s
 /* Merging */
 
 #if defined(__BRANCHED_PATH__) || defined(__VOLUME__)
-ccl_device void shader_merge_closures(ShaderData *sd)
+ccl_device_inline void shader_merge_closures(ShaderData *sd)
 {
 	/* merge identical closures, better when we sample a single closure at a time */
 	for(int i = 0; i < sd->num_closure; i++) {
@@ -453,22 +462,9 @@ ccl_device void shader_merge_closures(ShaderData *sd)
 		for(int j = i + 1; j < sd->num_closure; j++) {
 			ShaderClosure *scj = &sd->closure[j];
 
-#ifdef __OSL__
-			if(sci->prim || scj->prim)
+			if(sci->type != scj->type)
 				continue;
-#endif
-
-			if(!(sci->type == scj->type && sci->data0 == scj->data0 && sci->data1 == scj->data1 && sci->data2 == scj->data2))
-				continue;
-
-			if(CLOSURE_IS_BSDF_OR_BSSRDF(sci->type)) {
-				if(sci->N != scj->N)
-					continue;
-				else if(CLOSURE_IS_BSDF_ANISOTROPIC(sci->type) && sci->T != scj->T)
-					continue;
-			}
-
-			if((sd->flag & SD_BSDF_HAS_CUSTOM) && !(sci->custom1 == scj->custom1 && sci->custom2 == scj->custom2 && sci->custom3 == scj->custom3))
+			if(!bsdf_merge(sci, scj))
 				continue;
 
 			sci->weight += scj->weight;
@@ -542,12 +538,18 @@ ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg,
 }
 #endif
 
-ccl_device void shader_bsdf_eval(KernelGlobals *kg,
-                                 ShaderData *sd,
-                                 const float3 omega_in,
-                                 BsdfEval *eval,
-                                 float light_pdf,
-                                 bool use_mis)
+
+#ifndef __KERNEL_CUDA__
+ccl_device
+#else
+ccl_device_inline
+#endif
+void shader_bsdf_eval(KernelGlobals *kg,
+                      ShaderData *sd,
+                      const float3 omega_in,
+                      BsdfEval *eval,
+                      float light_pdf,
+                      bool use_mis)
 {
 	bsdf_eval_init(eval, NBUILTIN_CLOSURES, make_float3(0.0f, 0.0f, 0.0f), kernel_data.film.use_light_pass);
 
@@ -566,9 +568,13 @@ ccl_device void shader_bsdf_eval(KernelGlobals *kg,
 	}
 }
 
-ccl_device int shader_bsdf_sample(KernelGlobals *kg, ShaderData *sd,
-	float randu, float randv, BsdfEval *bsdf_eval,
-	float3 *omega_in, differential3 *domega_in, float *pdf)
+ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
+                                         ShaderData *sd,
+                                         float randu, float randv,
+                                         BsdfEval *bsdf_eval,
+                                         float3 *omega_in,
+                                         differential3 *domega_in,
+                                         float *pdf)
 {
 	int sampled = 0;
 
@@ -741,8 +747,9 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
 		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
+			const DiffuseBsdf *bsdf = (const DiffuseBsdf*)sc;
 			eval += sc->weight*ao_factor;
-			N += sc->N*average(sc->weight);
+			N += bsdf->N*average(sc->weight);
 		}
 		else if(CLOSURE_IS_AMBIENT_OCCLUSION(sc->type)) {
 			eval += sc->weight;
@@ -759,6 +766,7 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
 	return eval;
 }
 
+#ifdef __SUBSURFACE__
 ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_blur_)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -769,11 +777,12 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSSRDF(sc->type)) {
+			const Bssrdf *bssrdf = (const Bssrdf*)sc;
 			float avg_weight = fabsf(average(sc->weight));
 
-			N += sc->N*avg_weight;
+			N += bssrdf->N*avg_weight;
 			eval += sc->weight;
-			texture_blur += sc->data1*avg_weight;
+			texture_blur += bssrdf->texture_blur*avg_weight;
 			weight_sum += avg_weight;
 		}
 	}
@@ -786,6 +795,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 	
 	return eval;
 }
+#endif
 
 /* Emission */
 
@@ -831,6 +841,7 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_
 	ccl_addr_space PathState *state, float randb, int path_flag, ShaderContext ctx)
 {
 	ccl_fetch(sd, num_closure) = 0;
+	ccl_fetch(sd, num_closure_extra) = 0;
 	ccl_fetch(sd, randb_closure) = randb;
 
 #ifdef __OSL__
@@ -861,33 +872,33 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd,
 	ccl_addr_space PathState *state, int path_flag, ShaderContext ctx)
 {
 	ccl_fetch(sd, num_closure) = 0;
+	ccl_fetch(sd, num_closure_extra) = 0;
 	ccl_fetch(sd, randb_closure) = 0.0f;
 
+#ifdef __SVM__
 #ifdef __OSL__
 	if(kg->osl) {
-		return OSLShader::eval_background(kg, sd, state, path_flag, ctx);
+		OSLShader::eval_background(kg, sd, state, path_flag, ctx);
 	}
 	else
 #endif
-
 	{
-#ifdef __SVM__
 		svm_eval_nodes(kg, sd, state, SHADER_TYPE_SURFACE, path_flag);
+	}
 
-		float3 eval = make_float3(0.0f, 0.0f, 0.0f);
+	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-		for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-			const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
-			if(CLOSURE_IS_BACKGROUND(sc->type))
-				eval += sc->weight;
-		}
+		if(CLOSURE_IS_BACKGROUND(sc->type))
+			eval += sc->weight;
+	}
 
-		return eval;
+	return eval;
 #else
-		return make_float3(0.8f, 0.8f, 0.8f);
+	return make_float3(0.8f, 0.8f, 0.8f);
 #endif
-	}
 }
 
 /* Volume */
@@ -998,12 +1009,17 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg, const ShaderData *
 
 /* Volume Evaluation */
 
-ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd,
-	PathState *state, VolumeStack *stack, int path_flag, ShaderContext ctx)
+ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
+                                          ShaderData *sd,
+                                          PathState *state,
+                                          VolumeStack *stack,
+                                          int path_flag,
+                                          ShaderContext ctx)
 {
 	/* reset closures once at the start, we will be accumulating the closures
 	 * for all volumes in the stack into a single array of closures */
 	sd->num_closure = 0;
+	sd->num_closure_extra = 0;
 	sd->flag = 0;
 
 	for(int i = 0; stack[i].shader != SHADER_NONE; i++) {
@@ -1051,6 +1067,7 @@ ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd,
 ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderContext ctx)
 {
 	ccl_fetch(sd, num_closure) = 0;
+	ccl_fetch(sd, num_closure_extra) = 0;
 	ccl_fetch(sd, randb_closure) = 0.0f;
 
 	/* this will modify sd->P */
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index d1576754d2e..db2fc84834a 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -75,12 +75,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd,
 		}
 
 		uint num_hits;
-		if(max_hits == 0) {
-			blocked = true;
-			num_hits = 0;
-		} else {
-			blocked = scene_intersect_shadow_all(kg, ray, hits, max_hits, &num_hits);
-		}
+		blocked = scene_intersect_shadow_all(kg, ray, hits, max_hits, &num_hits);
 
 		/* if no opaque surface found but we did find transparent hits, shade them */
 		if(!blocked && num_hits > 0) {
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index b048bd38fc9..ba45eea6388 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -85,7 +85,16 @@ ccl_device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, Sha
 	return NULL;
 }
 
-ccl_device float3 subsurface_scatter_eval(ShaderData *sd, ShaderClosure *sc, float disk_r, float r, bool all)
+#ifndef __KERNEL_GPU__
+ccl_device_noinline
+#else
+ccl_device_inline
+#endif
+float3 subsurface_scatter_eval(ShaderData *sd,
+                               ShaderClosure *sc,
+                               float disk_r,
+                               float r,
+                               bool all)
 {
 #ifdef BSSRDF_MULTI_EVAL
 	/* this is the veach one-sample model with balance heuristic, some pdf
@@ -140,24 +149,21 @@ ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 wei
 {
 	sd->flag &= ~SD_CLOSURE_FLAGS;
 	sd->randb_closure = 0.0f;
+	sd->num_closure = 0;
+	sd->num_closure_extra = 0;
 
 	if(hit) {
-		ShaderClosure *sc = &sd->closure[0];
-		sd->num_closure = 1;
-
-		sc->weight = weight;
-		sc->sample_weight = 1.0f;
-		sc->data0 = 0.0f;
-		sc->data1 = 0.0f;
-		sc->N = N;
-		sd->flag |= bsdf_diffuse_setup(sc);
-
-		/* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes
-		 * can recognize it as not being a regular diffuse closure */
-		sc->type = CLOSURE_BSDF_BSSRDF_ID;
+		DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
+
+		if(bsdf) {
+			bsdf->N = N;
+			sd->flag |= bsdf_diffuse_setup(bsdf);
+
+			/* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes
+			 * can recognize it as not being a regular diffuse closure */
+			bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
+		}
 	}
-	else
-		sd->num_closure = 0;
 }
 
 /* optionally do blurring of color and/or bump mapping, at the cost of a shader evaluation */
@@ -217,7 +223,12 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
 /* Subsurface scattering step, from a point on the surface to other
  * nearby points on the same object.
  */
-ccl_device int subsurface_scatter_multi_intersect(
+#ifndef __KERNEL_CUDA__
+ccl_device
+#else
+ccl_device_inline
+#endif
+int subsurface_scatter_multi_intersect(
         KernelGlobals *kg,
         SubsurfaceIntersection* ss_isect,
         ShaderData *sd,
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index 5ba262c1044..7d6fec02331 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -41,11 +41,16 @@ KERNEL_TEX(float4, texture_float4, __objects_vector)
 KERNEL_TEX(uint, texture_uint, __tri_shader)
 KERNEL_TEX(float4, texture_float4, __tri_vnormal)
 KERNEL_TEX(uint4, texture_uint4, __tri_vindex)
+KERNEL_TEX(uint, texture_uint, __tri_patch)
+KERNEL_TEX(float2, texture_float2, __tri_patch_uv)
 
 /* curves */
 KERNEL_TEX(float4, texture_float4, __curves)
 KERNEL_TEX(float4, texture_float4, __curve_keys)
 
+/* patches */
+KERNEL_TEX(uint, texture_uint, __patches)
+
 /* attributes */
 KERNEL_TEX(uint4, texture_uint4, __attributes_map)
 KERNEL_TEX(float, texture_float, __attributes_float)
@@ -173,9 +178,6 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_086)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_087)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_088)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_090)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_091)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_092)
 
 #  else
 /* bindless textures */
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index a9be2ae717a..f3b10c21b9d 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -34,7 +34,7 @@
 CCL_NAMESPACE_BEGIN
 
 /* constants */
-#define OBJECT_SIZE 		11
+#define OBJECT_SIZE 		12
 #define OBJECT_VECTOR_SIZE	6
 #define LIGHT_SIZE			5
 #define FILTER_TABLE_SIZE	1024
@@ -573,8 +573,13 @@ typedef enum PrimitiveType {
 
 /* Attributes */
 
-#define ATTR_PRIM_TYPES		2
-#define ATTR_PRIM_CURVE		1
+typedef enum AttributePrimitive {
+	ATTR_PRIM_TRIANGLE = 0,
+	ATTR_PRIM_CURVE,
+	ATTR_PRIM_SUBD,
+
+	ATTR_PRIM_TYPES
+} AttributePrimitive;
 
 typedef enum AttributeElement {
 	ATTR_ELEMENT_NONE,
@@ -619,6 +624,18 @@ typedef enum AttributeStandard {
 	ATTR_STD_NOT_FOUND = ~0
 } AttributeStandard;
 
+typedef enum AttributeFlag {
+	ATTR_FINAL_SIZE = (1 << 0),
+	ATTR_SUBDIVIDED = (1 << 1),
+} AttributeFlag;
+
+typedef struct AttributeDescriptor {
+	AttributeElement element;
+	NodeAttributeType type;
+	uint flags; /* see enum AttributeFlag */
+	int offset;
+} AttributeDescriptor;
+
 /* Closure data */
 
 #ifdef __MULTI_CLOSURE__
@@ -631,33 +648,26 @@ typedef enum AttributeStandard {
 #  define MAX_CLOSURE 1
 #endif
 
-/* This struct is to be 16 bytes aligned, we also keep some extra precautions:
- * - All the float3 members are in the beginning of the struct, so compiler
- *   does not put own padding trying to align this members.
- * - We make sure OSL pointer is also 16 bytes aligned.
- */
-typedef ccl_addr_space struct ShaderClosure {
-	float3 weight;
-	float3 N;
-	float3 T;
-
-	ClosureType type;
-	float sample_weight;
-	float data0;
-	float data1;
-	float data2;
-
-	/* Following fields could be used to store pre-calculated
-	 * values by various BSDF closures for more effective sampling
-	 * and evaluation.
-	 */
-	float custom1;
-	float custom2;
-	float custom3;
+/* This struct is the base class for all closures. The common members are
+ * duplicated in all derived classes since we don't have C++ in the kernel
+ * yet, and because it lets us lay out the members to minimize padding. The
+ * weight member is located at the beginning of the struct for this reason.
+ *
+ * ShaderClosure has a fixed size, and any extra space must be allocated
+ * with closure_alloc_extra().
+ *
+ * We pad the struct to 80 bytes and ensure it is aligned to 16 bytes, which
+ * we assume to be the maximum required alignment for any struct. */
 
-#ifdef __OSL__
-	void *prim, *pad4;
-#endif
+#define SHADER_CLOSURE_BASE \
+	float3 weight; \
+	ClosureType type; \
+	float sample_weight \
+
+typedef ccl_addr_space struct ccl_align(16) ShaderClosure {
+	SHADER_CLOSURE_BASE;
+
+	float data[14]; /* pad to 80 bytes */
 } ShaderClosure;
 
 /* Shader Context
@@ -692,11 +702,10 @@ enum ShaderDataFlag {
 	SD_AO              = (1 << 8),   /* have ao closure? */
 	SD_TRANSPARENT     = (1 << 9),  /* have transparent closure? */
 	SD_BSDF_NEEDS_LCG  = (1 << 10),
-	SD_BSDF_HAS_CUSTOM = (1 << 11), /* are the custom variables relevant? */
 
 	SD_CLOSURE_FLAGS = (SD_EMISSION|SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF|
 	                    SD_HOLDOUT|SD_ABSORPTION|SD_SCATTER|SD_AO|
-	                    SD_BSDF_NEEDS_LCG|SD_BSDF_HAS_CUSTOM),
+	                    SD_BSDF_NEEDS_LCG),
 
 	/* shader flags */
 	SD_USE_MIS                = (1 << 12),  /* direct light sample */
@@ -729,94 +738,97 @@ enum ShaderDataFlag {
 	                   SD_OBJECT_INTERSECTS_VOLUME)
 };
 
-struct KernelGlobals;
-
 #ifdef __SPLIT_KERNEL__
 #  define SD_THREAD (get_global_id(1) * get_global_size(0) + get_global_id(0))
 #  if defined(__SPLIT_KERNEL_AOS__)
      /* ShaderData is stored as an Array-of-Structures */
-#    define ccl_fetch(s, t) (s[SD_THREAD].t)
-#    define ccl_fetch_array(s, t, index) (&s[SD_THREAD].t[index])
+#    define ccl_soa_member(type, name) type soa_##name
+#    define ccl_fetch(s, t) (s[SD_THREAD].soa_##t)
+#    define ccl_fetch_array(s, t, index) (&s[SD_THREAD].soa_##t[index])
 #  else
      /* ShaderData is stored as an Structure-of-Arrays */
 #    define SD_GLOBAL_SIZE (get_global_size(0) * get_global_size(1))
 #    define SD_FIELD_SIZE(t) sizeof(((struct ShaderData*)0)->t)
 #    define SD_OFFSETOF(t) ((char*)(&((struct ShaderData*)0)->t) - (char*)0)
-#    define ccl_fetch(s, t) (((ShaderData*)((ccl_addr_space char*)s + SD_GLOBAL_SIZE * SD_OFFSETOF(t) +  SD_FIELD_SIZE(t) * SD_THREAD - SD_OFFSETOF(t)))->t)
+#    define ccl_soa_member(type, name) type soa_##name
+#    define ccl_fetch(s, t) (((ShaderData*)((ccl_addr_space char*)s + SD_GLOBAL_SIZE * SD_OFFSETOF(soa_##t) +  SD_FIELD_SIZE(soa_##t) * SD_THREAD - SD_OFFSETOF(soa_##t)))->soa_##t)
 #    define ccl_fetch_array(s, t, index) (&ccl_fetch(s, t)[index])
 #  endif
 #else
+#  define ccl_soa_member(type, name) type name
 #  define ccl_fetch(s, t) (s->t)
 #  define ccl_fetch_array(s, t, index) (&s->t[index])
 #endif
 
 typedef ccl_addr_space struct ShaderData {
 	/* position */
-	float3 P;
+	ccl_soa_member(float3, P);
 	/* smooth normal for shading */
-	float3 N;
+	ccl_soa_member(float3, N);
 	/* true geometric normal */
-	float3 Ng;
+	ccl_soa_member(float3, Ng);
 	/* view/incoming direction */
-	float3 I;
+	ccl_soa_member(float3, I);
 	/* shader id */
-	int shader;
+	ccl_soa_member(int, shader);
 	/* booleans describing shader, see ShaderDataFlag */
-	int flag;
+	ccl_soa_member(int, flag);
 
 	/* primitive id if there is one, ~0 otherwise */
-	int prim;
+	ccl_soa_member(int, prim);
 
 	/* combined type and curve segment for hair */
-	int type;
+	ccl_soa_member(int, type);
 
 	/* parametric coordinates
 	 * - barycentric weights for triangles */
-	float u;
-	float v;
+	ccl_soa_member(float, u);
+	ccl_soa_member(float, v);
 	/* object id if there is one, ~0 otherwise */
-	int object;
+	ccl_soa_member(int, object);
 
 	/* motion blur sample time */
-	float time;
+	ccl_soa_member(float, time);
 
 	/* length of the ray being shaded */
-	float ray_length;
+	ccl_soa_member(float, ray_length);
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* differential of P. these are orthogonal to Ng, not N */
-	differential3 dP;
+	ccl_soa_member(differential3, dP);
 	/* differential of I */
-	differential3 dI;
+	ccl_soa_member(differential3, dI);
 	/* differential of u, v */
-	differential du;
-	differential dv;
+	ccl_soa_member(differential, du);
+	ccl_soa_member(differential, dv);
 #endif
 #ifdef __DPDU__
 	/* differential of P w.r.t. parametric coordinates. note that dPdu is
 	 * not readily suitable as a tangent for shading on triangles. */
-	float3 dPdu;
-	float3 dPdv;
+	ccl_soa_member(float3, dPdu);
+	ccl_soa_member(float3, dPdv);
 #endif
 
 #ifdef __OBJECT_MOTION__
 	/* object <-> world space transformations, cached to avoid
 	 * re-interpolating them constantly for shading */
-	Transform ob_tfm;
-	Transform ob_itfm;
+	ccl_soa_member(Transform, ob_tfm);
+	ccl_soa_member(Transform, ob_itfm);
 #endif
 
 	/* Closure data, we store a fixed array of closures */
-	struct ShaderClosure closure[MAX_CLOSURE];
-	int num_closure;
-	float randb_closure;
+	ccl_soa_member(struct ShaderClosure, closure[MAX_CLOSURE]);
+	ccl_soa_member(int, num_closure);
+	ccl_soa_member(int, num_closure_extra);
+	ccl_soa_member(float, randb_closure);
+	ccl_soa_member(float3, svm_closure_weight);
 
 	/* LCG state for closures that require additional random numbers. */
-	uint lcg_state;
+	ccl_soa_member(uint, lcg_state);
 
 	/* ray start position, only set for backgrounds */
-	float3 ray_P;
-	differential3 ray_dP;
+	ccl_soa_member(float3, ray_P);
+	ccl_soa_member(differential3, ray_dP);
 
 #ifdef __OSL__
 	struct KernelGlobals * osl_globals;
@@ -1234,6 +1246,16 @@ enum RayState {
 #define REMOVE_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] & (~flag)))
 #define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag)
 
+/* Patches */
+
+#define PATCH_MAX_CONTROL_VERTS 16
+
+/* Patch map node flags: bits 30 and 31 are flags, INDEX_MASK keeps the remaining low bits */
+
+#define PATCH_MAP_NODE_IS_SET (1 << 30)
+#define PATCH_MAP_NODE_IS_LEAF (1u << 31) /* unsigned literal: 1 << 31 would overflow a signed int */
+#define PATCH_MAP_NODE_INDEX_MASK (~(PATCH_MAP_NODE_IS_SET | PATCH_MAP_NODE_IS_LEAF))
+
 CCL_NAMESPACE_END
 
 #endif /*  __KERNEL_TYPES_H__ */
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index bf8301fe5fb..9dafed9afd1 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -36,7 +36,11 @@ typedef struct VolumeShaderCoefficients {
 } VolumeShaderCoefficients;
 
 /* evaluate shader to get extinction coefficient at P */
-ccl_device bool volume_shader_extinction_sample(KernelGlobals *kg, ShaderData *sd, PathState *state, float3 P, float3 *extinction)
+ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg,
+                                                       ShaderData *sd,
+                                                       PathState *state,
+                                                       float3 P,
+                                                       float3 *extinction)
 {
 	sd->P = P;
 	shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
@@ -58,7 +62,11 @@ ccl_device bool volume_shader_extinction_sample(KernelGlobals *kg, ShaderData *s
 }
 
 /* evaluate shader to get absorption, scattering and emission at P */
-ccl_device bool volume_shader_sample(KernelGlobals *kg, ShaderData *sd, PathState *state, float3 P, VolumeShaderCoefficients *coeff)
+ccl_device_inline bool volume_shader_sample(KernelGlobals *kg,
+                                            ShaderData *sd,
+                                            PathState *state,
+                                            float3 P,
+                                            VolumeShaderCoefficients *coeff)
 {
 	sd->P = P;
 	shader_eval_volume(kg, sd, state, state->volume_stack, state->flag, SHADER_CONTEXT_VOLUME);
@@ -1029,7 +1037,7 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
 	int stack_index = 0, enclosed_index = 0;
 
 #ifdef __VOLUME_RECORD_ALL__
-	Intersection hits[2*VOLUME_STACK_SIZE];
+	Intersection hits[2*VOLUME_STACK_SIZE + 1];
 	uint num_hits = scene_intersect_volume_all(kg,
 	                                           &volume_ray,
 	                                           hits,
@@ -1199,7 +1207,7 @@ ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg,
 	Ray volume_ray = *ray;
 
 #  ifdef __VOLUME_RECORD_ALL__
-	Intersection hits[2*VOLUME_STACK_SIZE];
+	Intersection hits[2*VOLUME_STACK_SIZE + 1];
 	uint num_hits = scene_intersect_volume_all(kg,
 	                                           &volume_ray,
 	                                           hits,
diff --git a/intern/cycles/kernel/osl/CMakeLists.txt b/intern/cycles/kernel/osl/CMakeLists.txt
index 9cf4f2d759a..98de40e5a8a 100644
--- a/intern/cycles/kernel/osl/CMakeLists.txt
+++ b/intern/cycles/kernel/osl/CMakeLists.txt
@@ -25,7 +25,6 @@ set(SRC
 )
 
 set(HEADER_SRC
-	osl_bssrdf.h
 	osl_closures.h
 	osl_globals.h
 	osl_services.h
diff --git a/intern/cycles/kernel/osl/background.cpp b/intern/cycles/kernel/osl/background.cpp
index 85fa7b34bcc..d835f9be45c 100644
--- a/intern/cycles/kernel/osl/background.cpp
+++ b/intern/cycles/kernel/osl/background.cpp
@@ -36,6 +36,9 @@
 
 #include "osl_closures.h"
 
+#include "kernel_compat_cpu.h"
+#include "closure/alloc.h"
+
 CCL_NAMESPACE_BEGIN
 
 using namespace OSL;
@@ -48,7 +51,10 @@ using namespace OSL;
 ///
 class GenericBackgroundClosure : public CClosurePrimitive {
 public:
-	GenericBackgroundClosure() : CClosurePrimitive(Background) {}
+	void setup(ShaderData *sd, int /* path_flag */, float3 weight)
+	{
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, weight);
+	}
 };
 
 /// Holdout closure
@@ -60,7 +66,11 @@ public:
 ///
 class HoldoutClosure : CClosurePrimitive {
 public:
-	HoldoutClosure () : CClosurePrimitive(Holdout) {}
+	void setup(ShaderData *sd, int /* path_flag */, float3 weight)
+	{
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, weight);
+		sd->flag |= SD_HOLDOUT;
+	}
 };
 
 /// ambient occlusion closure
@@ -71,7 +81,11 @@ public:
 ///
 class AmbientOcclusionClosure : public CClosurePrimitive {
 public:
-	AmbientOcclusionClosure () : CClosurePrimitive(AmbientOcclusion) {}
+	void setup(ShaderData *sd, int /* path_flag */, float3 weight)
+	{
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, weight);
+		sd->flag |= SD_AO;
+	}
 };
 
 ClosureParam *closure_background_params()
diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
index b5c0d76cf37..bc26f42b559 100644
--- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
@@ -39,6 +39,7 @@
 
 #include "kernel_types.h"
 #include "kernel_montecarlo.h"
+#include "closure/alloc.h"
 #include "closure/bsdf_diffuse_ramp.h"
 
 CCL_NAMESPACE_BEGIN
@@ -47,51 +48,30 @@ using namespace OSL;
 
 class DiffuseRampClosure : public CBSDFClosure {
 public:
+	DiffuseRampBsdf params;
 	Color3 colors[8];
-	float3 fcolors[8];
 
-	DiffuseRampClosure() : CBSDFClosure(LABEL_DIFFUSE)
-	{}
-
-	void setup()
+	void setup(ShaderData *sd, int /* path_flag */, float3 weight)
 	{
-		sc.prim = this;
-		m_shaderdata_flag = bsdf_diffuse_ramp_setup(&sc);
+	    DiffuseRampBsdf *bsdf = (DiffuseRampBsdf*)bsdf_alloc_osl(sd, sizeof(DiffuseRampBsdf), weight, &params);
 
-		for(int i = 0; i < 8; i++)
-			fcolors[i] = TO_FLOAT3(colors[i]);
-	}
+		if(bsdf) {
+			bsdf->colors = (float3*)closure_alloc_extra(sd, sizeof(float3)*8);
 
-	void blur(float roughness)
-	{
-		bsdf_diffuse_ramp_blur(&sc, roughness);
-	}
+			if(bsdf->colors) {
+				for(int i = 0; i < 8; i++)
+					bsdf->colors[i] = TO_FLOAT3(colors[i]);
 
-	float3 eval_reflect(const float3 &omega_out, const float3 &omega_in, float& pdf) const
-	{
-		return bsdf_diffuse_ramp_eval_reflect(&sc, fcolors, omega_out, omega_in, &pdf);
-	}
-
-	float3 eval_transmit(const float3 &omega_out, const float3 &omega_in, float& pdf) const
-	{
-		return bsdf_diffuse_ramp_eval_transmit(&sc, fcolors, omega_out, omega_in, &pdf);
-	}
-
-	int sample(const float3 &Ng,
-	           const float3 &omega_out, const float3 &domega_out_dx, const float3 &domega_out_dy,
-	           float randu, float randv,
-	           float3 &omega_in, float3 &domega_in_dx, float3 &domega_in_dy,
-	           float &pdf, float3 &eval) const
-	{
-		return bsdf_diffuse_ramp_sample(&sc, fcolors, Ng, omega_out, domega_out_dx, domega_out_dy,
-			randu, randv, &eval, &omega_in, &domega_in_dx, &domega_in_dy, &pdf);
+				sd->flag |= bsdf_diffuse_ramp_setup(bsdf);
+			}
+		}
 	}
 };
 
 ClosureParam *closure_bsdf_diffuse_ramp_params()
 {
 	static ClosureParam params[] = {
-		CLOSURE_FLOAT3_PARAM(DiffuseRampClosure, sc.N),
+		CLOSURE_FLOAT3_PARAM(DiffuseRampClosure, params.N),
 		CLOSURE_COLOR_ARRAY_PARAM(DiffuseRampClosure, colors, 8),
 		CLOSURE_STRING_KEYPARAM(DiffuseRampClosure, label, "label"),
 		CLOSURE_FINISH_PARAM(DiffuseRampClosure)
diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
index bc73d80cd78..14c7644936e 100644
--- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
@@ -38,6 +38,7 @@
 #include "osl_closures.h"
 
 #include "kernel_types.h"
+#include "closure/alloc.h"
 #include "closure/bsdf_phong_ramp.h"
 
 CCL_NAMESPACE_BEGIN
@@ -46,52 +47,31 @@ using namespace OSL;
 
 class PhongRampClosure : public CBSDFClosure {
 public:
+	PhongRampBsdf params;
 	Color3 colors[8];
-	float3 fcolors[8];
 
-	PhongRampClosure() : CBSDFClosure(LABEL_GLOSSY)
-	{}
-
-	void setup()
+	void setup(ShaderData *sd, int /* path_flag */, float3 weight)
 	{
-		sc.prim = this;
-		m_shaderdata_flag = bsdf_phong_ramp_setup(&sc);
+	    PhongRampBsdf *bsdf = (PhongRampBsdf*)bsdf_alloc_osl(sd, sizeof(PhongRampBsdf), weight, &params);
 
-		for(int i = 0; i < 8; i++)
-			fcolors[i] = TO_FLOAT3(colors[i]);
-	}
+		if(bsdf) {
+			bsdf->colors = (float3*)closure_alloc_extra(sd, sizeof(float3)*8);
 
-	void blur(float roughness)
-	{
-		bsdf_phong_ramp_blur(&sc, roughness);
-	}
+			if(bsdf->colors) {
+				for(int i = 0; i < 8; i++)
+					bsdf->colors[i] = TO_FLOAT3(colors[i]);
 
-	float3 eval_reflect(const float3 &omega_out, const float3 &omega_in, float& pdf) const
-	{
-		return bsdf_phong_ramp_eval_reflect(&sc, fcolors, omega_out, omega_in, &pdf);
-	}
-
-	float3 eval_transmit(const float3 &omega_out, const float3 &omega_in, float& pdf) const
-	{
-		return bsdf_phong_ramp_eval_transmit(&sc, fcolors, omega_out, omega_in, &pdf);
-	}
-
-	int sample(const float3 &Ng,
-	           const float3 &omega_out, const float3 &domega_out_dx, const float3 &domega_out_dy,
-	           float randu, float randv,
-	           float3 &omega_in, float3 &domega_in_dx, float3 &domega_in_dy,
-	           float &pdf, float3 &eval) const
-	{
-		return bsdf_phong_ramp_sample(&sc, fcolors, Ng, omega_out, domega_out_dx, domega_out_dy,
-			randu, randv, &eval, &omega_in, &domega_in_dx, &domega_in_dy, &pdf);
+				sd->flag |= bsdf_phong_ramp_setup(bsdf);
+			}
+		}
 	}
 };
 
 ClosureParam *closure_bsdf_phong_ramp_params()
 {
 	static ClosureParam params[] = {
-		CLOSURE_FLOAT3_PARAM(PhongRampClosure, sc.N),
-		CLOSURE_FLOAT_PARAM(PhongRampClosure, sc.data0),
+		CLOSURE_FLOAT3_PARAM(PhongRampClosure, params.N),
+		CLOSURE_FLOAT_PARAM(PhongRampClosure, params.exponent),
 		CLOSURE_COLOR_ARRAY_PARAM(PhongRampClosure, colors, 8),
 		CLOSURE_STRING_KEYPARAM(PhongRampClosure, label, "label"),
 		CLOSURE_FINISH_PARAM(PhongRampClosure)
diff --git a/intern/cycles/kernel/osl/emissive.cpp b/intern/cycles/kernel/osl/emissive.cpp
index f91fd6e015c..3f13e08b302 100644
--- a/intern/cycles/kernel/osl/emissive.cpp
+++ b/intern/cycles/kernel/osl/emissive.cpp
@@ -36,7 +36,9 @@
 
 #include "osl_closures.h"
 
+#include "kernel_compat_cpu.h"
 #include "kernel_types.h"
+#include "closure/alloc.h"
 #include "closure/emissive.h"
 
 CCL_NAMESPACE_BEGIN
@@ -52,25 +54,10 @@ using namespace OSL;
 ///
 class GenericEmissiveClosure : public CClosurePrimitive {
 public:
-	GenericEmissiveClosure() : CClosurePrimitive(Emissive) { }
-
-	Color3 eval(const Vec3 &Ng, const Vec3 &omega_out) const
-	{
-		float3 result = emissive_simple_eval(TO_FLOAT3(Ng), TO_FLOAT3(omega_out));
-		return TO_COLOR3(result);
-	}
-
-	void sample(const Vec3 &Ng, float randu, float randv,
-	            Vec3 &omega_out, float &pdf) const
-	{
-		float3 omega_out_;
-		emissive_sample(TO_FLOAT3(Ng), randu, randv, &omega_out_, &pdf);
-		omega_out = TO_VEC3(omega_out_);
-	}
-
-	float pdf(const Vec3 &Ng, const Vec3 &omega_out) const
+	void setup(ShaderData *sd, int /* path_flag */, float3 weight)
 	{
-		return emissive_pdf(TO_FLOAT3(Ng), TO_FLOAT3(omega_out));
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, weight);
+		sd->flag |= SD_EMISSION;
 	}
 };
 
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index da4afb138f6..3614717e28c 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -30,17 +30,15 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <OpenImageIO/fmath.h>
-
 #include <OSL/genclosure.h>
 
 #include "kernel_compat_cpu.h"
-#include "osl_bssrdf.h"
 #include "osl_closures.h"
 
 #include "kernel_types.h"
 #include "kernel_montecarlo.h"
 
+#include "closure/alloc.h"
 #include "closure/bsdf_diffuse.h"
 #include "closure/bssrdf.h"
 
@@ -48,27 +46,83 @@ CCL_NAMESPACE_BEGIN
 
 using namespace OSL;
 
+class CBSSRDFClosure : public CClosurePrimitive {
+public:
+	Bssrdf params;
+	float3 radius;
+	float3 albedo;
+
+	void alloc(ShaderData *sd, int path_flag, float3 weight, ClosureType type)
+	{
+		float sample_weight = fabsf(average(weight));
+
+		/* disable in case of diffuse ancestor, can't see it well then and
+		 * adds considerable noise due to probabilities of continuing path
+		 * getting lower and lower */
+		if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR) {
+			radius = make_float3(0.0f, 0.0f, 0.0f);
+		}
+
+		if(sample_weight > CLOSURE_WEIGHT_CUTOFF) {
+			/* sharpness */
+			float sharpness = params.sharpness;
+			/* texture color blur */
+			float texture_blur = params.texture_blur;
+
+			/* create one closure per color channel */
+			Bssrdf *bssrdf = bssrdf_alloc(sd, make_float3(weight.x, 0.0f, 0.0f));
+			if(bssrdf) {
+				bssrdf->sample_weight = sample_weight;
+				bssrdf->radius = radius.x;
+				bssrdf->texture_blur = texture_blur;
+				bssrdf->albedo = albedo.x;
+				bssrdf->sharpness = sharpness;
+				bssrdf->N = params.N;
+				ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, type);
+			}
+
+			bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f));
+			if(bssrdf) {
+				bssrdf->sample_weight = sample_weight;
+				bssrdf->radius = radius.y;
+				bssrdf->texture_blur = texture_blur;
+				bssrdf->albedo = albedo.y;
+				bssrdf->sharpness = sharpness;
+				bssrdf->N = params.N;
+				ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, type);
+			}
+
+			bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z));
+			if(bssrdf) {
+				bssrdf->sample_weight = sample_weight;
+				bssrdf->radius = radius.z;
+				bssrdf->texture_blur = texture_blur;
+				bssrdf->albedo = albedo.z;
+				bssrdf->sharpness = sharpness;
+				bssrdf->N = params.N;
+				ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, type);
+			}
+		}
+	}
+};
+
 /* Cubic */
 
 class CubicBSSRDFClosure : public CBSSRDFClosure {
 public:
-	CubicBSSRDFClosure()
-	{}
-
-	void setup()
+	void setup(ShaderData *sd, int path_flag, float3 weight)
 	{
-		sc.type = CLOSURE_BSSRDF_CUBIC_ID;
-		sc.data0 = fabsf(average(radius));
+		alloc(sd, path_flag, weight, CLOSURE_BSSRDF_CUBIC_ID);
 	}
 };
 
 ClosureParam *closure_bssrdf_cubic_params()
 {
 	static ClosureParam params[] = {
-		CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, sc.N),
+		CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, params.N),
 		CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, radius),
-		CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.data1),
-		CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.T.x),
+		CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, params.texture_blur),
+		CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, params.sharpness),
 		CLOSURE_STRING_KEYPARAM(CubicBSSRDFClosure, label, "label"),
 		CLOSURE_FINISH_PARAM(CubicBSSRDFClosure)
 	};
@@ -81,22 +135,18 @@ CCLOSURE_PREPARE(closure_bssrdf_cubic_prepare, CubicBSSRDFClosure)
 
 class GaussianBSSRDFClosure : public CBSSRDFClosure {
 public:
-	GaussianBSSRDFClosure()
-	{}
-
-	void setup()
+	void setup(ShaderData *sd, int path_flag, float3 weight)
 	{
-		sc.type = CLOSURE_BSSRDF_GAUSSIAN_ID;
-		sc.data0 = fabsf(average(radius));
+		alloc(sd, path_flag, weight, CLOSURE_BSSRDF_GAUSSIAN_ID);
 	}
 };
 
 ClosureParam *closure_bssrdf_gaussian_params()
 {
 	static ClosureParam params[] = {
-		CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, sc.N),
+		CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, params.N),
 		CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, radius),
-		CLOSURE_FLOAT_PARAM(GaussianBSSRDFClosure, sc.data1),
+		CLOSURE_FLOAT_PARAM(GaussianBSSRDFClosure, params.texture_blur),
 		CLOSURE_STRING_KEYPARAM(GaussianBSSRDFClosure, label, "label"),
 		CLOSURE_FINISH_PARAM(GaussianBSSRDFClosure)
 	};
@@ -109,22 +159,18 @@ CCLOSURE_PREPARE(closure_bssrdf_gaussian_prepare, GaussianBSSRDFClosure)
 
 class BurleyBSSRDFClosure : public CBSSRDFClosure {
 public:
-	BurleyBSSRDFClosure()
-	{}
-
-	void setup()
+	void setup(ShaderData *sd, int path_flag, float3 weight)
 	{
-		sc.type = CLOSURE_BSSRDF_BURLEY_ID;
-		sc.data0 = fabsf(average(radius));
+		alloc(sd, path_flag, weight, CLOSURE_BSSRDF_BURLEY_ID);
 	}
 };
 
 ClosureParam *closure_bssrdf_burley_params()
 {
 	static ClosureParam params[] = {
-		CLOSURE_FLOAT3_PARAM(BurleyBSSRDFClosure, sc.N),
+		CLOSURE_FLOAT3_PARAM(BurleyBSSRDFClosure, params.N),
 		CLOSURE_FLOAT3_PARAM(BurleyBSSRDFClosure, radius),
-		CLOSURE_FLOAT_PARAM(BurleyBSSRDFClosure, sc.data1),
+		CLOSURE_FLOAT_PARAM(BurleyBSSRDFClosure, params.texture_blur),
 		CLOSURE_FLOAT3_PARAM(BurleyBSSRDFClosure, albedo),
 		CLOSURE_STRING_KEYPARAM(BurleyBSSRDFClosure, label, "label"),
 		CLOSURE_FINISH_PARAM(BurleyBSSRDFClosure)
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.h b/intern/cycles/kernel/osl/osl_bssrdf.h
deleted file mode 100644
index d81ecade543..00000000000
--- a/intern/cycles/kernel/osl/osl_bssrdf.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Adapted from Open Shading Language with this license:
- *
- * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
- * All Rights Reserved.
- *
- * Modifications Copyright 2011, Blender Foundation.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- * * Redistributions of source code must retain the above copyright
- *   notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- *   notice, this list of conditions and the following disclaimer in the
- *   documentation and/or other materials provided with the distribution.
- * * Neither the name of Sony Pictures Imageworks nor the names of its
- *   contributors may be used to endorse or promote products derived from
- *   this software without specific prior written permission.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __OSL_BSSRDF_H__
-#define __OSL_BSSRDF_H__
-
-#include <OSL/oslclosure.h>
-#include <OSL/oslexec.h>
-#include <OSL/genclosure.h>
-
-#include "osl_closures.h"
-
-#include "kernel_types.h"
-
-#include "util_types.h"
-
-CCL_NAMESPACE_BEGIN
-
-class CBSSRDFClosure : public CClosurePrimitive {
-public:
-	ShaderClosure sc;
-	float3 radius;
-	float3 albedo;
-
-	CBSSRDFClosure() : CClosurePrimitive(BSSRDF) { }
-	int scattering() const { return LABEL_DIFFUSE; }
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __OSL_BSSRDF_H__ */
-
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index 02b1491489c..94de782dca0 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -46,6 +46,7 @@
 #include "kernel_montecarlo.h"
 #include "kernel_random.h"
 
+#include "closure/alloc.h"
 #include "closure/bsdf_util.h"
 #include "closure/bsdf_ashikhmin_velvet.h"
 #include "closure/bsdf_diffuse.h"
@@ -66,112 +67,112 @@ using namespace OSL;
 
 /* BSDF class definitions */
 
-BSDF_CLOSURE_CLASS_BEGIN(Diffuse, diffuse, diffuse, LABEL_DIFFUSE)
-	CLOSURE_FLOAT3_PARAM(DiffuseClosure, sc.N),
+BSDF_CLOSURE_CLASS_BEGIN(Diffuse, diffuse, DiffuseBsdf, LABEL_DIFFUSE)
+	CLOSURE_FLOAT3_PARAM(DiffuseClosure, params.N),
 BSDF_CLOSURE_CLASS_END(Diffuse, diffuse)
 
-BSDF_CLOSURE_CLASS_BEGIN(Translucent, translucent, translucent, LABEL_DIFFUSE)
-	CLOSURE_FLOAT3_PARAM(TranslucentClosure, sc.N),
+BSDF_CLOSURE_CLASS_BEGIN(Translucent, translucent, DiffuseBsdf, LABEL_DIFFUSE)
+	CLOSURE_FLOAT3_PARAM(TranslucentClosure, params.N),
 BSDF_CLOSURE_CLASS_END(Translucent, translucent)
 
-BSDF_CLOSURE_CLASS_BEGIN(OrenNayar, oren_nayar, oren_nayar, LABEL_DIFFUSE)
-	CLOSURE_FLOAT3_PARAM(OrenNayarClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(OrenNayarClosure, sc.data0),
+BSDF_CLOSURE_CLASS_BEGIN(OrenNayar, oren_nayar, OrenNayarBsdf, LABEL_DIFFUSE)
+	CLOSURE_FLOAT3_PARAM(OrenNayarClosure, params.N),
+	CLOSURE_FLOAT_PARAM(OrenNayarClosure, params.roughness),
 BSDF_CLOSURE_CLASS_END(OrenNayar, oren_nayar)
 
-BSDF_CLOSURE_CLASS_BEGIN(Reflection, reflection, reflection, LABEL_SINGULAR)
-	CLOSURE_FLOAT3_PARAM(ReflectionClosure, sc.N),
+BSDF_CLOSURE_CLASS_BEGIN(Reflection, reflection, MicrofacetBsdf, LABEL_SINGULAR)
+	CLOSURE_FLOAT3_PARAM(ReflectionClosure, params.N),
 BSDF_CLOSURE_CLASS_END(Reflection, reflection)
 
-BSDF_CLOSURE_CLASS_BEGIN(Refraction, refraction, refraction, LABEL_SINGULAR)
-	CLOSURE_FLOAT3_PARAM(RefractionClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(RefractionClosure, sc.data0),
+BSDF_CLOSURE_CLASS_BEGIN(Refraction, refraction, MicrofacetBsdf, LABEL_SINGULAR)
+	CLOSURE_FLOAT3_PARAM(RefractionClosure, params.N),
+	CLOSURE_FLOAT_PARAM(RefractionClosure, params.ior),
 BSDF_CLOSURE_CLASS_END(Refraction, refraction)
 
-BSDF_CLOSURE_CLASS_BEGIN(Transparent, transparent, transparent, LABEL_SINGULAR)
+BSDF_CLOSURE_CLASS_BEGIN(Transparent, transparent, ShaderClosure, LABEL_SINGULAR)
 BSDF_CLOSURE_CLASS_END(Transparent, transparent)
 
-BSDF_CLOSURE_CLASS_BEGIN(AshikhminVelvet, ashikhmin_velvet, ashikhmin_velvet, LABEL_DIFFUSE)
-	CLOSURE_FLOAT3_PARAM(AshikhminVelvetClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(AshikhminVelvetClosure, sc.data0),
+BSDF_CLOSURE_CLASS_BEGIN(AshikhminVelvet, ashikhmin_velvet, VelvetBsdf, LABEL_DIFFUSE)
+	CLOSURE_FLOAT3_PARAM(AshikhminVelvetClosure, params.N),
+	CLOSURE_FLOAT_PARAM(AshikhminVelvetClosure, params.sigma),
 BSDF_CLOSURE_CLASS_END(AshikhminVelvet, ashikhmin_velvet)
 
-BSDF_CLOSURE_CLASS_BEGIN(AshikhminShirley, ashikhmin_shirley_aniso, ashikhmin_shirley, LABEL_GLOSSY|LABEL_REFLECT)
-	CLOSURE_FLOAT3_PARAM(AshikhminShirleyClosure, sc.N),
-	CLOSURE_FLOAT3_PARAM(AshikhminShirleyClosure, sc.T),
-	CLOSURE_FLOAT_PARAM(AshikhminShirleyClosure, sc.data0),
-	CLOSURE_FLOAT_PARAM(AshikhminShirleyClosure, sc.data1),
+BSDF_CLOSURE_CLASS_BEGIN(AshikhminShirley, ashikhmin_shirley_aniso, MicrofacetBsdf, LABEL_GLOSSY|LABEL_REFLECT)
+	CLOSURE_FLOAT3_PARAM(AshikhminShirleyClosure, params.N),
+	CLOSURE_FLOAT3_PARAM(AshikhminShirleyClosure, params.T),
+	CLOSURE_FLOAT_PARAM(AshikhminShirleyClosure, params.alpha_x),
+	CLOSURE_FLOAT_PARAM(AshikhminShirleyClosure, params.alpha_y),
 BSDF_CLOSURE_CLASS_END(AshikhminShirley, ashikhmin_shirley_aniso)
 
-BSDF_CLOSURE_CLASS_BEGIN(DiffuseToon, diffuse_toon, diffuse_toon, LABEL_DIFFUSE)
-	CLOSURE_FLOAT3_PARAM(DiffuseToonClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(DiffuseToonClosure, sc.data0),
-	CLOSURE_FLOAT_PARAM(DiffuseToonClosure, sc.data1),
+BSDF_CLOSURE_CLASS_BEGIN(DiffuseToon, diffuse_toon, ToonBsdf, LABEL_DIFFUSE)
+	CLOSURE_FLOAT3_PARAM(DiffuseToonClosure, params.N),
+	CLOSURE_FLOAT_PARAM(DiffuseToonClosure, params.size),
+	CLOSURE_FLOAT_PARAM(DiffuseToonClosure, params.smooth),
 BSDF_CLOSURE_CLASS_END(DiffuseToon, diffuse_toon)
 
-BSDF_CLOSURE_CLASS_BEGIN(GlossyToon, glossy_toon, glossy_toon, LABEL_GLOSSY)
-	CLOSURE_FLOAT3_PARAM(GlossyToonClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(GlossyToonClosure, sc.data0),
-	CLOSURE_FLOAT_PARAM(GlossyToonClosure, sc.data1),
+BSDF_CLOSURE_CLASS_BEGIN(GlossyToon, glossy_toon, ToonBsdf, LABEL_GLOSSY)
+	CLOSURE_FLOAT3_PARAM(GlossyToonClosure, params.N),
+	CLOSURE_FLOAT_PARAM(GlossyToonClosure, params.size),
+	CLOSURE_FLOAT_PARAM(GlossyToonClosure, params.smooth),
 BSDF_CLOSURE_CLASS_END(GlossyToon, glossy_toon)
 
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGX, microfacet_ggx, microfacet_ggx, LABEL_GLOSSY|LABEL_REFLECT)
-	CLOSURE_FLOAT3_PARAM(MicrofacetGGXClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(MicrofacetGGXClosure, sc.data0),
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGX, microfacet_ggx, MicrofacetBsdf, LABEL_GLOSSY|LABEL_REFLECT)
+	CLOSURE_FLOAT3_PARAM(MicrofacetGGXClosure, params.N),
+	CLOSURE_FLOAT_PARAM(MicrofacetGGXClosure, params.alpha_x),
 BSDF_CLOSURE_CLASS_END(MicrofacetGGX, microfacet_ggx)
 
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXAniso, microfacet_ggx_aniso, microfacet_ggx, LABEL_GLOSSY|LABEL_REFLECT)
-	CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoClosure, sc.N),
-	CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoClosure, sc.T),
-	CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoClosure, sc.data0),
-	CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoClosure, sc.data1),
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXAniso, microfacet_ggx_aniso, MicrofacetBsdf, LABEL_GLOSSY|LABEL_REFLECT)
+	CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoClosure, params.N),
+	CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoClosure, params.T),
+	CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoClosure, params.alpha_x),
+	CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoClosure, params.alpha_y),
 BSDF_CLOSURE_CLASS_END(MicrofacetGGXAniso, microfacet_ggx_aniso)
 
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmann, microfacet_beckmann, microfacet_beckmann, LABEL_GLOSSY|LABEL_REFLECT)
-	CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannClosure, sc.data0),
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmann, microfacet_beckmann, MicrofacetBsdf, LABEL_GLOSSY|LABEL_REFLECT)
+	CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannClosure, params.N),
+	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannClosure, params.alpha_x),
 BSDF_CLOSURE_CLASS_END(MicrofacetBeckmann, microfacet_beckmann)
 
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannAniso, microfacet_beckmann_aniso, microfacet_beckmann, LABEL_GLOSSY|LABEL_REFLECT)
-	CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannAnisoClosure, sc.N),
-	CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannAnisoClosure, sc.T),
-	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannAnisoClosure, sc.data0),
-	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannAnisoClosure, sc.data1),
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannAniso, microfacet_beckmann_aniso, MicrofacetBsdf, LABEL_GLOSSY|LABEL_REFLECT)
+	CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannAnisoClosure, params.N),
+	CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannAnisoClosure, params.T),
+	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannAnisoClosure, params.alpha_x),
+	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannAnisoClosure, params.alpha_y),
 BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannAniso, microfacet_beckmann_aniso)
 
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXRefraction, microfacet_ggx_refraction, microfacet_ggx, LABEL_GLOSSY|LABEL_TRANSMIT)
-	CLOSURE_FLOAT3_PARAM(MicrofacetGGXRefractionClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, sc.data0),
-	CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, sc.data2),
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXRefraction, microfacet_ggx_refraction, MicrofacetBsdf, LABEL_GLOSSY|LABEL_TRANSMIT)
+	CLOSURE_FLOAT3_PARAM(MicrofacetGGXRefractionClosure, params.N),
+	CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, params.alpha_x),
+	CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, params.ior),
 BSDF_CLOSURE_CLASS_END(MicrofacetGGXRefraction, microfacet_ggx_refraction)
 
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction, microfacet_beckmann, LABEL_GLOSSY|LABEL_TRANSMIT)
-	CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannRefractionClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, sc.data0),
-	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, sc.data2),
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction, MicrofacetBsdf, LABEL_GLOSSY|LABEL_TRANSMIT)
+	CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannRefractionClosure, params.N),
+	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, params.alpha_x),
+	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, params.ior),
 BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction)
 
-BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, hair_reflection, LABEL_GLOSSY)
-	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data0),
-	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data1),
-	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.T),
-	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data2),
+BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY)
+	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, unused),
+	CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness1),
+	CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness2),
+	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T),
+	CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.offset),
 BSDF_CLOSURE_CLASS_END(HairReflection, hair_reflection)
 
-BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, hair_transmission, LABEL_GLOSSY)
-	CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(HairTransmissionClosure, sc.data0),
-	CLOSURE_FLOAT_PARAM(HairTransmissionClosure, sc.data1),
-	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.T),
-	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data2),
+BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, HairBsdf, LABEL_GLOSSY)
+	CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, unused),
+	CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness1),
+	CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness2),
+	CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, params.T),
+	CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.offset),
 BSDF_CLOSURE_CLASS_END(HairTransmission, hair_transmission)
 
-VOLUME_CLOSURE_CLASS_BEGIN(VolumeHenyeyGreenstein, henyey_greenstein, LABEL_VOLUME_SCATTER)
-	CLOSURE_FLOAT_PARAM(VolumeHenyeyGreensteinClosure, sc.data0),
+VOLUME_CLOSURE_CLASS_BEGIN(VolumeHenyeyGreenstein, henyey_greenstein, HenyeyGreensteinVolume, LABEL_VOLUME_SCATTER)
+	CLOSURE_FLOAT_PARAM(VolumeHenyeyGreensteinClosure, params.g),
 VOLUME_CLOSURE_CLASS_END(VolumeHenyeyGreenstein, henyey_greenstein)
 
-VOLUME_CLOSURE_CLASS_BEGIN(VolumeAbsorption, absorption, LABEL_SINGULAR)
+VOLUME_CLOSURE_CLASS_BEGIN(VolumeAbsorption, absorption, ShaderClosure, LABEL_SINGULAR)
 VOLUME_CLOSURE_CLASS_END(VolumeAbsorption, absorption)
 
 /* Registration */
@@ -258,69 +259,64 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
 		volume_absorption_params(), volume_absorption_prepare);
 }
 
-/* Multiscattering GGX closures */
-
-class MicrofacetMultiClosure : public CBSDFClosure {
-public:
-	float3 color;
-
-	/* Technically, the MultiGGX Glass closure may also transmit.
-	 * However, since this is set statically and only used for caustic flags, this is probably as good as it gets. */
-	MicrofacetMultiClosure() : CBSDFClosure(LABEL_GLOSSY|LABEL_REFLECT)
-	{
-	}
+/* BSDF Closure */
 
-	void setup()
-	{
-		sc.prim = NULL;
-		sc.custom1 = color.x;
-		sc.custom2 = color.y;
-		sc.custom3 = color.z;
+bool CBSDFClosure::skip(const ShaderData *sd, int path_flag, int scattering)
+{
+	/* caustic options */
+	if((scattering & LABEL_GLOSSY) && (path_flag & PATH_RAY_DIFFUSE)) {
+		KernelGlobals *kg = sd->osl_globals;
+
+		if((!kernel_data.integrator.caustics_reflective && (scattering & LABEL_REFLECT)) ||
+		   (!kernel_data.integrator.caustics_refractive && (scattering & LABEL_TRANSMIT)))
+		{
+			return true;
+		}
 	}
 
-	void blur(float roughness)
-	{
-	}
+	return false;
+}
 
-	float3 eval_reflect(const float3 &omega_out, const float3 &omega_in, float& pdf) const
-	{
-		pdf = 0.0f;
-		return make_float3(0.0f, 0.0f, 0.0f);
-	}
+/* Multiscattering GGX closures */
 
-	float3 eval_transmit(const float3 &omega_out, const float3 &omega_in, float& pdf) const
-	{
-		pdf = 0.0f;
-		return make_float3(0.0f, 0.0f, 0.0f);
-	}
+class MicrofacetMultiClosure : public CBSDFClosure {
+public:
+	MicrofacetBsdf params;
+	float3 color;
 
-	int sample(const float3 &Ng,
-	           const float3 &omega_out, const float3 &domega_out_dx, const float3 &domega_out_dy,
-	           float randu, float randv,
-	           float3 &omega_in, float3 &domega_in_dx, float3 &domega_in_dy,
-	           float &pdf, float3 &eval) const
+	MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)
 	{
-		pdf = 0;
-		return LABEL_NONE;
+		/* Technically, the MultiGGX Glass closure may also transmit. However,
+		 * since this is set statically and only used for caustic flags, this
+		 * is probably as good as it gets. */
+		if(!skip(sd, path_flag, LABEL_GLOSSY|LABEL_REFLECT)) {
+			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params);
+			/* Only request the extra storage once the closure itself was allocated. */
+			MicrofacetExtra *extra = (bsdf) ? (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)) : NULL;
+			if(extra) {
+				extra->color = color;
+				bsdf->extra = extra;
+				return bsdf;
+			}
+		}
+		return NULL;
 	}
 };
 
 class MicrofacetMultiGGXClosure : public MicrofacetMultiClosure {
 public:
-	MicrofacetMultiGGXClosure() : MicrofacetMultiClosure() {}
-
-	void setup()
+	void setup(ShaderData *sd, int path_flag, float3 weight)
 	{
-		MicrofacetMultiClosure::setup();
-		m_shaderdata_flag = bsdf_microfacet_multi_ggx_setup(&sc);
+		MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+		sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_setup(bsdf) : 0;
 	}
 };
 
 ClosureParam *closure_bsdf_microfacet_multi_ggx_params()
 {
 	static ClosureParam params[] = {
-		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXClosure, sc.N),
-		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXClosure, sc.data0),
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXClosure, params.N),
+		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXClosure, params.alpha_x),
 		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXClosure, color),
 		CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXClosure, label, "label"),
 		CLOSURE_FINISH_PARAM(MicrofacetMultiGGXClosure)
@@ -331,22 +327,20 @@ CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_prepare, MicrofacetMultiGGXCl
 
 class MicrofacetMultiGGXAnisoClosure : public MicrofacetMultiClosure {
 public:
-	MicrofacetMultiGGXAnisoClosure() : MicrofacetMultiClosure() {}
-
-	void setup()
+	void setup(ShaderData *sd, int path_flag, float3 weight)
 	{
-		MicrofacetMultiClosure::setup();
-		m_shaderdata_flag = bsdf_microfacet_multi_ggx_aniso_setup(&sc);
+		MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+		sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_aniso_setup(bsdf) : 0;
 	}
 };
 
 ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_params()
 {
 	static ClosureParam params[] = {
-		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXClosure, sc.N),
-		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXClosure, sc.T),
-		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXClosure, sc.data0),
-		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXClosure, sc.data1),
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXClosure, params.N),
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXClosure, params.T),
+		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXClosure, params.alpha_x),
+		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXClosure, params.alpha_y),
 		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXClosure, color),
 		CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXClosure, label, "label"),
 		CLOSURE_FINISH_PARAM(MicrofacetMultiGGXClosure)
@@ -359,19 +353,19 @@ class MicrofacetMultiGGXGlassClosure : public MicrofacetMultiClosure {
 public:
 	MicrofacetMultiGGXGlassClosure() : MicrofacetMultiClosure() {}
 
-	void setup()
+	void setup(ShaderData *sd, int path_flag, float3 weight)
 	{
-		MicrofacetMultiClosure::setup();
-		m_shaderdata_flag = bsdf_microfacet_multi_ggx_glass_setup(&sc);
+		MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+		sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_glass_setup(bsdf) : 0;
 	}
 };
 
 ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params()
 {
 	static ClosureParam params[] = {
-		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXClosure, sc.N),
-		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXClosure, sc.data0),
-		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXClosure, sc.data2),
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXClosure, params.N),
+		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXClosure, params.alpha_x),
+		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXClosure, params.ior),
 		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXClosure, color),
 		CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXClosure, label, "label"),
 		CLOSURE_FINISH_PARAM(MicrofacetMultiGGXClosure)
diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h
index c5a1a29b6af..cd7b33703ff 100644
--- a/intern/cycles/kernel/osl/osl_closures.h
+++ b/intern/cycles/kernel/osl/osl_closures.h
@@ -90,21 +90,7 @@ void name(RendererServices *, int id, void *data) \
 
 class CClosurePrimitive {
 public:
-	enum Category {
-		BSDF,             ///< Reflective and/or transmissive surface
-		BSSRDF,           ///< Sub-surface light transfer
-		Emissive,         ///< Light emission
-		Background,       ///< Background emission
-		Volume,           ///< Volume scattering
-		Holdout,          ///< Holdout from alpha
-		AmbientOcclusion, ///< Ambient occlusion
-	};
-
-	CClosurePrimitive (Category category_) : category (category_) {}
-	virtual ~CClosurePrimitive() {}
-	virtual void setup() {}
-
-	Category category;
+	virtual void setup(ShaderData *sd, int path_flag, float3 weight) = 0;
 
 	OSL::ustring label;
 };
@@ -113,68 +99,22 @@ public:
 
 class CBSDFClosure : public CClosurePrimitive {
 public:
-	ShaderClosure sc;
-
-	CBSDFClosure(int scattering) : CClosurePrimitive(BSDF),
-	  m_scattering_label(scattering), m_shaderdata_flag(0)
-	{}
-
-	int scattering() const { return m_scattering_label; }
-	int shaderdata_flag() const { return m_shaderdata_flag; }
-
-	virtual void blur(float roughness) = 0;
-	virtual float3 eval_reflect(const float3 &omega_out, const float3 &omega_in, float &pdf) const = 0;
-	virtual float3 eval_transmit(const float3 &omega_out, const float3 &omega_in, float &pdf) const = 0;
-
-	virtual int sample(const float3 &Ng,
-	                   const float3 &omega_out, const float3 &domega_out_dx, const float3 &domega_out_dy,
-	                   float randu, float randv,
-	                   float3 &omega_in, float3 &domega_in_dx, float3 &domega_in_dy,
-	                   float &pdf, float3 &eval) const = 0;
-
-protected:
-	int m_scattering_label;
-	int m_shaderdata_flag;
+	bool skip(const ShaderData *sd, int path_flag, int scattering);
 };
 
-#define BSDF_CLOSURE_CLASS_BEGIN(Upper, lower, svmlower, TYPE) \
+#define BSDF_CLOSURE_CLASS_BEGIN(Upper, lower, structname, TYPE) \
 \
 class Upper##Closure : public CBSDFClosure { \
 public: \
-	Upper##Closure() : CBSDFClosure(TYPE) \
-	{ \
-	} \
+	structname params; \
+	float3 unused; \
 \
-	void setup() \
+	void setup(ShaderData *sd, int path_flag, float3 weight) \
 	{ \
-		sc.prim = NULL; \
-		m_shaderdata_flag = bsdf_##lower##_setup(&sc); \
-	} \
-\
-	void blur(float roughness) \
-	{ \
-	} \
-\
-	float3 eval_reflect(const float3 &omega_out, const float3 &omega_in, float& pdf) const \
-	{ \
-		pdf = 0.0f; \
-		return make_float3(0.0f, 0.0f, 0.0f); \
-	} \
-\
-	float3 eval_transmit(const float3 &omega_out, const float3 &omega_in, float& pdf) const \
-	{ \
-		pdf = 0.0f; \
-		return make_float3(0.0f, 0.0f, 0.0f); \
-	} \
-\
-	int sample(const float3 &Ng, \
-	           const float3 &omega_out, const float3 &domega_out_dx, const float3 &domega_out_dy, \
-	           float randu, float randv, \
-	           float3 &omega_in, float3 &domega_in_dx, float3 &domega_in_dy, \
-	           float &pdf, float3 &eval) const \
-	{ \
-		pdf = 0; \
-		return LABEL_NONE; \
+	    if(!skip(sd, path_flag, TYPE)) { \
+			structname *bsdf = (structname*)bsdf_alloc_osl(sd, sizeof(structname), weight, &params); \
+			sd->flag |= (bsdf) ? bsdf_##lower##_setup(bsdf) : 0; \
+		} \
 	} \
 }; \
 \
@@ -193,36 +133,18 @@ static ClosureParam *bsdf_##lower##_params() \
 \
 CCLOSURE_PREPARE_STATIC(bsdf_##lower##_prepare, Upper##Closure)
 
-
 /* Volume */
 
-class CVolumeClosure : public CClosurePrimitive {
-public:
-	ShaderClosure sc;
-
-	CVolumeClosure(int scattering) : CClosurePrimitive(Volume),
-	  m_scattering_label(scattering), m_shaderdata_flag(0)
-	{}
-	~CVolumeClosure() { }
-
-	int scattering() const { return m_scattering_label; }
-	int shaderdata_flag() const { return m_shaderdata_flag; }
-
-protected:
-	int m_scattering_label;
-	int m_shaderdata_flag;
-};
-
-#define VOLUME_CLOSURE_CLASS_BEGIN(Upper, lower, TYPE) \
+#define VOLUME_CLOSURE_CLASS_BEGIN(Upper, lower, structname, TYPE) \
 \
-class Upper##Closure : public CVolumeClosure { \
+class Upper##Closure : public CBSDFClosure { \
 public: \
-	Upper##Closure() : CVolumeClosure(TYPE) {} \
+	structname params; \
 \
-	void setup() \
+	void setup(ShaderData *sd, int path_flag, float3 weight) \
 	{ \
-		sc.prim = NULL; \
-		m_shaderdata_flag = volume_##lower##_setup(&sc); \
+	    structname *volume = (structname*)bsdf_alloc_osl(sd, sizeof(structname), weight, &params); \
+		sd->flag |= (volume) ? volume_##lower##_setup(volume) : 0; \
 	} \
 }; \
 \
diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h
index 916542ec628..8353c4e434b 100644
--- a/intern/cycles/kernel/osl/osl_globals.h
+++ b/intern/cycles/kernel/osl/osl_globals.h
@@ -59,8 +59,7 @@ struct OSLGlobals {
 	/* attributes */
 	struct Attribute {
 		TypeDesc type;
-		AttributeElement elem;
-		int offset;
+		AttributeDescriptor desc;
 		ParamValue value;
 	};
 
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 2bb2be5e6b3..153ebad6cd2 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -554,13 +554,13 @@ static bool get_mesh_element_attribute(KernelGlobals *kg, const ShaderData *sd,
 	   attr.type == TypeDesc::TypeNormal || attr.type == TypeDesc::TypeColor)
 	{
 		float3 fval[3];
-		fval[0] = primitive_attribute_float3(kg, sd, attr.elem, attr.offset,
+		fval[0] = primitive_attribute_float3(kg, sd, attr.desc,
 		                                     (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
 		return set_attribute_float3(fval, type, derivatives, val);
 	}
 	else if(attr.type == TypeDesc::TypeFloat) {
 		float fval[3];
-		fval[0] = primitive_attribute_float(kg, sd, attr.elem, attr.offset,
+		fval[0] = primitive_attribute_float(kg, sd, attr.desc,
 		                                    (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
 		return set_attribute_float(fval, type, derivatives, val);
 	}
@@ -573,7 +573,7 @@ static bool get_mesh_attribute(KernelGlobals *kg, const ShaderData *sd, const OS
                                const TypeDesc& type, bool derivatives, void *val)
 {
 	if(attr.type == TypeDesc::TypeMatrix) {
-		Transform tfm = primitive_attribute_matrix(kg, sd, attr.offset);
+		Transform tfm = primitive_attribute_matrix(kg, sd, attr.desc);
 		return set_attribute_matrix(tfm, type, val);
 	}
 	else {
@@ -787,7 +787,7 @@ bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring
                                       TypeDesc type, ustring name, void *val)
 {
 	KernelGlobals *kg = sd->osl_globals;
-	bool is_curve;
+	int prim_type = 0;
 	int object;
 
 	/* lookup of attribute on another object */
@@ -798,25 +798,24 @@ bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring
 			return false;
 
 		object = it->second;
-		is_curve = false;
 	}
 	else {
 		object = sd->object;
-		is_curve = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
+		prim_type = attribute_primitive_type(kg, sd);
 
 		if(object == OBJECT_NONE)
 			return get_background_attribute(kg, sd, name, type, derivatives, val);
 	}
 
 	/* find attribute on object */
-	object = object*ATTR_PRIM_TYPES + (is_curve == true);
+	object = object*ATTR_PRIM_TYPES + prim_type;
 	OSLGlobals::AttributeMap& attribute_map = kg->osl->attribute_map[object];
 	OSLGlobals::AttributeMap::iterator it = attribute_map.find(name);
 
 	if(it != attribute_map.end()) {
 		const OSLGlobals::Attribute& attr = it->second;
 
-		if(attr.elem != ATTR_ELEMENT_OBJECT) {
+		if(attr.desc.element != ATTR_ELEMENT_OBJECT) {
 			/* triangle and vertex attributes */
 			if(get_mesh_element_attribute(kg, sd, attr, type, derivatives, val))
 				return true;
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 6cde7419e10..20dd167708c 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -23,10 +23,6 @@
 
 #include "geom/geom_object.h"
 
-#include "closure/bsdf_diffuse.h"
-#include "closure/bssrdf.h"
-
-#include "osl_bssrdf.h"
 #include "osl_closures.h"
 #include "osl_globals.h"
 #include "osl_services.h"
@@ -141,8 +137,10 @@ static void shaderdata_to_shaderglobals(KernelGlobals *kg, ShaderData *sd, PathS
 
 /* Surface */
 
-static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
-                                         const OSL::ClosureColor *closure, float3 weight = make_float3(1.0f, 1.0f, 1.0f))
+static void flatten_surface_closure_tree(ShaderData *sd,
+                                         int path_flag,
+                                         const OSL::ClosureColor *closure,
+                                         float3 weight = make_float3(1.0f, 1.0f, 1.0f))
 {
 	/* OSL gives us a closure tree, we flatten it into arrays per
 	 * closure type, for evaluation, sampling, etc later on. */
@@ -164,164 +162,10 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
 			CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
 
 			if(prim) {
-				ShaderClosure sc;
-
 #ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS
 				weight = weight*TO_FLOAT3(comp->w);
 #endif
-				sc.weight = weight;
-
-				prim->setup();
-
-				switch(prim->category) {
-					case CClosurePrimitive::BSDF: {
-						CBSDFClosure *bsdf = (CBSDFClosure *)prim;
-						int scattering = bsdf->scattering();
-						int shaderdata_flag = bsdf->shaderdata_flag();
-
-						/* caustic options */
-						if((scattering & LABEL_GLOSSY) && (path_flag & PATH_RAY_DIFFUSE)) {
-							KernelGlobals *kg = sd->osl_globals;
-
-							if((!kernel_data.integrator.caustics_reflective && (scattering & LABEL_REFLECT)) ||
-							   (!kernel_data.integrator.caustics_refractive && (scattering & LABEL_TRANSMIT)))
-							{
-								return;
-							}
-						}
-
-						/* sample weight */
-						float sample_weight = fabsf(average(weight));
-
-						sc.sample_weight = sample_weight;
-
-						sc.type = bsdf->sc.type;
-						sc.N = bsdf->sc.N;
-						sc.T = bsdf->sc.T;
-						sc.data0 = bsdf->sc.data0;
-						sc.data1 = bsdf->sc.data1;
-						sc.data2 = bsdf->sc.data2;
-						sc.prim = bsdf->sc.prim;
-						if(shaderdata_flag & SD_BSDF_HAS_CUSTOM) {
-							sc.custom1 = bsdf->sc.custom1;
-							sc.custom2 = bsdf->sc.custom2;
-							sc.custom3 = bsdf->sc.custom3;
-						}
-
-						/* add */
-						if(sc.sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) {
-							sd->closure[sd->num_closure++] = sc;
-							sd->flag |= shaderdata_flag;
-						}
-						break;
-					}
-					case CClosurePrimitive::Emissive: {
-						/* sample weight */
-						float sample_weight = fabsf(average(weight));
-
-						sc.sample_weight = sample_weight;
-						sc.type = CLOSURE_EMISSION_ID;
-						sc.data0 = 0.0f;
-						sc.data1 = 0.0f;
-						sc.data2 = 0.0f;
-						sc.prim = NULL;
-
-						/* flag */
-						if(sd->num_closure < MAX_CLOSURE) {
-							sd->closure[sd->num_closure++] = sc;
-							sd->flag |= SD_EMISSION;
-						}
-						break;
-					}
-					case CClosurePrimitive::AmbientOcclusion: {
-						/* sample weight */
-						float sample_weight = fabsf(average(weight));
-
-						sc.sample_weight = sample_weight;
-						sc.type = CLOSURE_AMBIENT_OCCLUSION_ID;
-						sc.data0 = 0.0f;
-						sc.data1 = 0.0f;
-						sc.data2 = 0.0f;
-						sc.prim = NULL;
-
-						if(sd->num_closure < MAX_CLOSURE) {
-							sd->closure[sd->num_closure++] = sc;
-							sd->flag |= SD_AO;
-						}
-						break;
-					}
-					case CClosurePrimitive::Holdout: {
-						sc.sample_weight = 0.0f;
-						sc.type = CLOSURE_HOLDOUT_ID;
-						sc.data0 = 0.0f;
-						sc.data1 = 0.0f;
-						sc.data2 = 0.0f;
-						sc.prim = NULL;
-
-						if(sd->num_closure < MAX_CLOSURE) {
-							sd->closure[sd->num_closure++] = sc;
-							sd->flag |= SD_HOLDOUT;
-						}
-						break;
-					}
-					case CClosurePrimitive::BSSRDF: {
-						CBSSRDFClosure *bssrdf = (CBSSRDFClosure *)prim;
-						float sample_weight = fabsf(average(weight));
-
-						if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure+2 < MAX_CLOSURE) {
-							sc.sample_weight = sample_weight;
-
-							sc.type = bssrdf->sc.type;
-							sc.N = bssrdf->sc.N;
-							sc.data1 = bssrdf->sc.data1;
-							sc.T.x = bssrdf->sc.T.x;
-							sc.prim = NULL;
-
-							/* disable in case of diffuse ancestor, can't see it well then and
-							 * adds considerably noise due to probabilities of continuing path
-							 * getting lower and lower */
-							if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR)
-								bssrdf->radius = make_float3(0.0f, 0.0f, 0.0f);
-
-							float3 albedo =
-							        (bssrdf->sc.type == CLOSURE_BSSRDF_BURLEY_ID)
-							                ? bssrdf->albedo
-							                : make_float3(0.0f, 0.0f, 0.0f);
-
-							/* create one closure for each color channel */
-							if(fabsf(weight.x) > 0.0f) {
-								sc.weight = make_float3(weight.x, 0.0f, 0.0f);
-								sc.data0 = bssrdf->radius.x;
-								sc.data1 = 0.0f;
-								sc.data2 = albedo.x;
-								sd->flag |= bssrdf_setup(&sc, sc.type);
-								sd->closure[sd->num_closure++] = sc;
-							}
-
-							if(fabsf(weight.y) > 0.0f) {
-								sc.weight = make_float3(0.0f, weight.y, 0.0f);
-								sc.data0 = bssrdf->radius.y;
-								sc.data1 = 0.0f;
-								sc.data2 = albedo.y;
-								sd->flag |= bssrdf_setup(&sc, sc.type);
-								sd->closure[sd->num_closure++] = sc;
-							}
-
-							if(fabsf(weight.z) > 0.0f) {
-								sc.weight = make_float3(0.0f, 0.0f, weight.z);
-								sc.data0 = bssrdf->radius.z;
-								sc.data1 = 0.0f;
-								sc.data2 = albedo.z;
-								sd->flag |= bssrdf_setup(&sc, sc.type);
-								sd->closure[sd->num_closure++] = sc;
-							}
-						}
-						break;
-					}
-					case CClosurePrimitive::Background:
-					case CClosurePrimitive::Volume:
-						break; /* not relevant */
-				}
+				prim->setup(sd, path_flag, weight);
 			}
 			break;
 		}
@@ -351,7 +195,9 @@ void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state
 
 /* Background */
 
-static float3 flatten_background_closure_tree(const OSL::ClosureColor *closure)
+static void flatten_background_closure_tree(ShaderData *sd,
+                                            const OSL::ClosureColor *closure,
+                                            float3 weight = make_float3(1.0f, 1.0f, 1.0f))
 {
 	/* OSL gives us a closure tree, if we are shading for background there
 	 * is only one supported closure type at the moment, which has no evaluation
@@ -360,32 +206,32 @@ static float3 flatten_background_closure_tree(const OSL::ClosureColor *closure)
 	switch(closure->id) {
 		case OSL::ClosureColor::MUL: {
 			OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
-
-			return TO_FLOAT3(mul->weight) * flatten_background_closure_tree(mul->closure);
+			flatten_background_closure_tree(sd, mul->closure, weight * TO_FLOAT3(mul->weight));
+			break;
 		}
 		case OSL::ClosureColor::ADD: {
 			OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
 
-			return flatten_background_closure_tree(add->closureA) +
-			       flatten_background_closure_tree(add->closureB);
+			flatten_background_closure_tree(sd, add->closureA, weight);
+			flatten_background_closure_tree(sd, add->closureB, weight);
+			break;
 		}
 		default: {
 			OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure;
 			CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
 
-			if(prim && prim->category == CClosurePrimitive::Background)
+			if(prim) {
 #ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS
-				return TO_FLOAT3(comp->w);
-#else
-				return make_float3(1.0f, 1.0f, 1.0f);
+				weight = weight*TO_FLOAT3(comp->w);
 #endif
+				prim->setup(sd, 0, weight);
+			}
+			break;
 		}
 	}
-
-	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-float3 OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx)
+void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx)
 {
 	/* setup shader globals from shader data */
 	OSLThreadData *tdata = kg->osl_tdata;
@@ -402,15 +248,14 @@ float3 OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *
 
 	/* return background color immediately */
 	if(globals->Ci)
-		return flatten_background_closure_tree(globals->Ci);
-
-	return make_float3(0.0f, 0.0f, 0.0f);
+		flatten_background_closure_tree(sd, globals->Ci);
 }
 
 /* Volume */
 
 static void flatten_volume_closure_tree(ShaderData *sd,
-                                        const OSL::ClosureColor *closure, float3 weight = make_float3(1.0f, 1.0f, 1.0f))
+                                        const OSL::ClosureColor *closure,
+                                        float3 weight = make_float3(1.0f, 1.0f, 1.0f))
 {
 	/* OSL gives us a closure tree, we flatten it into arrays per
 	 * closure type, for evaluation, sampling, etc later on. */
@@ -432,60 +277,10 @@ static void flatten_volume_closure_tree(ShaderData *sd,
 			CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
 
 			if(prim) {
-				ShaderClosure sc;
-
 #ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS
 				weight = weight*TO_FLOAT3(comp->w);
 #endif
-				sc.weight = weight;
-
-				prim->setup();
-
-				switch(prim->category) {
-					case CClosurePrimitive::Volume: {
-						CVolumeClosure *volume = (CVolumeClosure *)prim;
-						/* sample weight */
-						float sample_weight = fabsf(average(weight));
-
-						sc.sample_weight = sample_weight;
-						sc.type = volume->sc.type;
-						sc.data0 = volume->sc.data0;
-						sc.data1 = volume->sc.data1;
-
-						/* add */
-						if((sc.sample_weight > CLOSURE_WEIGHT_CUTOFF) &&
-						   (sd->num_closure < MAX_CLOSURE))
-						{
-							sd->closure[sd->num_closure++] = sc;
-							sd->flag |= volume->shaderdata_flag();
-						}
-						break;
-					}
-					case CClosurePrimitive::Emissive: {
-						/* sample weight */
-						float sample_weight = fabsf(average(weight));
-
-						sc.sample_weight = sample_weight;
-						sc.type = CLOSURE_EMISSION_ID;
-						sc.data0 = 0.0f;
-						sc.data1 = 0.0f;
-						sc.prim = NULL;
-
-						/* flag */
-						if(sd->num_closure < MAX_CLOSURE) {
-							sd->closure[sd->num_closure++] = sc;
-							sd->flag |= SD_EMISSION;
-						}
-						break;
-					}
-					case CClosurePrimitive::Holdout:
-						break; /* not implemented */
-					case CClosurePrimitive::Background:
-					case CClosurePrimitive::BSDF:
-					case CClosurePrimitive::BSSRDF:
-					case CClosurePrimitive::AmbientOcclusion:
-						break; /* not relevant */
-				}
+				prim->setup(sd, 0, weight);
 			}
 		}
 	}
@@ -537,43 +332,9 @@ void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderConte
 	sd->P = TO_FLOAT3(globals->P);
 }
 
-/* BSDF Closure */
-
-int OSLShader::bsdf_sample(const ShaderData *sd, const ShaderClosure *sc, float randu, float randv, float3& eval, float3& omega_in, differential3& domega_in, float& pdf)
-{
-	CBSDFClosure *sample_bsdf = (CBSDFClosure *)sc->prim;
-
-	pdf = 0.0f;
-
-	return sample_bsdf->sample(sd->Ng,
-	                           sd->I, sd->dI.dx, sd->dI.dy,
-	                           randu, randv,
-	                           omega_in, domega_in.dx, domega_in.dy,
-	                           pdf, eval);
-}
-
-float3 OSLShader::bsdf_eval(const ShaderData *sd, const ShaderClosure *sc, const float3& omega_in, float& pdf)
-{
-	CBSDFClosure *bsdf = (CBSDFClosure *)sc->prim;
-	float3 bsdf_eval;
-
-	if(dot(sd->Ng, omega_in) >= 0.0f)
-		bsdf_eval = bsdf->eval_reflect(sd->I, omega_in, pdf);
-	else
-		bsdf_eval = bsdf->eval_transmit(sd->I, omega_in, pdf);
-	
-	return bsdf_eval;
-}
-
-void OSLShader::bsdf_blur(ShaderClosure *sc, float roughness)
-{
-	CBSDFClosure *bsdf = (CBSDFClosure *)sc->prim;
-	bsdf->blur(roughness);
-}
-
 /* Attributes */
 
-int OSLShader::find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeElement *elem)
+int OSLShader::find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeDescriptor *desc)
 {
 	/* for OSL, a hash map is used to lookup the attribute by name. */
 	int object = sd->object*ATTR_PRIM_TYPES;
@@ -587,16 +348,23 @@ int OSLShader::find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id,
 
 	if(it != attr_map.end()) {
 		const OSLGlobals::Attribute &osl_attr = it->second;
-		*elem = osl_attr.elem;
+		*desc = osl_attr.desc;
 
-		if(sd->prim == PRIM_NONE && (AttributeElement)osl_attr.elem != ATTR_ELEMENT_MESH)
+		if(sd->prim == PRIM_NONE && (AttributeElement)osl_attr.desc.element != ATTR_ELEMENT_MESH) {
+			desc->offset = ATTR_STD_NOT_FOUND;
 			return ATTR_STD_NOT_FOUND;
+		}
 
 		/* return result */
-		return (osl_attr.elem == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : osl_attr.offset;
+		if(osl_attr.desc.element == ATTR_ELEMENT_NONE) {
+			desc->offset = ATTR_STD_NOT_FOUND;
+		}
+		return desc->offset;
 	}
-	else
+	else {
+		desc->offset = ATTR_STD_NOT_FOUND;
 		return (int)ATTR_STD_NOT_FOUND;
+	}
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h
index 7d26cd40da5..ad06dd6929d 100644
--- a/intern/cycles/kernel/osl/osl_shader.h
+++ b/intern/cycles/kernel/osl/osl_shader.h
@@ -54,20 +54,12 @@ public:
 
 	/* eval */
 	static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx);
-	static float3 eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx);
+	static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx);
 	static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx);
 	static void eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx);
 
-	/* sample & eval */
-	static int bsdf_sample(const ShaderData *sd, const ShaderClosure *sc,
-	                       float randu, float randv,
-	                       float3& eval, float3& omega_in, differential3& domega_in, float& pdf);
-	static float3 bsdf_eval(const ShaderData *sd, const ShaderClosure *sc,
-	                        const float3& omega_in, float& pdf);
-	static void bsdf_blur(ShaderClosure *sc, float roughness);
-
 	/* attributes */
-	static int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeElement *elem);
+	static int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeDescriptor *desc);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/shaders/node_hair_bsdf.osl b/intern/cycles/kernel/shaders/node_hair_bsdf.osl
index c8cb88f0c0b..ef8f2fae894 100644
--- a/intern/cycles/kernel/shaders/node_hair_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_hair_bsdf.osl
@@ -24,34 +24,35 @@ shader node_hair_bsdf(
 	float Offset = 0.0,
 	float RoughnessU = 0.1,
 	float RoughnessV = 1.0,
-	normal Normal = Ng,
+	normal Tangent = normal(0, 0, 0),
 	output closure color BSDF = 0)
 {
-	float IsStrand;
 	float roughnessh = clamp(RoughnessU, 0.001, 1.0);
 	float roughnessv = clamp(RoughnessV, 0.001, 1.0);
-	getattribute("geom:is_curve", IsStrand);
+	float offset = -Offset;
 
-	if (!IsStrand) {
-		if (backfacing()) {
-			BSDF = transparent();
-		}
-		else {
-			if (component == "reflection")
-				BSDF = Color * hair_reflection(Normal, roughnessh, roughnessv, normalize(dPdv), 0.0);
-			else
-				BSDF = Color * hair_transmission(Normal, roughnessh, roughnessv, normalize(dPdv), 0.0);
-		}
+	normal T;
+	float IsCurve = 0;
+	getattribute("geom:is_curve", IsCurve);
+
+	if (isconnected(Tangent)) {
+		T = Tangent;
+	}
+	else if(!IsCurve) {
+		T = normalize(dPdv);
+		offset = 0.0;
+	}
+	else {
+		T = normalize(dPdu);
+	}
+
+	if (backfacing() && IsCurve) {
+		BSDF = transparent();
 	}
 	else {
-		if (backfacing()) {
-			BSDF = transparent();
-		}
-		else {
-			if (component == "reflection")
-				BSDF = Color * hair_reflection(Normal, roughnessh, roughnessv, dPdu, -Offset);
-			else
-				BSDF = Color * hair_transmission(Normal, roughnessh, roughnessv, dPdu, -Offset);
-		}
+		if (component == "reflection")
+			BSDF = Color * hair_reflection(Ng, roughnessh, roughnessv, T, offset);
+		else
+			BSDF = Color * hair_transmission(Ng, roughnessh, roughnessv, T, offset);
 	}
 }
diff --git a/intern/cycles/kernel/shaders/node_ramp_util.h b/intern/cycles/kernel/shaders/node_ramp_util.h
new file mode 100644
index 00000000000..917fb65c6df
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_ramp_util.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* NOTE: svm_ramp.h, svm_ramp_util.h and node_ramp_util.h must stay consistent */
+
+color rgb_ramp_lookup(color ramp[], float at, int interpolate, int extrapolate)
+{
+	float f = at;
+	int table_size = arraylength(ramp);
+
+	if ((f < 0.0 || f > 1.0) && extrapolate) {
+		color t0, dy;
+		if (f < 0.0) {
+			t0 = ramp[0];
+			dy = t0 - ramp[1];
+			f = -f;
+		}
+		else {
+			t0 = ramp[table_size - 1];
+			dy = t0 - ramp[table_size - 2];
+			f = f - 1.0;
+		}
+		return t0 + dy * f * (table_size - 1);
+	}
+
+	f = clamp(at, 0.0, 1.0) * (table_size - 1);
+
+	/* clamp int as well in case of NaN */
+	int i = (int)f;
+	if (i < 0) i = 0;
+	if (i >= table_size) i = table_size - 1;
+	float t = f - (float)i;
+
+	color result = ramp[i];
+
+	if (interpolate && t > 0.0)
+		result = (1.0 - t) * result + t * ramp[i + 1];
+
+	return result;
+}
+
+float rgb_ramp_lookup(float ramp[], float at, int interpolate, int extrapolate)
+{
+	float f = at;
+	int table_size = arraylength(ramp);
+
+	if ((f < 0.0 || f > 1.0) && extrapolate) {
+		float t0, dy;
+		if (f < 0.0) {
+			t0 = ramp[0];
+			dy = t0 - ramp[1];
+			f = -f;
+		}
+		else {
+			t0 = ramp[table_size - 1];
+			dy = t0 - ramp[table_size - 2];
+			f = f - 1.0;
+		}
+		return t0 + dy * f * (table_size - 1);
+	}
+
+	f = clamp(at, 0.0, 1.0) * (table_size - 1);
+
+	/* clamp int as well in case of NaN */
+	int i = (int)f;
+	if (i < 0) i = 0;
+	if (i >= table_size) i = table_size - 1;
+	float t = f - (float)i;
+
+	float result = ramp[i];
+
+	if (interpolate && t > 0.0)
+		result = (1.0 - t) * result + t * ramp[i + 1];
+	
+	return result;
+}
diff --git a/intern/cycles/kernel/shaders/node_rgb_curves.osl b/intern/cycles/kernel/shaders/node_rgb_curves.osl
index 8e208e8a8f7..c8e7e4f175b 100644
--- a/intern/cycles/kernel/shaders/node_rgb_curves.osl
+++ b/intern/cycles/kernel/shaders/node_rgb_curves.osl
@@ -14,43 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
-#include "oslutil.h"
-
-float ramp_lookup(color ramp[], float at, int component)
-{
-	int table_size = arraylength(ramp);
-
-	if (at < 0.0 || at > 1.0) {
-		float t0, dy;
-		if (at < 0.0) {
-			t0 = ramp[0][component];
-			dy = t0 - ramp[1][component];
-			at = -at;
-		}
-		else {
-			t0 = ramp[table_size - 1][component];
-			dy = t0 - ramp[table_size - 2][component];
-			at = at - 1.0;
-		}
-		return t0 + dy * at * (table_size - 1);
-	}
-
-	float f = clamp(at, 0.0, 1.0) * (table_size - 1);
-
-	/* clamp int as well in case of NaN */
-	int i = (int)f;
-	if (i < 0) i = 0;
-	if (i >= table_size) i = table_size - 1;
-	float t = f - (float)i;
-
-	float result = ramp[i][component];
-
-	if (t > 0.0)
-		result = (1.0 - t) * result + t * ramp[i + 1][component];
-	
-	return result;
-}
+#include "node_ramp_util.h"
 
 shader node_rgb_curves(
 	color ramp[] = {0.0},
@@ -63,9 +27,13 @@ shader node_rgb_curves(
 {
 	color c = (ColorIn - color(min_x, min_x, min_x)) / (max_x - min_x);
 
-	ColorOut[0] = ramp_lookup(ramp, c[0], 0);
-	ColorOut[1] = ramp_lookup(ramp, c[1], 1);
-	ColorOut[2] = ramp_lookup(ramp, c[2], 2);
+	/* One lookup per channel, each at that channel's own input; only the matching component is kept. */
+	color r = rgb_ramp_lookup(ramp, c[0], 1, 1);
+	color g = rgb_ramp_lookup(ramp, c[1], 1, 1);
+	color b = rgb_ramp_lookup(ramp, c[2], 1, 1);
+	ColorOut[0] = r[0];
+	ColorOut[1] = g[1];
+	ColorOut[2] = b[2];
 
 	ColorOut = mix(ColorIn, ColorOut, Fac);
 }
diff --git a/intern/cycles/kernel/shaders/node_rgb_ramp.osl b/intern/cycles/kernel/shaders/node_rgb_ramp.osl
index c0ae74d6b33..24b8728b999 100644
--- a/intern/cycles/kernel/shaders/node_rgb_ramp.osl
+++ b/intern/cycles/kernel/shaders/node_rgb_ramp.osl
@@ -14,8 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
-#include "oslutil.h"
+#include "node_ramp_util.h"
 
 shader node_rgb_ramp(
 	color ramp_color[] = {0.0},
@@ -26,21 +25,7 @@ shader node_rgb_ramp(
 	output color Color = 0.0,
 	output float Alpha = 1.0)
 {
-	int table_size = arraylength(ramp_color);
-	float f = clamp(Fac, 0.0, 1.0) * (table_size - 1);
-
-	/* clamp int as well in case of NaN */
-	int i = (int)f;
-	if (i < 0) i = 0;
-	if (i >= table_size) i = table_size - 1;
-	float t = f - (float)i;
-
-	Color = ramp_color[i];
-	Alpha = ramp_alpha[i];
-
-	if (interpolate && t > 0.0) {
-		Color = (1.0 - t) * Color + t * ramp_color[i + 1];
-		Alpha = (1.0 - t) * Alpha + t * ramp_alpha[i + 1];
-	}
+	Color = rgb_ramp_lookup(ramp_color, Fac, interpolate, 0);
+	Alpha = rgb_ramp_lookup(ramp_alpha, Fac, interpolate, 0);
 }
 
diff --git a/intern/cycles/kernel/shaders/node_vector_curves.osl b/intern/cycles/kernel/shaders/node_vector_curves.osl
index cff4efe1d98..d92fa11d439 100644
--- a/intern/cycles/kernel/shaders/node_vector_curves.osl
+++ b/intern/cycles/kernel/shaders/node_vector_curves.osl
@@ -14,43 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
-#include "oslutil.h"
-
-float ramp_lookup(color ramp[], float at, int component)
-{
-	int table_size = arraylength(ramp);
-
-	if (at < 0.0 || at > 1.0) {
-		float t0, dy;
-		if (at < 0.0) {
-			t0 = ramp[0][component];
-			dy = t0 - ramp[1][component];
-			at = -at;
-		}
-		else {
-			t0 = ramp[table_size - 1][component];
-			dy = t0 - ramp[table_size - 2][component];
-			at = at - 1.0;
-		}
-		return t0 + dy * at * (table_size - 1);
-	}
-
-	float f = clamp(at, 0.0, 1.0) * (table_size - 1);
-
-	/* clamp int as well in case of NaN */
-	int i = (int)f;
-	if (i < 0) i = 0;
-	if (i >= table_size) i = table_size - 1;
-	float t = f - (float)i;
-
-	float result = ramp[i][component];
-
-	if (t > 0.0)
-		result = (1.0 - t) * result + t * ramp[i + 1][component];
-	
-	return result;
-}
+#include "node_ramp_util.h"
 
 shader node_vector_curves(
 	color ramp[] = {0.0},
@@ -63,9 +27,13 @@ shader node_vector_curves(
 {
 	vector c = (VectorIn - vector(min_x, min_x, min_x)) / (max_x - min_x);
 
-	VectorOut[0] = ramp_lookup(ramp, c[0], 0);
-	VectorOut[1] = ramp_lookup(ramp, c[1], 1);
-	VectorOut[2] = ramp_lookup(ramp, c[2], 2);
+	/* Each output component is read from the curve evaluated at that component's own input. */
+	color r = rgb_ramp_lookup(ramp, c[0], 1, 1);
+	color g = rgb_ramp_lookup(ramp, c[1], 1, 1);
+	color b = rgb_ramp_lookup(ramp, c[2], 1, 1);
+	VectorOut[0] = r[0];
+	VectorOut[1] = g[1];
+	VectorOut[2] = b[2];
 
 	VectorOut = mix(VectorIn, VectorOut, Fac);
 }
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index de7e03e5a19..502994e71f1 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -405,10 +405,8 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_a
 
 #if NODES_GROUP(NODE_GROUP_LEVEL_3)
 			case NODE_RGB_CURVES:
-				svm_node_rgb_curves(kg, sd, stack, node, &offset);
-				break;
 			case NODE_VECTOR_CURVES:
-				svm_node_vector_curves(kg, sd, stack, node, &offset);
+				svm_node_curves(kg, sd, stack, node, &offset);
 				break;
 			case NODE_TANGENT:
 				svm_node_tangent(kg, sd, stack, node);
diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h
index 63bbb27d873..de978a423b4 100644
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@@ -18,144 +18,136 @@ CCL_NAMESPACE_BEGIN
 
 /* Attribute Node */
 
-ccl_device void svm_node_attr_init(KernelGlobals *kg, ShaderData *sd,
+ccl_device AttributeDescriptor svm_node_attr_init(KernelGlobals *kg, ShaderData *sd,
 	uint4 node, NodeAttributeType *type,
-	NodeAttributeType *mesh_type, AttributeElement *elem, int *offset, uint *out_offset)
+	uint *out_offset) /* out: stack offset taken from node.z */
 {
 	*out_offset = node.z;
 	*type = (NodeAttributeType)node.w;
+
+	AttributeDescriptor desc; /* returned by value; every branch below fills it in */
+
 	if(ccl_fetch(sd, object) != OBJECT_NONE) {
-		/* find attribute by unique id */
-		uint id = node.y;
-		uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride;
-#ifdef __HAIR__
-		attr_offset = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
-#endif
-		uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
-		
-		while(attr_map.x != id) {
-			if(UNLIKELY(attr_map.x == ATTR_STD_NONE)) {
-				*elem = ATTR_ELEMENT_NONE;
-				*offset = 0;
-				*mesh_type = (NodeAttributeType)node.w;
-				return;
-			}
-			attr_offset += ATTR_PRIM_TYPES;
-			attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
+		desc = find_attribute(kg, sd, node.y); /* node.y is passed as the attribute id */
+		if(desc.offset == ATTR_STD_NOT_FOUND) { /* lookup failed: use an empty descriptor of the requested type */
+			desc.element = ATTR_ELEMENT_NONE;
+			desc.offset = 0;
+			desc.type = (NodeAttributeType)node.w;
 		}
-
-		/* return result */
-		*elem = (AttributeElement)attr_map.y;
-		*offset = as_int(attr_map.z);
-		*mesh_type = (NodeAttributeType)attr_map.w;
 	}
 	else {
 		/* background */
-		*elem = ATTR_ELEMENT_NONE;
-		*offset = 0;
-		*mesh_type = (NodeAttributeType)node.w;
+		desc.element = ATTR_ELEMENT_NONE;
+		desc.offset = 0;
+		desc.type = (NodeAttributeType)node.w;
 	}
+
+	return desc;
 }
 
 ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
-	NodeAttributeType type, mesh_type;
-	AttributeElement elem;
+	NodeAttributeType type; /* output type requested by the node */
 	uint out_offset;
-	int offset;
-
-	svm_node_attr_init(kg, sd, node, &type, &mesh_type, &elem, &offset, &out_offset);
+	AttributeDescriptor desc = svm_node_attr_init(kg, sd, node, &type, &out_offset); /* also fills in type and out_offset */
 
 	/* fetch and store attribute */
 	if(type == NODE_ATTR_FLOAT) {
-		if(mesh_type == NODE_ATTR_FLOAT) {
-			float f = primitive_attribute_float(kg, sd, elem, offset, NULL, NULL);
+		if(desc.type == NODE_ATTR_FLOAT) { /* descriptor type matches the request: store as is */
+			float f = primitive_attribute_float(kg, sd, desc, NULL, NULL);
 			stack_store_float(stack, out_offset, f);
 		}
 		else {
-			float3 f = primitive_attribute_float3(kg, sd, elem, offset, NULL, NULL);
+			float3 f = primitive_attribute_float3(kg, sd, desc, NULL, NULL); /* float3 read, averaged into a float below */
 			stack_store_float(stack, out_offset, average(f));
 		}
 	}
 	else {
-		if(mesh_type == NODE_ATTR_FLOAT3) {
-			float3 f = primitive_attribute_float3(kg, sd, elem, offset, NULL, NULL);
+		if(desc.type == NODE_ATTR_FLOAT3) { /* descriptor type matches the request: store as is */
+			float3 f = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 			stack_store_float3(stack, out_offset, f);
 		}
 		else {
-			float f = primitive_attribute_float(kg, sd, elem, offset, NULL, NULL);
+			float f = primitive_attribute_float(kg, sd, desc, NULL, NULL); /* float read, replicated to all three components below */
 			stack_store_float3(stack, out_offset, make_float3(f, f, f));
 		}
 	}
 }
 
-ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+#ifndef __KERNEL_CUDA__
+ccl_device
+#else /* CUDA builds use the noinline qualifier */
+ccl_device_noinline
+#endif /* __KERNEL_CUDA__ */
+void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
-	NodeAttributeType type, mesh_type;
-	AttributeElement elem;
+	NodeAttributeType type; /* output type requested by the node */
 	uint out_offset;
-	int offset;
-
-	svm_node_attr_init(kg, sd, node, &type, &mesh_type, &elem, &offset, &out_offset);
+	AttributeDescriptor desc = svm_node_attr_init(kg, sd, node, &type, &out_offset); /* also fills in type and out_offset */
 
 	/* fetch and store attribute */
 	if(type == NODE_ATTR_FLOAT) {
-		if(mesh_type == NODE_ATTR_FLOAT) {
+		if(desc.type == NODE_ATTR_FLOAT) {
 			float dx;
-			float f = primitive_attribute_float(kg, sd, elem, offset, &dx, NULL);
+			float f = primitive_attribute_float(kg, sd, desc, &dx, NULL); /* only the dx output is requested */
 			stack_store_float(stack, out_offset, f+dx);
 		}
 		else {
 			float3 dx;
-			float3 f = primitive_attribute_float3(kg, sd, elem, offset, &dx, NULL);
+			float3 f = primitive_attribute_float3(kg, sd, desc, &dx, NULL); /* f+dx is averaged into a float below */
 			stack_store_float(stack, out_offset, average(f+dx));
 		}
 	}
 	else {
-		if(mesh_type == NODE_ATTR_FLOAT3) {
+		if(desc.type == NODE_ATTR_FLOAT3) {
 			float3 dx;
-			float3 f = primitive_attribute_float3(kg, sd, elem, offset, &dx, NULL);
+			float3 f = primitive_attribute_float3(kg, sd, desc, &dx, NULL); /* only the dx output is requested */
 			stack_store_float3(stack, out_offset, f+dx);
 		}
 		else {
 			float dx;
-			float f = primitive_attribute_float(kg, sd, elem, offset, &dx, NULL);
+			float f = primitive_attribute_float(kg, sd, desc, &dx, NULL); /* f+dx is replicated to all three components below */
 			stack_store_float3(stack, out_offset, make_float3(f+dx, f+dx, f+dx));
 		}
 	}
 }
 
-ccl_device void svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+#ifndef __KERNEL_CUDA__
+ccl_device
+#else
+ccl_device_noinline
+#endif
+void svm_node_attr_bump_dy(KernelGlobals *kg,
+                           ShaderData *sd,
+                           float *stack,
+                           uint4 node)
 {
-	NodeAttributeType type, mesh_type;
-	AttributeElement elem;
+	NodeAttributeType type;
 	uint out_offset;
-	int offset;
-
-	svm_node_attr_init(kg, sd, node, &type, &mesh_type, &elem, &offset, &out_offset);
+	AttributeDescriptor desc = svm_node_attr_init(kg, sd, node, &type, &out_offset);
 
 	/* fetch and store attribute */
 	if(type == NODE_ATTR_FLOAT) {
-		if(mesh_type == NODE_ATTR_FLOAT) {
+		if(desc.type == NODE_ATTR_FLOAT) {
 			float dy;
-			float f = primitive_attribute_float(kg, sd, elem, offset, NULL, &dy);
+			float f = primitive_attribute_float(kg, sd, desc, NULL, &dy);
 			stack_store_float(stack, out_offset, f+dy);
 		}
 		else {
 			float3 dy;
-			float3 f = primitive_attribute_float3(kg, sd, elem, offset, NULL, &dy);
+			float3 f = primitive_attribute_float3(kg, sd, desc, NULL, &dy);
 			stack_store_float(stack, out_offset, average(f+dy));
 		}
 	}
 	else {
-		if(mesh_type == NODE_ATTR_FLOAT3) {
+		if(desc.type == NODE_ATTR_FLOAT3) {
 			float3 dy;
-			float3 f = primitive_attribute_float3(kg, sd, elem, offset, NULL, &dy);
+			float3 f = primitive_attribute_float3(kg, sd, desc, NULL, &dy);
 			stack_store_float3(stack, out_offset, f+dy);
 		}
 		else {
 			float dy;
-			float f = primitive_attribute_float(kg, sd, elem, offset, NULL, &dy);
+			float f = primitive_attribute_float(kg, sd, desc, NULL, &dy);
 			stack_store_float3(stack, out_offset, make_float3(f+dy, f+dy, f+dy));
 		}
 	}
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index fae89aade60..017d697f9f8 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -18,104 +18,44 @@ CCL_NAMESPACE_BEGIN
 
 /* Closure Nodes */
 
-ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type, float eta, float roughness, bool refract)
+ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int type, float eta, float roughness, bool refract)
 {
 	if(type == CLOSURE_BSDF_SHARP_GLASS_ID) {
 		if(refract) {
-			sc->data0 = eta;
-			sc->data1 = 0.0f;
-			sc->data2 = 0.0f;
-			ccl_fetch(sd, flag) |= bsdf_refraction_setup(sc);
+			bsdf->alpha_y = 0.0f; /* sharp glass: both alpha values are zeroed */
+			bsdf->alpha_x = 0.0f;
+			bsdf->ior = eta;
+			ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf); /* setup's return value is OR-ed into the shader data flags */
 		}
 		else {
-			sc->data0 = 0.0f;
-			sc->data1 = 0.0f;
-			sc->data2 = 0.0f;
-			ccl_fetch(sd, flag) |= bsdf_reflection_setup(sc);
+			bsdf->alpha_y = 0.0f;
+			bsdf->alpha_x = 0.0f;
+			bsdf->ior = 0.0f; /* eta is not stored in the sharp reflection case */
+			ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf);
 		}
 	}
 	else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) {
-		sc->data0 = roughness;
-		sc->data1 = roughness;
-		sc->data2 = eta;
+		bsdf->alpha_x = roughness; /* same roughness value on both axes */
+		bsdf->alpha_y = roughness;
+		bsdf->ior = eta;
 
 		if(refract)
-			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
 		else
-			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf);
 	}
 	else {
-		sc->data0 = roughness;
-		sc->data1 = roughness;
-		sc->data2 = eta;
+		bsdf->alpha_x = roughness; /* every other type goes through the GGX setup functions */
+		bsdf->alpha_y = roughness;
+		bsdf->ior = eta;
 
 		if(refract)
-			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf);
 		else
-			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf);
 	}
 }
 
-ccl_device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, ClosureType type, float mix_weight)
-{
-	ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
-
-	if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
-		sc->weight *= mix_weight;
-		sc->type = type;
-		sc->data0 = 0.0f;
-		sc->data1 = 0.0f;
-		sc->data2 = 0.0f;
-#ifdef __OSL__
-		sc->prim = NULL;
-#endif
-		ccl_fetch(sd, num_closure)++;
-		return sc;
-	}
-
-	return NULL;
-}
-
-ccl_device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float mix_weight)
-{
-	ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
-
-	float3 weight = sc->weight * mix_weight;
-	float sample_weight = fabsf(average(weight));
-
-	if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
-		sc->weight = weight;
-		sc->sample_weight = sample_weight;
-		ccl_fetch(sd, num_closure)++;
-#ifdef __OSL__
-		sc->prim = NULL;
-#endif
-		return sc;
-	}
-
-	return NULL;
-}
-
-ccl_device_inline ShaderClosure *svm_node_closure_get_absorption(ShaderData *sd, float mix_weight)
-{
-	ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
-
-	float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - sc->weight) * mix_weight;
-	float sample_weight = fabsf(average(weight));
-
-	if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
-		sc->weight = weight;
-		sc->sample_weight = sample_weight;
-		ccl_fetch(sd, num_closure)++;
-#ifdef __OSL__
-		sc->prim = NULL;
-#endif
-		return sc;
-	}
-
-	return NULL;
-}
-
 ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag, int *offset)
 {
 	uint type, param1_offset, param2_offset;
@@ -137,49 +77,40 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 	switch(type) {
 		case CLOSURE_BSDF_DIFFUSE_ID: {
-			ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
+			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			OrenNayarBsdf *bsdf = (OrenNayarBsdf*)bsdf_alloc(sd, sizeof(OrenNayarBsdf), weight);
 
-			if(sc) {
-				sc->N = N;
+			if(bsdf) {
+				bsdf->N = N;
 
 				float roughness = param1;
 
 				if(roughness == 0.0f) {
-					sc->data0 = 0.0f;
-					sc->data1 = 0.0f;
-					sc->data2 = 0.0f;
-					ccl_fetch(sd, flag) |= bsdf_diffuse_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf);
 				}
 				else {
-					sc->data0 = roughness;
-					sc->data1 = 0.0f;
-					sc->data2 = 0.0f;
-					ccl_fetch(sd, flag) |= bsdf_oren_nayar_setup(sc);
+					bsdf->roughness = roughness;
+					ccl_fetch(sd, flag) |= bsdf_oren_nayar_setup(bsdf);
 				}
 			}
 			break;
 		}
 		case CLOSURE_BSDF_TRANSLUCENT_ID: {
-			ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
+			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
 
-			if(sc) {
-				sc->data0 = 0.0f;
-				sc->data1 = 0.0f;
-				sc->data2 = 0.0f;
-				sc->N = N;
-				ccl_fetch(sd, flag) |= bsdf_translucent_setup(sc);
+			if(bsdf) {
+				bsdf->N = N;
+				ccl_fetch(sd, flag) |= bsdf_translucent_setup(bsdf);
 			}
 			break;
 		}
 		case CLOSURE_BSDF_TRANSPARENT_ID: {
-			ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
+			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
 
-			if(sc) {
-				sc->data0 = 0.0f;
-				sc->data1 = 0.0f;
-				sc->data2 = 0.0f;
-				sc->N = N;
-				ccl_fetch(sd, flag) |= bsdf_transparent_setup(sc);
+			if(bsdf) {
+				ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf);
 			}
 			break;
 		}
@@ -192,31 +123,33 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
+			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
 
-			if(sc) {
-				sc->N = N;
-				sc->data0 = param1;
-				sc->data1 = param1;
-				sc->data2 = 0.0f;
+			if(bsdf) {
+				bsdf->N = N;
+				bsdf->alpha_x = param1;
+				bsdf->alpha_y = param1;
+				bsdf->ior = 0.0f;
+				bsdf->extra = NULL;
 
 				/* setup bsdf */
 				if(type == CLOSURE_BSDF_REFLECTION_ID)
-					ccl_fetch(sd, flag) |= bsdf_reflection_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf);
 				else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID)
-					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf);
 				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID)
-					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf);
 				else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID) {
 					kernel_assert(stack_valid(data_node.z));
-					float3 color = stack_load_float3(stack, data_node.z);
-					sc->custom1 = color.x;
-					sc->custom2 = color.y;
-					sc->custom3 = color.z;
-					ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_setup(sc);
+					bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+					if(bsdf->extra) {
+						bsdf->extra->color = stack_load_float3(stack, data_node.z);
+						ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_setup(bsdf);
+					}
 				}
 				else
-					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_setup(bsdf);
 			}
 
 			break;
@@ -228,31 +161,33 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
+			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
 
-			if(sc) {
-				sc->N = N;
+			if(bsdf) {
+				bsdf->N = N;
+				bsdf->extra = NULL;
 
 				float eta = fmaxf(param2, 1e-5f);
 				eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
 
 				/* setup bsdf */
 				if(type == CLOSURE_BSDF_REFRACTION_ID) {
-					sc->data0 = eta;
-					sc->data1 = 0.0f;
-					sc->data2 = 0.0f;
+					bsdf->alpha_x = 0.0f;
+					bsdf->alpha_y = 0.0f;
+					bsdf->ior = eta;
 
-					ccl_fetch(sd, flag) |= bsdf_refraction_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf);
 				}
 				else {
-					sc->data0 = param1;
-					sc->data1 = param1;
-					sc->data2 = eta;
+					bsdf->alpha_x = param1;
+					bsdf->alpha_y = param1;
+					bsdf->ior = eta;
 
 					if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID)
-						ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
 					else
-						ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf);
 				}
 			}
 
@@ -268,7 +203,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				break;
 			}
 #endif
-			int num_closure = ccl_fetch(sd, num_closure);
+			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
 
 			/* index of refraction */
 			float eta = fmaxf(param2, 1e-5f);
@@ -280,37 +215,30 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			float roughness = param1;
 
 			/* reflection */
-			ShaderClosure *sc = ccl_fetch_array(sd, closure, num_closure);
-			float3 weight = sc->weight;
-			float sample_weight = sc->sample_weight;
-
-			sc = svm_node_closure_get_bsdf(sd, mix_weight*fresnel);
 #ifdef __CAUSTICS_TRICKS__
 			if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0)
 #endif
 			{
-				if(sc) {
-					sc->N = N;
-					svm_node_glass_setup(sd, sc, type, eta, roughness, false);
+				MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight*fresnel);
+
+				if(bsdf) {
+					bsdf->N = N;
+					bsdf->extra = NULL;
+					svm_node_glass_setup(sd, bsdf, type, eta, roughness, false);
 				}
 			}
 
+			/* refraction */
 #ifdef __CAUSTICS_TRICKS__
-			if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
-				break;
+			if(kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0)
 #endif
+			{
+				MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight*(1.0f - fresnel));
 
-			/* refraction */
-			if(num_closure + 1 < MAX_CLOSURE) {
-				sc = ccl_fetch_array(sd, closure, num_closure + 1);
-				sc->weight = weight;
-				sc->sample_weight = sample_weight;
-
-				sc = svm_node_closure_get_bsdf(sd, mix_weight*(1.0f - fresnel));
-
-				if(sc) {
-					sc->N = N;
-					svm_node_glass_setup(sd, sc, type, eta, roughness, true);
+				if(bsdf) {
+					bsdf->N = N;
+					bsdf->extra = NULL;
+					svm_node_glass_setup(sd, bsdf, type, eta, roughness, true);
 				}
 			}
 
@@ -321,24 +249,25 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_reflective && !kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
+			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
+			MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
 
-			if(sc) {
-				sc->N = N;
+			if(bsdf && extra) {
+				bsdf->N = N;
+				bsdf->extra = extra;
+				bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
 
-				sc->data0 = param1;
-				sc->data1 = param1;
+				bsdf->alpha_x = param1;
+				bsdf->alpha_y = param1;
 				float eta = fmaxf(param2, 1e-5f);
-				sc->data2 = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+				bsdf->ior = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
 
 				kernel_assert(stack_valid(data_node.z));
-				float3 color = stack_load_float3(stack, data_node.z);
-				sc->custom1 = color.x;
-				sc->custom2 = color.y;
-				sc->custom3 = color.z;
+				bsdf->extra->color = stack_load_float3(stack, data_node.z);
 
 				/* setup bsdf */
-				ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_glass_setup(sc);
+				ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_glass_setup(bsdf);
 			}
 
 			break;
@@ -351,62 +280,63 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
-
-			if(sc) {
-				sc->N = N;
+			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
 
-				sc->T = stack_load_float3(stack, data_node.y);
+			if(bsdf) {
+				bsdf->N = N;
+				bsdf->extra = NULL;
+				bsdf->T = stack_load_float3(stack, data_node.y);
 
 				/* rotate tangent */
 				float rotation = stack_load_float(stack, data_node.z);
 
 				if(rotation != 0.0f)
-					sc->T = rotate_around_axis(sc->T, sc->N, rotation * M_2PI_F);
+					bsdf->T = rotate_around_axis(bsdf->T, bsdf->N, rotation * M_2PI_F);
 
 				/* compute roughness */
 				float roughness = param1;
 				float anisotropy = clamp(param2, -0.99f, 0.99f);
 
 				if(anisotropy < 0.0f) {
-					sc->data0 = roughness/(1.0f + anisotropy);
-					sc->data1 = roughness*(1.0f + anisotropy);
+					bsdf->alpha_x = roughness/(1.0f + anisotropy);
+					bsdf->alpha_y = roughness*(1.0f + anisotropy);
 				}
 				else {
-					sc->data0 = roughness*(1.0f - anisotropy);
-					sc->data1 = roughness/(1.0f - anisotropy);
+					bsdf->alpha_x = roughness*(1.0f - anisotropy);
+					bsdf->alpha_y = roughness/(1.0f - anisotropy);
 				}
 
-				sc->data2 = 0.0f;
+				bsdf->ior = 0.0f;
 
-				if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID)
-					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_aniso_setup(sc);
-				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID)
-					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_setup(sc);
+				if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID) {
+					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_aniso_setup(bsdf);
+				}
+				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID) {
+					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_setup(bsdf);
+				}
 				else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID) {
 					kernel_assert(stack_valid(data_node.w));
-					float3 color = stack_load_float3(stack, data_node.w);
-					sc->custom1 = color.x;
-					sc->custom2 = color.y;
-					sc->custom3 = color.z;
-					ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_aniso_setup(sc);
+					bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+					if(bsdf->extra) {
+						bsdf->extra->color = stack_load_float3(stack, data_node.w);
+						ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf);
+					}
 				}
 				else
-					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_aniso_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_aniso_setup(bsdf);
 			}
 			break;
 		}
 		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: {
-			ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
+			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			VelvetBsdf *bsdf = (VelvetBsdf*)bsdf_alloc(sd, sizeof(VelvetBsdf), weight);
 
-			if(sc) {
-				sc->N = N;
+			if(bsdf) {
+				bsdf->N = N;
 
-				/* sigma */
-				sc->data0 = saturate(param1);
-				sc->data1 = 0.0f;
-				sc->data2 = 0.0f;
-				ccl_fetch(sd, flag) |= bsdf_ashikhmin_velvet_setup(sc);
+				bsdf->sigma = saturate(param1);
+				ccl_fetch(sd, flag) |= bsdf_ashikhmin_velvet_setup(bsdf);
 			}
 			break;
 		}
@@ -416,68 +346,62 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				break;
 #endif
 		case CLOSURE_BSDF_DIFFUSE_TOON_ID: {
-			ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
+			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			ToonBsdf *bsdf = (ToonBsdf*)bsdf_alloc(sd, sizeof(ToonBsdf), weight);
 
-			if(sc) {
-				/* Normal, Size and Smooth */
-				sc->N = N;
-				sc->data0 = param1;
-				sc->data1 = param2;
-				sc->data2 = 0.0f;
+			if(bsdf) {
+				bsdf->N = N;
+				bsdf->size = param1;
+				bsdf->smooth = param2;
 				
 				if(type == CLOSURE_BSDF_DIFFUSE_TOON_ID)
-					ccl_fetch(sd, flag) |= bsdf_diffuse_toon_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_diffuse_toon_setup(bsdf);
 				else
-					ccl_fetch(sd, flag) |= bsdf_glossy_toon_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_glossy_toon_setup(bsdf);
 			}
 			break;
 		}
 #ifdef __HAIR__
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: {
+			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
 			
 			if(ccl_fetch(sd, flag) & SD_BACKFACING && ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
-				ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
+				ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
 
-				if(sc) {
+				if(bsdf) {
 					/* todo: giving a fixed weight here will cause issues when
 					 * mixing multiple BSDFS. energy will not be conserved and
 					 * the throughput can blow up after multiple bounces. we
 					 * better figure out a way to skip backfaces from rays
 					 * spawned by transmission from the front */
-					sc->weight = make_float3(1.0f, 1.0f, 1.0f);
-					sc->N = N;
-					sc->data0 = 0.0f;
-					sc->data1 = 0.0f;
-					sc->data2 = 0.0f;
-					ccl_fetch(sd, flag) |= bsdf_transparent_setup(sc);
+					bsdf->weight = make_float3(1.0f, 1.0f, 1.0f);
+					ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf);
 				}
 			}
 			else {
-				ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
-				sc = svm_node_closure_get_bsdf(sd, mix_weight);
+				HairBsdf *bsdf = (HairBsdf*)bsdf_alloc(sd, sizeof(HairBsdf), weight);
 
-				if(sc) {
-					sc->N = N;
-					sc->data0 = param1;
-					sc->data1 = param2;
-					sc->data2 = -stack_load_float(stack, data_node.z);
+				if(bsdf) {
+					bsdf->roughness1 = param1;
+					bsdf->roughness2 = param2;
+					bsdf->offset = -stack_load_float(stack, data_node.z);
 
 					if(stack_valid(data_node.y)) {
-						sc->T = normalize(stack_load_float3(stack, data_node.y));
+						bsdf->T = normalize(stack_load_float3(stack, data_node.y));
 					}
 					else if(!(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)) {
-						sc->T = normalize(ccl_fetch(sd, dPdv));
-						sc->data2 = 0.0f;
+						bsdf->T = normalize(ccl_fetch(sd, dPdv));
+						bsdf->offset = 0.0f;
 					}
 					else
-						sc->T = normalize(ccl_fetch(sd, dPdu));
+						bsdf->T = normalize(ccl_fetch(sd, dPdu));
 
 					if(type == CLOSURE_BSDF_HAIR_REFLECTION_ID) {
-						ccl_fetch(sd, flag) |= bsdf_hair_reflection_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_hair_reflection_setup(bsdf);
 					}
 					else {
-						ccl_fetch(sd, flag) |= bsdf_hair_transmission_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_hair_transmission_setup(bsdf);
 					}
 				}
 			}
@@ -487,17 +411,11 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 #endif
 
 #ifdef __SUBSURFACE__
-#  ifndef __SPLIT_KERNEL__
-#    define sc_next(sc) sc++
-#  else
-#    define sc_next(sc) sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure))
-#  endif
 		case CLOSURE_BSSRDF_CUBIC_ID:
 		case CLOSURE_BSSRDF_GAUSSIAN_ID:
 		case CLOSURE_BSSRDF_BURLEY_ID: {
-			ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
-			float3 albedo = sc->weight;
-			float3 weight = sc->weight * mix_weight;
+			float3 albedo = ccl_fetch(sd, svm_closure_weight);
+			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
 			float sample_weight = fabsf(average(weight));
 			
 			/* disable in case of diffuse ancestor, can't see it well then and
@@ -506,7 +424,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR)
 				param1 = 0.0f;
 
-			if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure)+2 < MAX_CLOSURE) {
+			if(sample_weight > CLOSURE_WEIGHT_CUTOFF) {
 				/* radius * scale */
 				float3 radius = stack_load_float3(stack, data_node.z)*param1;
 				/* sharpness */
@@ -515,61 +433,42 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				float texture_blur = param2;
 
 				/* create one closure per color channel */
-				if(fabsf(weight.x) > 0.0f) {
-					sc->weight = make_float3(weight.x, 0.0f, 0.0f);
-					sc->sample_weight = sample_weight;
-					sc->data0 = radius.x;
-					sc->data1 = texture_blur;
-					sc->data2 = albedo.x;
-					sc->T.x = sharpness;
-#  ifdef __OSL__
-					sc->prim = NULL;
-#  endif
-					sc->N = N;
-					ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type);
-
-					ccl_fetch(sd, num_closure)++;
-					sc_next(sc);
+				Bssrdf *bssrdf = bssrdf_alloc(sd, make_float3(weight.x, 0.0f, 0.0f));
+				if(bssrdf) {
+					bssrdf->sample_weight = sample_weight;
+					bssrdf->radius = radius.x;
+					bssrdf->texture_blur = texture_blur;
+					bssrdf->albedo = albedo.x;
+					bssrdf->sharpness = sharpness;
+					bssrdf->N = N;
+					ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
 				}
 
-				if(fabsf(weight.y) > 0.0f) {
-					sc->weight = make_float3(0.0f, weight.y, 0.0f);
-					sc->sample_weight = sample_weight;
-					sc->data0 = radius.y;
-					sc->data1 = texture_blur;
-					sc->data2 = albedo.y;
-					sc->T.x = sharpness;
-#  ifdef __OSL__
-					sc->prim = NULL;
-#  endif
-					sc->N = N;
-					ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type);
-
-					ccl_fetch(sd, num_closure)++;
-					sc_next(sc);
+				bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f));
+				if(bssrdf) {
+					bssrdf->sample_weight = sample_weight;
+					bssrdf->radius = radius.y;
+					bssrdf->texture_blur = texture_blur;
+					bssrdf->albedo = albedo.y;
+					bssrdf->sharpness = sharpness;
+					bssrdf->N = N;
+					ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
 				}
 
-				if(fabsf(weight.z) > 0.0f) {
-					sc->weight = make_float3(0.0f, 0.0f, weight.z);
-					sc->sample_weight = sample_weight;
-					sc->data0 = radius.z;
-					sc->data1 = texture_blur;
-					sc->data2 = albedo.z;
-					sc->T.x = sharpness;
-#  ifdef __OSL__
-					sc->prim = NULL;
-#  endif
-					sc->N = N;
-					ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type);
-
-					ccl_fetch(sd, num_closure)++;
-					sc_next(sc);
+				bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z));
+				if(bssrdf) {
+					bssrdf->sample_weight = sample_weight;
+					bssrdf->radius = radius.z;
+					bssrdf->texture_blur = texture_blur;
+					bssrdf->albedo = albedo.z;
+					bssrdf->sharpness = sharpness;
+					bssrdf->N = N;
+					ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
 				}
 			}
 
 			break;
 		}
-#  undef sc_next
 #endif
 		default:
 			break;
@@ -594,7 +493,8 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float
 
 	switch(type) {
 		case CLOSURE_VOLUME_ABSORPTION_ID: {
-			ShaderClosure *sc = svm_node_closure_get_absorption(sd, mix_weight * density);
+			float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - ccl_fetch(sd, svm_closure_weight)) * mix_weight * density;
+			ShaderClosure *sc = closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_NONE_ID, weight);
 
 			if(sc) {
 				ccl_fetch(sd, flag) |= volume_absorption_setup(sc);
@@ -602,13 +502,12 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float
 			break;
 		}
 		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: {
-			ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight * density);
+			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight * density;
+			HenyeyGreensteinVolume *volume = (HenyeyGreensteinVolume*)bsdf_alloc(sd, sizeof(HenyeyGreensteinVolume), weight);
 
-			if(sc) {
-				sc->data0 = param2; /* g */
-				sc->data1 = 0.0f;
-				sc->data2 = 0.0f;
-				ccl_fetch(sd, flag) |= volume_henyey_greenstein_setup(sc);
+			if(volume) {
+				volume->g = param2; /* g */
+				ccl_fetch(sd, flag) |= volume_henyey_greenstein_setup(volume);
 			}
 			break;
 		}
@@ -628,10 +527,10 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no
 		if(mix_weight == 0.0f)
 			return;
 
-		svm_node_closure_get_non_bsdf(sd, CLOSURE_EMISSION_ID, mix_weight);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
 	}
 	else
-		svm_node_closure_get_non_bsdf(sd, CLOSURE_EMISSION_ID, 1.0f);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight));
 
 	ccl_fetch(sd, flag) |= SD_EMISSION;
 }
@@ -646,10 +545,10 @@ ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4
 		if(mix_weight == 0.0f)
 			return;
 
-		svm_node_closure_get_non_bsdf(sd, CLOSURE_BACKGROUND_ID, mix_weight);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
 	}
 	else
-		svm_node_closure_get_non_bsdf(sd, CLOSURE_BACKGROUND_ID, 1.0f);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight));
 }
 
 ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node)
@@ -662,10 +561,10 @@ ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 nod
 		if(mix_weight == 0.0f)
 			return;
 
-		svm_node_closure_get_non_bsdf(sd, CLOSURE_HOLDOUT_ID, mix_weight);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
 	}
 	else
-		svm_node_closure_get_non_bsdf(sd, CLOSURE_HOLDOUT_ID, 1.0f);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight));
 
 	ccl_fetch(sd, flag) |= SD_HOLDOUT;
 }
@@ -680,10 +579,10 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack,
 		if(mix_weight == 0.0f)
 			return;
 
-		svm_node_closure_get_non_bsdf(sd, CLOSURE_AMBIENT_OCCLUSION_ID, mix_weight);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
 	}
 	else
-		svm_node_closure_get_non_bsdf(sd, CLOSURE_AMBIENT_OCCLUSION_ID, 1.0f);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight));
 
 	ccl_fetch(sd, flag) |= SD_AO;
 }
@@ -692,10 +591,7 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack,
 
 ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight)
 {
-	if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
-		sc->weight = weight;
-	}
+	ccl_fetch(sd, svm_closure_weight) = weight;
 }
 
 ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b)
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index bb06254c3a9..7d512f7ff4d 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -18,7 +18,11 @@ CCL_NAMESPACE_BEGIN
 
 /* Geometry Node */
 
-ccl_device void svm_node_geometry(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_inline void svm_node_geometry(KernelGlobals *kg,
+                                         ShaderData *sd,
+                                         float *stack,
+                                         uint type,
+                                         uint out_offset)
 {
 	float3 data;
 
@@ -94,7 +98,11 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s
 
 /* Particle Info */
 
-ccl_device void svm_node_particle_info(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device void svm_node_particle_info(KernelGlobals *kg,
+                                       ShaderData *sd,
+                                       float *stack,
+                                       uint type,
+                                       uint out_offset)
 {
 	switch(type) {
 		case NODE_INFO_PAR_INDEX: {
@@ -146,7 +154,11 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg, ShaderData *sd, float
 
 /* Hair Info */
 
-ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device void svm_node_hair_info(KernelGlobals *kg,
+                                   ShaderData *sd,
+                                   float *stack,
+                                   uint type,
+                                   uint out_offset)
 {
 	float data;
 	float3 data3;
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 44732734c31..b6b90dfff81 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -271,9 +271,6 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 		case 87: r = kernel_tex_image_interp(__tex_image_byte4_087, x, y); break;
 		case 88: r = kernel_tex_image_interp(__tex_image_byte4_088, x, y); break;
 		case 89: r = kernel_tex_image_interp(__tex_image_byte4_089, x, y); break;
-		case 90: r = kernel_tex_image_interp(__tex_image_byte4_090, x, y); break;
-		case 91: r = kernel_tex_image_interp(__tex_image_byte4_091, x, y); break;
-		case 92: r = kernel_tex_image_interp(__tex_image_byte4_092, x, y); break;
 		default:
 			kernel_assert(0);
 			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/svm/svm_math_util.h b/intern/cycles/kernel/svm/svm_math_util.h
index 3f7d18a02fe..6d13a0d8e02 100644
--- a/intern/cycles/kernel/svm/svm_math_util.h
+++ b/intern/cycles/kernel/svm/svm_math_util.h
@@ -32,21 +32,17 @@ ccl_device void svm_vector_math(float *Fac, float3 *Vector, NodeVectorMath type,
 		*Fac = average_fac(*Vector);
 	}
 	else if(type == NODE_VECTOR_MATH_AVERAGE) {
-		*Fac = len(Vector1 + Vector2);
-		*Vector = normalize(Vector1 + Vector2);
+		*Vector = safe_normalize_len(Vector1 + Vector2, Fac);
 	}
 	else if(type == NODE_VECTOR_MATH_DOT_PRODUCT) {
 		*Fac = dot(Vector1, Vector2);
 		*Vector = make_float3(0.0f, 0.0f, 0.0f);
 	}
 	else if(type == NODE_VECTOR_MATH_CROSS_PRODUCT) {
-		float3 c = cross(Vector1, Vector2);
-		*Fac = len(c);
-		*Vector = normalize(c);
+		*Vector = safe_normalize_len(cross(Vector1, Vector2), Fac);
 	}
 	else if(type == NODE_VECTOR_MATH_NORMALIZE) {
-		*Fac = len(Vector1);
-		*Vector = normalize(Vector1);
+		*Vector = safe_normalize_len(Vector1, Fac);
 	}
 	else {
 		*Fac = 0.0f;
diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h
index 24275d05c4a..368740f64c7 100644
--- a/intern/cycles/kernel/svm/svm_ramp.h
+++ b/intern/cycles/kernel/svm/svm_ramp.h
@@ -19,12 +19,14 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device float4 rgb_ramp_lookup(KernelGlobals *kg,
-                                  int offset,
-                                  float f,
-                                  bool interpolate,
-                                  bool extrapolate,
-                                  int table_size)
+/* NOTE: svm_ramp.h, svm_ramp_util.h and node_ramp_util.h must stay consistent */
+
+ccl_device_inline float4 rgb_ramp_lookup(KernelGlobals *kg,
+                                         int offset,
+                                         float f,
+                                         bool interpolate,
+                                         bool extrapolate,
+                                         int table_size)
 {
 	if((f < 0.0f || f > 1.0f) && extrapolate) {
 		float4 t0, dy;
@@ -75,36 +77,7 @@ ccl_device void svm_node_rgb_ramp(KernelGlobals *kg, ShaderData *sd, float *stac
 	*offset += table_size;
 }
 
-ccl_device void svm_node_rgb_curves(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
-{
-	uint fac_offset, color_offset, out_offset;
-	decode_node_uchar4(node.y,
-	                   &fac_offset,
-	                   &color_offset,
-	                   &out_offset,
-	                   NULL);
-
-	uint table_size = read_node(kg, offset).x;
-
-	float fac = stack_load_float(stack, fac_offset);
-	float3 color = stack_load_float3(stack, color_offset);
-
-	const float min_x = __int_as_float(node.z),
-	            max_x = __int_as_float(node.w);
-	const float range_x = max_x - min_x;
-	const float3 relpos = (color - make_float3(min_x, min_x, min_x)) / range_x;
-
-	float r = rgb_ramp_lookup(kg, *offset, relpos.x, true, true, table_size).x;
-	float g = rgb_ramp_lookup(kg, *offset, relpos.y, true, true, table_size).y;
-	float b = rgb_ramp_lookup(kg, *offset, relpos.z, true, true, table_size).z;
-
-	color = (1.0f - fac)*color + fac*make_float3(r, g, b);
-	stack_store_float3(stack, out_offset, color);
-
-	*offset += table_size;
-}
-
-ccl_device void svm_node_vector_curves(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device void svm_node_curves(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
 {
 	uint fac_offset, color_offset, out_offset;
 	decode_node_uchar4(node.y,
diff --git a/intern/cycles/kernel/svm/svm_ramp_util.h b/intern/cycles/kernel/svm/svm_ramp_util.h
new file mode 100644
index 00000000000..9f2ce1276f9
--- /dev/null
+++ b/intern/cycles/kernel/svm/svm_ramp_util.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SVM_RAMP_UTIL_H__
+#define __SVM_RAMP_UTIL_H__
+
+CCL_NAMESPACE_BEGIN
+
+/* NOTE: svm_ramp.h, svm_ramp_util.h and node_ramp_util.h must stay consistent */
+
+/* Look up a float3 ramp table at position f, where [0, 1] spans the table
+ * (entry i sits at f = i / (table_size - 1)).
+ *
+ * interpolate: blend linearly with the next entry by the fractional part of
+ *              the scaled position; otherwise ramp[i] is returned unblended.
+ * extrapolate: for f outside [0, 1], continue linearly from the first/last
+ *              entry along the outermost segment instead of clamping f.
+ *
+ * NOTE: the extrapolation branch reads ramp[1] or ramp[table_size - 2]
+ * unchecked, so callers presumably guarantee table_size >= 2. */
+ccl_device_inline float3 rgb_ramp_lookup(const float3 *ramp,
+                                         float f,
+                                         bool interpolate,
+                                         bool extrapolate,
+                                         int table_size)
+{
+	if((f < 0.0f || f > 1.0f) && extrapolate) {
+		/* t0: end entry, dy: step from the neighbouring entry towards t0,
+		 * f: distance past the end, scaled to table steps in the return. */
+		float3 t0, dy;
+		if(f < 0.0f) {
+			t0 = ramp[0];
+			dy = t0 - ramp[1];
+			f = -f;
+		}
+		else {
+			t0 = ramp[table_size - 1];
+			dy = t0 - ramp[table_size - 2];
+			f = f - 1.0f;
+		}
+		return t0 + dy * f * (table_size - 1);
+	}
+
+	f = clamp(f, 0.0f, 1.0f) * (table_size - 1);
+
+	/* clamp int as well in case of NaN */
+	int i = clamp(float_to_int(f), 0, table_size-1);
+	float t = f - (float)i;
+
+	float3 result = ramp[i];
+
+	/* t > 0 means f has a fractional part, so (with a truncating float_to_int,
+	 * which is assumed here) i < table_size - 1 and ramp[i + 1] is in range. */
+	if(interpolate && t > 0.0f)
+		result = (1.0f - t) * result + t * ramp[i + 1];
+
+	return result;
+}
+
+/* Scalar variant of rgb_ramp_lookup() above: identical lookup, clamping,
+ * interpolation and extrapolation rules, applied to a table of floats. */
+ccl_device float float_ramp_lookup(const float *ramp,
+                                   float f,
+                                   bool interpolate,
+                                   bool extrapolate,
+                                   int table_size)
+{
+	if((f < 0.0f || f > 1.0f) && extrapolate) {
+		/* same construction as in rgb_ramp_lookup() */
+		float t0, dy;
+		if(f < 0.0f) {
+			t0 = ramp[0];
+			dy = t0 - ramp[1];
+			f = -f;
+		}
+		else {
+			t0 = ramp[table_size - 1];
+			dy = t0 - ramp[table_size - 2];
+			f = f - 1.0f;
+		}
+		return t0 + dy * f * (table_size - 1);
+	}
+
+	f = clamp(f, 0.0f, 1.0f) * (table_size - 1);
+
+	/* clamp int as well in case of NaN */
+	int i = clamp(float_to_int(f), 0, table_size-1);
+	float t = f - (float)i;
+
+	float result = ramp[i];
+
+	/* ramp[i + 1] is only read when t > 0, see rgb_ramp_lookup() */
+	if(interpolate && t > 0.0f)
+		result = (1.0f - t) * result + t * ramp[i + 1];
+
+	return result;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __SVM_RAMP_UTIL_H__ */
+
diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h
index 276b6f26f5e..01dede3fff5 100644
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -99,12 +99,12 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,
 	stack_store_float3(stack, out_offset, data);
 }
 
-ccl_device_inline void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
-                                                  ShaderData *sd,
-                                                  int path_flag,
-                                                  float *stack,
-                                                  uint4 node,
-                                                  int *offset)
+ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
+                                           ShaderData *sd,
+                                           int path_flag,
+                                           float *stack,
+                                           uint4 node,
+                                           int *offset)
 {
 #ifdef __RAY_DIFFERENTIALS__
 	float3 data;
@@ -184,12 +184,12 @@ ccl_device_inline void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
 #endif
 }
 
-ccl_device_inline void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
-                                                  ShaderData *sd,
-                                                  int path_flag,
-                                                  float *stack,
-                                                  uint4 node,
-                                                  int *offset)
+ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
+                                           ShaderData *sd,
+                                           int path_flag,
+                                           float *stack,
+                                           uint4 node,
+                                           int *offset)
 {
 #ifdef __RAY_DIFFERENTIALS__
 	float3 data;
@@ -287,23 +287,22 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 		}
 
 		/* first try to get tangent attribute */
-		AttributeElement attr_elem, attr_sign_elem, attr_normal_elem;
-		int attr_offset = find_attribute(kg, sd, node.z, &attr_elem);
-		int attr_sign_offset = find_attribute(kg, sd, node.w, &attr_sign_elem);
-		int attr_normal_offset = find_attribute(kg, sd, ATTR_STD_VERTEX_NORMAL, &attr_normal_elem);
+		const AttributeDescriptor attr = find_attribute(kg, sd, node.z);
+		const AttributeDescriptor attr_sign = find_attribute(kg, sd, node.w);
+		const AttributeDescriptor attr_normal = find_attribute(kg, sd, ATTR_STD_VERTEX_NORMAL);
 
-		if(attr_offset == ATTR_STD_NOT_FOUND || attr_sign_offset == ATTR_STD_NOT_FOUND || attr_normal_offset == ATTR_STD_NOT_FOUND) {
+		if(attr.offset == ATTR_STD_NOT_FOUND || attr_sign.offset == ATTR_STD_NOT_FOUND || attr_normal.offset == ATTR_STD_NOT_FOUND) {
 			stack_store_float3(stack, normal_offset, make_float3(0.0f, 0.0f, 0.0f));
 			return;
 		}
 
 		/* get _unnormalized_ interpolated normal and tangent */
-		float3 tangent = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL);
-		float sign = primitive_attribute_float(kg, sd, attr_sign_elem, attr_sign_offset, NULL, NULL);
+		float3 tangent = primitive_attribute_float3(kg, sd, attr, NULL, NULL);
+		float sign = primitive_attribute_float(kg, sd, attr_sign, NULL, NULL);
 		float3 normal;
 
 		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
-			normal = primitive_attribute_float3(kg, sd, attr_normal_elem, attr_normal_offset, NULL, NULL);
+			normal = primitive_attribute_float3(kg, sd, attr_normal, NULL, NULL);
 		}
 		else {
 			normal = ccl_fetch(sd, Ng);
@@ -356,24 +355,22 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
 
 	if(direction_type == NODE_TANGENT_UVMAP) {
 		/* UV map */
-		AttributeElement attr_elem;
-		int attr_offset = find_attribute(kg, sd, node.z, &attr_elem);
+		const AttributeDescriptor desc = find_attribute(kg, sd, node.z);
 
-		if(attr_offset == ATTR_STD_NOT_FOUND)
+		if(desc.offset == ATTR_STD_NOT_FOUND)
 			tangent = make_float3(0.0f, 0.0f, 0.0f);
 		else
-			tangent = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL);
+			tangent = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 	}
 	else {
 		/* radial */
-		AttributeElement attr_elem;
-		int attr_offset = find_attribute(kg, sd, node.z, &attr_elem);
+		const AttributeDescriptor desc = find_attribute(kg, sd, node.z);
 		float3 generated;
 
-		if(attr_offset == ATTR_STD_NOT_FOUND)
+		if(desc.offset == ATTR_STD_NOT_FOUND)
 			generated = ccl_fetch(sd, P);
 		else
-			generated = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL);
+			generated = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 
 		if(axis == NODE_TANGENT_AXIS_X)
 			tangent = make_float3(0.0f, -(generated.z - 0.5f), (generated.y - 0.5f));
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index e1a8ced6a34..51083c31708 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -449,6 +449,9 @@ typedef enum ClosureType {
 #define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
 #define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID)
 #define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID)
+#define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID || \
+                                            type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \
+                                            type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID) /* true for any of the three MULTI_GGX closure ids */
 #define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID)
 #define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID)
 #define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h
index 30ccd523add..6eed9bc1a99 100644
--- a/intern/cycles/kernel/svm/svm_wireframe.h
+++ b/intern/cycles/kernel/svm/svm_wireframe.h
@@ -34,11 +34,11 @@ CCL_NAMESPACE_BEGIN
 
 /* Wireframe Node */
 
-ccl_device float wireframe(KernelGlobals *kg,
-                           ShaderData *sd,
-                           float size,
-                           int pixel_size,
-                           float3 *P)
+ccl_device_inline float wireframe(KernelGlobals *kg,
+                                  ShaderData *sd,
+                                  float size,
+                                  int pixel_size,
+                                  float3 *P)
 {
 #ifdef __HAIR__
 	if(ccl_fetch(sd, prim) != PRIM_NONE && ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)
diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt
index a632ddc0598..8eaa9de3874 100644
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -30,6 +30,7 @@ set(SRC
 	light.cpp
 	mesh.cpp
 	mesh_displace.cpp
+	mesh_subdivision.cpp
 	nodes.cpp
 	object.cpp
 	osl.cpp
diff --git a/intern/cycles/render/attribute.cpp b/intern/cycles/render/attribute.cpp
index 71a3cba6811..c0d429a583c 100644
--- a/intern/cycles/render/attribute.cpp
+++ b/intern/cycles/render/attribute.cpp
@@ -44,6 +44,7 @@ void Attribute::set(ustring name_, TypeDesc type_, AttributeElement element_)
 	type = type_;
 	element = element_;
 	std = ATTR_STD_NONE;
+	flags = 0;
 
 	/* string and matrix not supported! */
 	assert(type == TypeDesc::TypeFloat || type == TypeDesc::TypeColor ||
@@ -51,16 +52,21 @@ void Attribute::set(ustring name_, TypeDesc type_, AttributeElement element_)
 		type == TypeDesc::TypeNormal || type == TypeDesc::TypeMatrix);
 }
 
-void Attribute::resize(int numverts, int numtris, int numsteps, int numcurves, int numkeys, bool reserve_only)
+void Attribute::resize(Mesh *mesh, AttributePrimitive prim, bool reserve_only)
 {
 	if(reserve_only) {
-		buffer.reserve(buffer_size(numverts, numtris, numsteps, numcurves, numkeys));
+		buffer.reserve(buffer_size(mesh, prim)); /* byte count is derived from the mesh for this primitive kind */
 	}
 	else {
-		buffer.resize(buffer_size(numverts, numtris, numsteps, numcurves, numkeys), 0);
+		buffer.resize(buffer_size(mesh, prim), 0); /* bytes added by the resize are zero-filled */
 	}
 }
 
+void Attribute::resize(size_t num_elements)
+{
+	buffer.resize(num_elements * data_sizeof(), 0); /* explicit element count; buffer is sized in bytes, new bytes are zero */
+}
+
 void Attribute::add(const float& f)
 {
 	char *data = (char*)&f;
@@ -118,6 +124,8 @@ size_t Attribute::data_sizeof() const
 {
 	if(element == ATTR_ELEMENT_VOXEL)
 		return sizeof(VoxelAttribute);
+	else if(element == ATTR_ELEMENT_CORNER_BYTE)
+		return sizeof(uchar4);
 	else if(type == TypeDesc::TypeFloat)
 		return sizeof(float);
 	else if(type == TypeDesc::TypeMatrix)
@@ -126,10 +134,14 @@ size_t Attribute::data_sizeof() const
 		return sizeof(float3);
 }
 
-size_t Attribute::element_size(int numverts, int numtris, int numsteps, int numcurves, int numkeys) const
+size_t Attribute::element_size(Mesh *mesh, AttributePrimitive prim) const
 {
+	if(flags & ATTR_FINAL_SIZE) {
+		return buffer.size() / data_sizeof();
+	}
+
 	size_t size;
-	
+
 	switch(element) {
 		case ATTR_ELEMENT_OBJECT:
 		case ATTR_ELEMENT_MESH:
@@ -137,38 +149,54 @@ size_t Attribute::element_size(int numverts, int numtris, int numsteps, int numc
 			size = 1;
 			break;
 		case ATTR_ELEMENT_VERTEX:
-			size = numverts;
+			size = mesh->verts.size() + mesh->num_ngons;
+			if(prim == ATTR_PRIM_SUBD) {
+				size -= mesh->num_subd_verts;
+			}
 			break;
 		case ATTR_ELEMENT_VERTEX_MOTION:
-			size = numverts * (numsteps - 1);
+			size = (mesh->verts.size() + mesh->num_ngons) * (mesh->motion_steps - 1);
+			if(prim == ATTR_PRIM_SUBD) {
+				size -= mesh->num_subd_verts * (mesh->motion_steps - 1);
+			}
 			break;
 		case ATTR_ELEMENT_FACE:
-			size = numtris;
+			if(prim == ATTR_PRIM_TRIANGLE) {
+				size = mesh->num_triangles();
+			}
+			else {
+				size = mesh->subd_faces.size() + mesh->num_ngons;
+			}
 			break;
 		case ATTR_ELEMENT_CORNER:
 		case ATTR_ELEMENT_CORNER_BYTE:
-			size = numtris*3;
+			if(prim == ATTR_PRIM_TRIANGLE) {
+				size = mesh->num_triangles()*3;
+			}
+			else {
+				size = mesh->subd_face_corners.size() + mesh->num_ngons;
+			}
 			break;
 		case ATTR_ELEMENT_CURVE:
-			size = numcurves;
+			size = mesh->num_curves();
 			break;
 		case ATTR_ELEMENT_CURVE_KEY:
-			size = numkeys;
+			size = mesh->curve_keys.size();
 			break;
 		case ATTR_ELEMENT_CURVE_KEY_MOTION:
-			size = numkeys * (numsteps - 1);
+			size = mesh->curve_keys.size() * (mesh->motion_steps - 1);
 			break;
 		default:
 			size = 0;
 			break;
 	}
-	
+
 	return size;
 }
 
-size_t Attribute::buffer_size(int numverts, int numtris, int numsteps, int numcurves, int numkeys) const
+size_t Attribute::buffer_size(Mesh *mesh, AttributePrimitive prim) const
 {
-	return element_size(numverts, numtris, numsteps, numcurves, numkeys)*data_sizeof();
+	return element_size(mesh, prim)*data_sizeof();
 }
 
 bool Attribute::same_storage(TypeDesc a, TypeDesc b)
@@ -188,6 +216,29 @@ bool Attribute::same_storage(TypeDesc a, TypeDesc b)
 	return false;
 }
 
+void Attribute::zero_data(void* dst)
+{
+	memset(dst, 0, data_sizeof()); /* clears exactly one element's worth of bytes starting at dst */
+}
+
+void Attribute::add_with_weight(void* dst, void* src, float weight)
+{
+	if(element == ATTR_ELEMENT_CORNER_BYTE) { /* one element is a uchar4 here, see data_sizeof() */
+		for(int i = 0; i < 4; i++) {
+			((uchar*)dst)[i] += uchar(((uchar*)src)[i] * weight); /* each weighted channel is converted to uchar before it is accumulated */
+		}
+	}
+	else if(same_storage(type, TypeDesc::TypeFloat)) {
+		*((float*)dst) += *((float*)src) * weight; /* dst += src * weight on a single float */
+	}
+	else if(same_storage(type, TypeDesc::TypeVector)) {
+		*((float4*)dst) += *((float4*)src) * weight; /* NOTE(review): touches a full float4 per element - confirm element storage is that wide */
+	}
+	else {
+		assert(!"not implemented for this type"); /* with asserts compiled out this branch leaves dst untouched */
+	}
+}
+
 const char *Attribute::standard_name(AttributeStandard std)
 {
 	switch(std) {
@@ -257,6 +308,7 @@ AttributeSet::AttributeSet()
 {
 	triangle_mesh = NULL;
 	curve_mesh = NULL;
+	subd_mesh = NULL;
 }
 
 AttributeSet::~AttributeSet()
@@ -291,10 +343,12 @@ Attribute *AttributeSet::add(ustring name, TypeDesc type, AttributeElement eleme
 
 	/* this is weak .. */
 	if(triangle_mesh)
-		attr->resize(triangle_mesh->verts.size(), triangle_mesh->num_triangles(), triangle_mesh->motion_steps, 0, 0, false);
+		attr->resize(triangle_mesh, ATTR_PRIM_TRIANGLE, false);
 	if(curve_mesh)
-		attr->resize(0, 0, curve_mesh->motion_steps, curve_mesh->num_curves(), curve_mesh->curve_keys.size(), false);
-	
+		attr->resize(curve_mesh, ATTR_PRIM_CURVE, false);
+	if(subd_mesh)
+		attr->resize(subd_mesh, ATTR_PRIM_SUBD, false);
+
 	return attr;
 }
 
@@ -330,7 +384,7 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
 	if(name == ustring())
 		name = Attribute::standard_name(std);
 
-	if(triangle_mesh) {
+	if(triangle_mesh || subd_mesh) {
 		switch(std) {
 			case ATTR_STD_VERTEX_NORMAL:
 				attr = add(name, TypeDesc::TypeNormal, ATTR_ELEMENT_VERTEX);
@@ -452,9 +506,11 @@ void AttributeSet::resize(bool reserve_only)
 {
 	foreach(Attribute& attr, attributes) {
 		if(triangle_mesh)
-			attr.resize(triangle_mesh->verts.size(), triangle_mesh->num_triangles(), triangle_mesh->motion_steps, 0, 0, reserve_only);
+			attr.resize(triangle_mesh, ATTR_PRIM_TRIANGLE, reserve_only);
 		if(curve_mesh)
-			attr.resize(0, 0, 0, curve_mesh->num_curves(), curve_mesh->curve_keys.size(), reserve_only);
+			attr.resize(curve_mesh, ATTR_PRIM_CURVE, reserve_only);
+		if(subd_mesh)
+			attr.resize(subd_mesh, ATTR_PRIM_SUBD, reserve_only);
 	}
 }
 
@@ -471,12 +527,19 @@ AttributeRequest::AttributeRequest(ustring name_)
 	std = ATTR_STD_NONE;
 
 	triangle_type = TypeDesc::TypeFloat;
-	triangle_element = ATTR_ELEMENT_NONE;
-	triangle_offset = 0;
+	triangle_desc.element = ATTR_ELEMENT_NONE;
+	triangle_desc.offset = 0;
+	triangle_desc.type = NODE_ATTR_FLOAT;
 
 	curve_type = TypeDesc::TypeFloat;
-	curve_element = ATTR_ELEMENT_NONE;
-	curve_offset = 0;
+	curve_desc.element = ATTR_ELEMENT_NONE;
+	curve_desc.offset = 0;
+	curve_desc.type = NODE_ATTR_FLOAT;
+
+	subd_type = TypeDesc::TypeFloat;
+	subd_desc.element = ATTR_ELEMENT_NONE;
+	subd_desc.offset = 0;
+	subd_desc.type = NODE_ATTR_FLOAT;
 }
 
 AttributeRequest::AttributeRequest(AttributeStandard std_)
@@ -485,12 +548,19 @@ AttributeRequest::AttributeRequest(AttributeStandard std_)
 	std = std_;
 
 	triangle_type = TypeDesc::TypeFloat;
-	triangle_element = ATTR_ELEMENT_NONE;
-	triangle_offset = 0;
+	triangle_desc.element = ATTR_ELEMENT_NONE;
+	triangle_desc.offset = 0;
+	triangle_desc.type = NODE_ATTR_FLOAT;
 
 	curve_type = TypeDesc::TypeFloat;
-	curve_element = ATTR_ELEMENT_NONE;
-	curve_offset = 0;
+	curve_desc.element = ATTR_ELEMENT_NONE;
+	curve_desc.offset = 0;
+	curve_desc.type = NODE_ATTR_FLOAT;
+
+	subd_type = TypeDesc::TypeFloat;
+	subd_desc.element = ATTR_ELEMENT_NONE;
+	subd_desc.offset = 0;
+	subd_desc.type = NODE_ATTR_FLOAT;
 }
 
 /* AttributeRequestSet */
diff --git a/intern/cycles/render/attribute.h b/intern/cycles/render/attribute.h
index 41b3626afd3..f4538c76369 100644
--- a/intern/cycles/render/attribute.h
+++ b/intern/cycles/render/attribute.h
@@ -54,15 +54,17 @@ public:
 	TypeDesc type;
 	vector<char> buffer;
 	AttributeElement element;
+	uint flags; /* enum AttributeFlag */
 
 	Attribute() {}
 	~Attribute();
 	void set(ustring name, TypeDesc type, AttributeElement element);
-	void resize(int numverts, int numfaces, int numsteps, int numcurves, int numkeys, bool reserve_only);
+	void resize(Mesh *mesh, AttributePrimitive prim, bool reserve_only);
+	void resize(size_t num_elements);
 
 	size_t data_sizeof() const;
-	size_t element_size(int numverts, int numfaces, int numsteps, int numcurves, int numkeys) const;
-	size_t buffer_size(int numverts, int numfaces, int numsteps, int numcurves, int numkeys) const;
+	size_t element_size(Mesh *mesh, AttributePrimitive prim) const;
+	size_t buffer_size(Mesh *mesh, AttributePrimitive prim) const;
 
 	char *data() { return (buffer.size())? &buffer[0]: NULL; };
 	float3 *data_float3() { return (float3*)data(); }
@@ -79,6 +81,9 @@ public:
 	const Transform *data_transform() const { return (const Transform*)data(); }
 	const VoxelAttribute *data_voxel() const { return (const VoxelAttribute*)data(); }
 
+	void zero_data(void* dst);
+	void add_with_weight(void* dst, void* src, float weight);
+
 	void add(const float& f);
 	void add(const float3& f);
 	void add(const uchar4& f);
@@ -99,6 +104,7 @@ class AttributeSet {
 public:
 	Mesh *triangle_mesh;
 	Mesh *curve_mesh;
+	Mesh *subd_mesh;
 	list<Attribute> attributes;
 
 	AttributeSet();
@@ -130,9 +136,8 @@ public:
 	AttributeStandard std;
 
 	/* temporary variables used by MeshManager */
-	TypeDesc triangle_type, curve_type;
-	AttributeElement triangle_element, curve_element;
-	int triangle_offset, curve_offset;
+	TypeDesc triangle_type, curve_type, subd_type;
+	AttributeDescriptor triangle_desc, curve_desc, subd_desc;
 
 	explicit AttributeRequest(ustring name_);
 	explicit AttributeRequest(AttributeStandard std);
diff --git a/intern/cycles/render/constant_fold.cpp b/intern/cycles/render/constant_fold.cpp
index 1fee6b2c081..200a4c497cd 100644
--- a/intern/cycles/render/constant_fold.cpp
+++ b/intern/cycles/render/constant_fold.cpp
@@ -18,6 +18,7 @@
 #include "graph.h"
 
 #include "util_foreach.h"
+#include "util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -39,6 +40,8 @@ bool ConstantFolder::all_inputs_constant() const
 
 void ConstantFolder::make_constant(float value) const
 {
+	VLOG(1) << "Folding " << node->name << "::" << output->name() << " to constant (" << value << ").";
+
 	foreach(ShaderInput *sock, output->links) {
 		sock->set(value);
 	}
@@ -48,6 +51,8 @@ void ConstantFolder::make_constant(float value) const
 
 void ConstantFolder::make_constant(float3 value) const
 {
+	VLOG(1) << "Folding " << node->name << "::" << output->name() << " to constant " << value << ".";
+
 	foreach(ShaderInput *sock, output->links) {
 		sock->set(value);
 	}
@@ -62,7 +67,7 @@ void ConstantFolder::make_constant_clamp(float value, bool clamp) const
 
 void ConstantFolder::make_constant_clamp(float3 value, bool clamp) const
 {
-	if (clamp) {
+	if(clamp) {
 		value.x = saturate(value.x);
 		value.y = saturate(value.y);
 		value.z = saturate(value.z);
@@ -71,10 +76,25 @@ void ConstantFolder::make_constant_clamp(float3 value, bool clamp) const
 	make_constant(value);
 }
 
+void ConstantFolder::make_zero() const
+{
+	if(output->type() == SocketType::FLOAT) {
+		make_constant(0.0f);
+		return;
+	}
+	if(SocketType::is_float3(output->type())) {
+		make_constant(make_float3(0.0f, 0.0f, 0.0f));
+		return;
+	}
+	assert(0); /* no zero constant is defined for any other output socket type */
+}
+
 void ConstantFolder::bypass(ShaderOutput *new_output) const
 {
 	assert(new_output);
 
+	VLOG(1) << "Folding " << node->name << "::" << output->name() << " to socket " << new_output->parent->name << "::" << new_output->name() << ".";
+
 	/* Remove all outgoing links from socket and connect them to new_output instead.
 	 * The graph->relink method affects node inputs, so it's not safe to use in constant
 	 * folding if the node has multiple outputs and will thus be folded multiple times. */
@@ -90,6 +110,9 @@ void ConstantFolder::bypass(ShaderOutput *new_output) const
 void ConstantFolder::discard() const
 {
 	assert(output->type() == SocketType::CLOSURE);
+
+	VLOG(1) << "Discarding closure " << node->name << ".";
+
 	graph->disconnect(output);
 }
 
@@ -97,7 +120,7 @@ void ConstantFolder::bypass_or_discard(ShaderInput *input) const
 {
 	assert(input->type() == SocketType::CLOSURE);
 
-	if (input->link) {
+	if(input->link) {
 		bypass(input->link);
 	}
 	else {
@@ -105,11 +128,20 @@ void ConstantFolder::bypass_or_discard(ShaderInput *input) const
 	}
 }
 
-bool ConstantFolder::try_bypass_or_make_constant(ShaderInput *input, float3 input_value, bool clamp) const
+bool ConstantFolder::try_bypass_or_make_constant(ShaderInput *input, bool clamp) const
 {
-	if(!input->link) {
-		make_constant_clamp(input_value, clamp);
-		return true;
+	if(input->type() != output->type()) {
+		return false;
+	}
+	else if(!input->link) {
+		if(input->type() == SocketType::FLOAT) {
+			make_constant_clamp(node->get_float(input->socket_type), clamp);
+			return true;
+		}
+		else if(SocketType::is_float3(input->type())) {
+			make_constant_clamp(node->get_float3(input->socket_type), clamp);
+			return true;
+		}
 	}
 	else if(!clamp) {
 		bypass(input->link);
@@ -119,4 +151,212 @@ bool ConstantFolder::try_bypass_or_make_constant(ShaderInput *input, float3 inpu
 	return false;
 }
 
+bool ConstantFolder::is_zero(ShaderInput *input) const
+{
+	if(!input->link) {
+		if(input->type() == SocketType::FLOAT) {
+			return node->get_float(input->socket_type) == 0.0f;
+		}
+		else if(SocketType::is_float3(input->type())) {
+			return node->get_float3(input->socket_type) ==
+			       make_float3(0.0f, 0.0f, 0.0f);
+		}
+	}
+
+	return false;
+}
+
+bool ConstantFolder::is_one(ShaderInput *input) const
+{
+	if(!input->link) {
+		if(input->type() == SocketType::FLOAT) {
+			return node->get_float(input->socket_type) == 1.0f;
+		}
+		else if(SocketType::is_float3(input->type())) {
+			return node->get_float3(input->socket_type) ==
+			       make_float3(1.0f, 1.0f, 1.0f);
+		}
+	}
+
+	return false;
+}
+
+/* Specific nodes */
+
+void ConstantFolder::fold_mix(NodeMix type, bool clamp) const
+{
+    ShaderInput *fac_in = node->input("Fac");
+    ShaderInput *color1_in = node->input("Color1");
+    ShaderInput *color2_in = node->input("Color2");
+
+	float fac = saturate(node->get_float(fac_in->socket_type));
+	bool fac_is_zero = !fac_in->link && fac == 0.0f;
+	bool fac_is_one = !fac_in->link && fac == 1.0f;
+
+	/* remove no-op node when factor is 0.0 */
+	if(fac_is_zero) {
+		/* note that some of the modes will clamp out of bounds values even without use_clamp */
+		if(!(type == NODE_MIX_LIGHT || type == NODE_MIX_DODGE || type == NODE_MIX_BURN)) {
+			if(try_bypass_or_make_constant(color1_in, clamp)) {
+				return;
+			}
+		}
+	}
+
+	switch(type) {
+		case NODE_MIX_BLEND:
+			/* remove useless mix colors nodes */
+			if(color1_in->link && color2_in->link) {
+				if(color1_in->link == color2_in->link) {
+					try_bypass_or_make_constant(color1_in, clamp);
+					break;
+				}
+			}
+			else if(!color1_in->link && !color2_in->link) {
+				float3 color1 = node->get_float3(color1_in->socket_type);
+				float3 color2 = node->get_float3(color2_in->socket_type);
+				if(color1 == color2) {
+					try_bypass_or_make_constant(color1_in, clamp);
+					break;
+				}
+			}
+			/* remove no-op mix color node when factor is 1.0 */
+			if(fac_is_one) {
+				try_bypass_or_make_constant(color2_in, clamp);
+				break;
+			}
+			break;
+		case NODE_MIX_ADD:
+			/* 0 + X (fac 1) == X */
+			if(is_zero(color1_in) && fac_is_one) {
+				try_bypass_or_make_constant(color2_in, clamp);
+			}
+			/* X + 0 (fac ?) == X */
+			else if(is_zero(color2_in)) {
+				try_bypass_or_make_constant(color1_in, clamp);
+			}
+			break;
+		case NODE_MIX_SUB:
+			/* X - 0 (fac ?) == X */
+			if(is_zero(color2_in)) {
+				try_bypass_or_make_constant(color1_in, clamp);
+			}
+			/* X - X (fac 1) == 0 */
+			else if(color1_in->link && color1_in->link == color2_in->link && fac_is_one) {
+				make_zero();
+			}
+			break;
+		case NODE_MIX_MUL:
+			/* X * 1 (fac ?) == X, 1 * X (fac 1) == X */
+			if(is_one(color1_in) && fac_is_one) {
+				try_bypass_or_make_constant(color2_in, clamp);
+			}
+			else if(is_one(color2_in)) {
+				try_bypass_or_make_constant(color1_in, clamp);
+			}
+			/* 0 * ? (fac ?) == 0, ? * 0 (fac 1) == 0 */
+			else if(is_zero(color1_in)) {
+				make_zero();
+			}
+			else if(is_zero(color2_in) && fac_is_one) {
+				make_zero();
+			}
+			break;
+		case NODE_MIX_DIV:
+			/* X / 1 (fac ?) == X */
+			if(is_one(color2_in)) {
+				try_bypass_or_make_constant(color1_in, clamp);
+			}
+			/* 0 / ? (fac ?) == 0 */
+			else if(is_zero(color1_in)) {
+				make_zero();
+			}
+			break;
+		default:
+			break;
+	}
+}
+
+void ConstantFolder::fold_math(NodeMath type, bool clamp) const
+{
+	ShaderInput *value1_in = node->input("Value1");
+	ShaderInput *value2_in = node->input("Value2");
+
+	switch(type) {
+		case NODE_MATH_ADD:
+			/* X + 0 == 0 + X == X */
+			if(is_zero(value1_in)) {
+				try_bypass_or_make_constant(value2_in, clamp);
+			}
+			else if(is_zero(value2_in)) {
+				try_bypass_or_make_constant(value1_in, clamp);
+			}
+			break;
+		case NODE_MATH_SUBTRACT:
+			/* X - 0 == X */
+			if(is_zero(value2_in)) {
+				try_bypass_or_make_constant(value1_in, clamp);
+			}
+			break;
+		case NODE_MATH_MULTIPLY:
+			/* X * 1 == 1 * X == X */
+			if(is_one(value1_in)) {
+				try_bypass_or_make_constant(value2_in, clamp);
+			}
+			else if(is_one(value2_in)) {
+				try_bypass_or_make_constant(value1_in, clamp);
+			}
+			/* X * 0 == 0 * X == 0 */
+			else if(is_zero(value1_in) || is_zero(value2_in)) {
+				make_zero();
+			}
+			break;
+		case NODE_MATH_DIVIDE:
+			/* X / 1 == X */
+			if(is_one(value2_in)) {
+				try_bypass_or_make_constant(value1_in, clamp);
+			}
+			/* 0 / X == 0 */
+			else if(is_zero(value1_in)) {
+				make_zero();
+			}
+			break;
+		default:
+			break;
+	}
+}
+
+void ConstantFolder::fold_vector_math(NodeVectorMath type) const
+{
+	ShaderInput *vector1_in = node->input("Vector1");
+	ShaderInput *vector2_in = node->input("Vector2");
+
+	switch(type) {
+		case NODE_VECTOR_MATH_ADD:
+			/* X + 0 == 0 + X == X */
+			if(is_zero(vector1_in)) {
+				try_bypass_or_make_constant(vector2_in);
+			}
+			else if(is_zero(vector2_in)) {
+				try_bypass_or_make_constant(vector1_in);
+			}
+			break;
+		case NODE_VECTOR_MATH_SUBTRACT:
+			/* X - 0 == X */
+			if(is_zero(vector2_in)) {
+				try_bypass_or_make_constant(vector1_in);
+			}
+			break;
+		case NODE_VECTOR_MATH_DOT_PRODUCT:
+		case NODE_VECTOR_MATH_CROSS_PRODUCT:
+			/* X * 0 == 0 * X == 0 */
+			if(is_zero(vector1_in) || is_zero(vector2_in)) {
+				make_zero();
+			}
+			break;
+		default:
+			break;
+	}
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/constant_fold.h b/intern/cycles/render/constant_fold.h
index 978c8e5335a..2b31c2a5887 100644
--- a/intern/cycles/render/constant_fold.h
+++ b/intern/cycles/render/constant_fold.h
@@ -18,6 +18,7 @@
 #define __CONSTANT_FOLD_H__
 
 #include "util_types.h"
+#include "svm_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -36,11 +37,12 @@ public:
 
 	bool all_inputs_constant() const;
 
-	/* Constant folding helpers, always return true for convenience. */
+	/* Constant folding helpers */
 	void make_constant(float value) const;
 	void make_constant(float3 value) const;
 	void make_constant_clamp(float value, bool clamp) const;
 	void make_constant_clamp(float3 value, bool clamp) const;
+	void make_zero() const;
 
 	/* Bypass node, relinking to another output socket. */
 	void bypass(ShaderOutput *output) const;
@@ -50,7 +52,16 @@ public:
 	void bypass_or_discard(ShaderInput *input) const;
 
 	/* Bypass or make constant, unless we can't due to clamp being true. */
-	bool try_bypass_or_make_constant(ShaderInput *input, float3 input_value, bool clamp) const;
+	bool try_bypass_or_make_constant(ShaderInput *input, bool clamp = false) const;
+
+	/* Test if shader inputs of the current node have fixed values. */
+	bool is_zero(ShaderInput *input) const;
+	bool is_one(ShaderInput *input) const;
+
+	/* Specific nodes. */
+	void fold_mix(NodeMix type, bool clamp) const;
+	void fold_math(NodeMath type, bool clamp) const;
+	void fold_vector_math(NodeVectorMath type) const;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp
index 66601fa3502..6e795ef896a 100644
--- a/intern/cycles/render/graph.cpp
+++ b/intern/cycles/render/graph.cpp
@@ -24,6 +24,7 @@
 #include "util_debug.h"
 #include "util_foreach.h"
 #include "util_queue.h"
+#include "util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -543,6 +544,7 @@ void ShaderGraph::deduplicate_nodes()
 	ShaderNodeSet scheduled, done;
 	map<ustring, ShaderNodeSet> candidates;
 	queue<ShaderNode*> traverse_queue;
+	int num_deduplicated = 0;
 
 	/* Schedule nodes which doesn't have any dependencies. */
 	foreach(ShaderNode *node, nodes) {
@@ -557,8 +559,10 @@ void ShaderGraph::deduplicate_nodes()
 		traverse_queue.pop();
 		done.insert(node);
 		/* Schedule the nodes which were depending on the current node. */
+		bool has_output_links = false;
 		foreach(ShaderOutput *output, node->outputs) {
 			foreach(ShaderInput *input, output->links) {
+				has_output_links = true;
 				if(scheduled.find(input->parent) != scheduled.end()) {
 					/* Node might not be optimized yet but scheduled already
 					 * by other dependencies. No need to re-schedule it.
@@ -572,6 +576,10 @@ void ShaderGraph::deduplicate_nodes()
 				}
 			}
 		}
+		/* Only need to care about nodes that are actually used */
+		if(!has_output_links) {
+			continue;
+		}
 		/* Try to merge this node with another one. */
 		ShaderNode *merge_with = NULL;
 		foreach(ShaderNode *other_node, candidates[node->type->name]) {
@@ -585,11 +593,16 @@ void ShaderGraph::deduplicate_nodes()
 			for(int i = 0; i < node->outputs.size(); ++i) {
 				relink(node, node->outputs[i], merge_with->outputs[i]);
 			}
+			num_deduplicated++;
 		}
 		else {
 			candidates[node->type->name].insert(node);
 		}
 	}
+
+	if(num_deduplicated > 0) {
+		VLOG(1) << "Deduplicated " << num_deduplicated << " nodes.";
+	}
 }
 
 void ShaderGraph::break_cycles(ShaderNode *node, vector<bool>& visited, vector<bool>& on_stack)
@@ -967,6 +980,9 @@ int ShaderGraph::get_num_closures()
 		else if(CLOSURE_IS_GLASS(closure_type)) {
 			num_closures += 2;
 		}
+		else if(CLOSURE_IS_BSDF_MULTISCATTER(closure_type)) {
+			num_closures += 2;
+		}
 		else {
 			++num_closures;
 		}
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 9ef35820254..4cd77f8c6e1 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -209,57 +209,73 @@ void LightManager::disable_ineffective_light(Device *device, Scene *scene)
 	}
 }
 
+bool LightManager::object_usable_as_light(Object *object) {
+	Mesh *mesh = object->mesh;
+	/* Skip if we are not visible for BSDFs. */
+	if(!(object->visibility & (PATH_RAY_DIFFUSE|PATH_RAY_GLOSSY|PATH_RAY_TRANSMIT))) {
+		return false;
+	}
+	/* Skip motion blurred deforming meshes, not supported yet. */
+	if(mesh->has_motion_blur()) {
+		return false;
+	}
+	/* Skip if we have no emission shaders. */
+	/* TODO(sergey): Ideally we want to avoid such duplicated loop, since it'll
+	 * iterate all mesh shaders twice (when counting and when calculating
+	 * triangle area).
+	 */
+	foreach(const Shader *shader, mesh->used_shaders) {
+		if(shader->use_mis && shader->has_surface_emission) {
+			return true;
+		}
+	}
+	return false;
+}
+
 void LightManager::device_update_distribution(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
 	progress.set_status("Updating Lights", "Computing distribution");
 
 	/* count */
 	size_t num_lights = 0;
+	size_t num_portals = 0;
 	size_t num_background_lights = 0;
 	size_t num_triangles = 0;
 
 	bool background_mis = false;
 
 	foreach(Light *light, scene->lights) {
-		if(light->is_enabled)
+		if(light->is_enabled) {
 			num_lights++;
+		}
+		if(light->is_portal) {
+			num_portals++;
+		}
 	}
 
 	foreach(Object *object, scene->objects) {
-		Mesh *mesh = object->mesh;
-		bool have_emission = false;
-
-		/* skip if we are not visible for BSDFs */
-		if(!(object->visibility & (PATH_RAY_DIFFUSE|PATH_RAY_GLOSSY|PATH_RAY_TRANSMIT)))
-			continue;
+		if(progress.get_cancel()) return;
 
-		/* skip motion blurred deforming meshes, not supported yet */
-		if(mesh->has_motion_blur())
+		if(!object_usable_as_light(object)) {
 			continue;
-
-		/* skip if we have no emission shaders */
-		foreach(Shader *shader, mesh->used_shaders) {
-			if(shader->use_mis && shader->has_surface_emission) {
-				have_emission = true;
-				break;
-			}
 		}
+		/* Count triangles. */
+		Mesh *mesh = object->mesh;
+		size_t mesh_num_triangles = mesh->num_triangles();
+		for(size_t i = 0; i < mesh_num_triangles; i++) {
+			int shader_index = mesh->shader[i];
+			Shader *shader = (shader_index < mesh->used_shaders.size())
+			                         ? mesh->used_shaders[shader_index]
+			                         : scene->default_surface;
 
-		/* count triangles */
-		if(have_emission) {
-			size_t mesh_num_triangles = mesh->num_triangles();
-			for(size_t i = 0; i < mesh_num_triangles; i++) {
-				int shader_index = mesh->shader[i];
-				Shader *shader = (shader_index < mesh->used_shaders.size()) ?
-					mesh->used_shaders[shader_index] : scene->default_surface;
-
-				if(shader->use_mis && shader->has_surface_emission)
-					num_triangles++;
+			if(shader->use_mis && shader->has_surface_emission) {
+				num_triangles++;
 			}
 		}
 	}
 
 	size_t num_distribution = num_triangles + num_lights;
+	VLOG(1) << "Total " << num_distribution << " of light distribution primitives.";
 
 	/* emission area */
 	float4 *distribution = dscene->light_distribution.resize(num_distribution + 1);
@@ -270,87 +286,68 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 	int j = 0;
 
 	foreach(Object *object, scene->objects) {
-		Mesh *mesh = object->mesh;
-		bool have_emission = false;
+		if(progress.get_cancel()) return;
 
-		/* skip if we are not visible for BSDFs */
-		if(!(object->visibility & (PATH_RAY_DIFFUSE|PATH_RAY_GLOSSY|PATH_RAY_TRANSMIT))) {
+		if(!object_usable_as_light(object)) {
 			j++;
 			continue;
 		}
+		/* Sum area. */
+		Mesh *mesh = object->mesh;
+		bool transform_applied = mesh->transform_applied;
+		Transform tfm = object->tfm;
+		int object_id = j;
+		int shader_flag = 0;
 
-		/* skip motion blurred deforming meshes, not supported yet */
-		if(mesh->has_motion_blur()) {
-			j++;
-			continue;
-		}
+		if(transform_applied)
+			object_id = ~object_id;
 
-		/* skip if we have no emission shaders */
-		foreach(Shader *shader, mesh->used_shaders) {
-			if(shader->use_mis && shader->has_surface_emission) {
-				have_emission = true;
-				break;
-			}
+		if(!(object->visibility & PATH_RAY_DIFFUSE)) {
+			shader_flag |= SHADER_EXCLUDE_DIFFUSE;
+			use_light_visibility = true;
+		}
+		if(!(object->visibility & PATH_RAY_GLOSSY)) {
+			shader_flag |= SHADER_EXCLUDE_GLOSSY;
+			use_light_visibility = true;
+		}
+		if(!(object->visibility & PATH_RAY_TRANSMIT)) {
+			shader_flag |= SHADER_EXCLUDE_TRANSMIT;
+			use_light_visibility = true;
+		}
+		if(!(object->visibility & PATH_RAY_VOLUME_SCATTER)) {
+			shader_flag |= SHADER_EXCLUDE_SCATTER;
+			use_light_visibility = true;
 		}
 
-		/* sum area */
-		if(have_emission) {
-			bool transform_applied = mesh->transform_applied;
-			Transform tfm = object->tfm;
-			int object_id = j;
-			int shader_flag = 0;
-
-			if(transform_applied)
-				object_id = ~object_id;
+		size_t mesh_num_triangles = mesh->num_triangles();
+		for(size_t i = 0; i < mesh_num_triangles; i++) {
+			int shader_index = mesh->shader[i];
+			Shader *shader = (shader_index < mesh->used_shaders.size())
+			                         ? mesh->used_shaders[shader_index]
+			                         : scene->default_surface;
 
-			if(!(object->visibility & PATH_RAY_DIFFUSE)) {
-				shader_flag |= SHADER_EXCLUDE_DIFFUSE;
-				use_light_visibility = true;
-			}
-			if(!(object->visibility & PATH_RAY_GLOSSY)) {
-				shader_flag |= SHADER_EXCLUDE_GLOSSY;
-				use_light_visibility = true;
-			}
-			if(!(object->visibility & PATH_RAY_TRANSMIT)) {
-				shader_flag |= SHADER_EXCLUDE_TRANSMIT;
-				use_light_visibility = true;
-			}
-			if(!(object->visibility & PATH_RAY_VOLUME_SCATTER)) {
-				shader_flag |= SHADER_EXCLUDE_SCATTER;
-				use_light_visibility = true;
-			}
-
-			size_t mesh_num_triangles = mesh->num_triangles();
-			for(size_t i = 0; i < mesh_num_triangles; i++) {
-				int shader_index = mesh->shader[i];
-				Shader *shader = (shader_index < mesh->used_shaders.size()) ?
-					mesh->used_shaders[shader_index] : scene->default_surface;
-
-				if(shader->use_mis && shader->has_surface_emission) {
-					distribution[offset].x = totarea;
-					distribution[offset].y = __int_as_float(i + mesh->tri_offset);
-					distribution[offset].z = __int_as_float(shader_flag);
-					distribution[offset].w = __int_as_float(object_id);
-					offset++;
-
-					Mesh::Triangle t = mesh->get_triangle(i);
-					float3 p1 = mesh->verts[t.v[0]];
-					float3 p2 = mesh->verts[t.v[1]];
-					float3 p3 = mesh->verts[t.v[2]];
-
-					if(!transform_applied) {
-						p1 = transform_point(&tfm, p1);
-						p2 = transform_point(&tfm, p2);
-						p3 = transform_point(&tfm, p3);
-					}
-
-					totarea += triangle_area(p1, p2, p3);
+			if(shader->use_mis && shader->has_surface_emission) {
+				distribution[offset].x = totarea;
+				distribution[offset].y = __int_as_float(i + mesh->tri_offset);
+				distribution[offset].z = __int_as_float(shader_flag);
+				distribution[offset].w = __int_as_float(object_id);
+				offset++;
+
+				Mesh::Triangle t = mesh->get_triangle(i);
+				float3 p1 = mesh->verts[t.v[0]];
+				float3 p2 = mesh->verts[t.v[1]];
+				float3 p3 = mesh->verts[t.v[2]];
+
+				if(!transform_applied) {
+					p1 = transform_point(&tfm, p1);
+					p2 = transform_point(&tfm, p2);
+					p3 = transform_point(&tfm, p3);
 				}
+
+				totarea += triangle_area(p1, p2, p3);
 			}
 		}
 
-		if(progress.get_cancel()) return;
-
 		j++;
 	}
 
@@ -443,9 +440,9 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 		device->tex_alloc("__light_distribution", dscene->light_distribution);
 
 		/* Portals */
-		if(num_background_lights > 0 && light_index != scene->lights.size()) {
+		if(num_portals > 0) {
 			kintegrator->portal_offset = light_index;
-			kintegrator->num_portals = scene->lights.size() - light_index;
+			kintegrator->num_portals = num_portals;
 			kintegrator->portal_pdf = background_mis? 0.5f: 1.0f;
 		}
 		else {
@@ -609,10 +606,21 @@ void LightManager::device_update_points(Device *device,
                                         Scene *scene)
 {
 	int num_scene_lights = scene->lights.size();
-	if(num_scene_lights == 0)
+
+	int num_lights = 0;
+	foreach(Light *light, scene->lights) {
+		if(light->is_enabled || light->is_portal) {
+			num_lights++;
+		}
+	}
+
+	float4 *light_data = dscene->light_data.resize(num_lights*LIGHT_SIZE);
+
+	if(num_lights == 0) {
+		VLOG(1) << "No effective light, ignoring points update.";
 		return;
+	}
 
-	float4 *light_data = dscene->light_data.resize(num_scene_lights*LIGHT_SIZE);
 	int light_index = 0;
 
 	foreach(Light *light, scene->lights) {
diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h
index 2f1df1c9417..745caa96159 100644
--- a/intern/cycles/render/light.h
+++ b/intern/cycles/render/light.h
@@ -28,6 +28,7 @@ CCL_NAMESPACE_BEGIN
 
 class Device;
 class DeviceScene;
+class Object;
 class Progress;
 class Scene;
 class Shader;
@@ -108,6 +109,9 @@ protected:
 	                              DeviceScene *dscene,
 	                              Scene *scene,
 	                              Progress& progress);
+
+	/* Check whether the light manager can use the object as an emissive light source. */
+	bool object_usable_as_light(Object *object);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index 8b0ed9f77b2..f90c19a11c8 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -30,14 +30,13 @@
 
 #include "osl_globals.h"
 
+#include "subd_patch_table.h"
+
 #include "util_foreach.h"
 #include "util_logging.h"
 #include "util_progress.h"
 #include "util_set.h"
 
-#include "subd_split.h"
-#include "subd_patch.h"
-
 CCL_NAMESPACE_BEGIN
 
 /* Triangle */
@@ -104,18 +103,23 @@ void Mesh::Curve::bounds_grow(const int k,
 	bounds.grow(upper, mr);
 }
 
+/* SubdFace */
+
+float3 Mesh::SubdFace::normal(const Mesh *mesh) const
+{
+	float3 v0 = mesh->verts[mesh->subd_face_corners[start_corner+0]];
+	float3 v1 = mesh->verts[mesh->subd_face_corners[start_corner+1]];
+	float3 v2 = mesh->verts[mesh->subd_face_corners[start_corner+2]];
+
+	return safe_normalize(cross(v1 - v0, v2 - v0));
+}
+
 /* Mesh */
 
 NODE_DEFINE(Mesh)
 {
 	NodeType* type = NodeType::add("mesh", create);
 
-	static NodeEnum displacement_method_enum;
-	displacement_method_enum.insert("bump", DISPLACE_BUMP);
-	displacement_method_enum.insert("true", DISPLACE_TRUE);
-	displacement_method_enum.insert("both", DISPLACE_BOTH);
-	SOCKET_ENUM(displacement_method, "Displacement Method", displacement_method_enum, DISPLACE_BUMP);
-
 	SOCKET_UINT(motion_steps, "Motion Steps", 3);
 	SOCKET_BOOLEAN(use_motion_blur, "Use Motion Blur", false);
 
@@ -150,18 +154,32 @@ Mesh::Mesh()
 	curve_offset = 0;
 	curvekey_offset = 0;
 
+	patch_offset = 0;
+	face_offset = 0;
+	corner_offset = 0;
+
+	num_subd_verts = 0;
+
 	attributes.triangle_mesh = this;
 	curve_attributes.curve_mesh = this;
+	subd_attributes.subd_mesh = this;
 
 	geometry_flags = GEOMETRY_NONE;
 
 	has_volume = false;
 	has_surface_bssrdf = false;
+
+	num_ngons = 0;
+
+	subdivision_type = SUBDIVISION_NONE;
+
+	patch_table = NULL;
 }
 
 Mesh::~Mesh()
 {
 	delete bvh;
+	delete patch_table;
 }
 
 void Mesh::resize_mesh(int numverts, int numtris)
@@ -171,7 +189,10 @@ void Mesh::resize_mesh(int numverts, int numtris)
 	shader.resize(numtris);
 	smooth.resize(numtris);
 
-	forms_quad.resize(numtris);
+	if(subd_faces.size()) {
+		triangle_patch.resize(numtris);
+		vert_patch_uv.resize(numverts);
+	}
 
 	attributes.resize();
 }
@@ -184,7 +205,10 @@ void Mesh::reserve_mesh(int numverts, int numtris)
 	shader.reserve(numtris);
 	smooth.reserve(numtris);
 
-	forms_quad.reserve(numtris);
+	if(subd_faces.size()) {
+		triangle_patch.reserve(numtris);
+		vert_patch_uv.reserve(numverts);
+	}
 
 	attributes.resize(true);
 }
@@ -209,6 +233,24 @@ void Mesh::reserve_curves(int numcurves, int numkeys)
 	curve_attributes.resize(true);
 }
 
+void Mesh::resize_subd_faces(int numfaces, int num_ngons_, int numcorners)
+{
+	subd_faces.resize(numfaces);
+	subd_face_corners.resize(numcorners);
+	num_ngons = num_ngons_;
+
+	subd_attributes.resize();
+}
+
+void Mesh::reserve_subd_faces(int numfaces, int num_ngons_, int numcorners)
+{
+	subd_faces.reserve(numfaces);
+	subd_face_corners.reserve(numcorners);
+	num_ngons = num_ngons_;
+
+	subd_attributes.resize(true);
+}
+
 void Mesh::clear()
 {
 	/* clear all verts and triangles */
@@ -217,21 +259,33 @@ void Mesh::clear()
 	shader.clear();
 	smooth.clear();
 
-	forms_quad.clear();
+	triangle_patch.clear();
+	vert_patch_uv.clear();
 
 	curve_keys.clear();
 	curve_radius.clear();
 	curve_first_key.clear();
 	curve_shader.clear();
 
+	subd_faces.clear();
+	subd_face_corners.clear();
+
+	num_subd_verts = 0;
+
+	subd_creases.clear();
+
 	attributes.clear();
 	curve_attributes.clear();
+	subd_attributes.clear();
 	used_shaders.clear();
 
 	transform_applied = false;
 	transform_negative_scaled = false;
 	transform_normal = transform_identity();
 	geometry_flags = GEOMETRY_NONE;
+
+	delete patch_table;
+	patch_table = NULL;
 }
 
 int Mesh::split_vertex(int vertex)
@@ -247,27 +301,46 @@ int Mesh::split_vertex(int vertex)
 		}
 	}
 
+	foreach(Attribute& attr, subd_attributes.attributes) {
+		if(attr.element == ATTR_ELEMENT_VERTEX) {
+			vector<char> tmp(attr.data_sizeof());
+			memcpy(&tmp[0], attr.data() + tmp.size()*vertex, tmp.size());
+			attr.add(&tmp[0]);
+		}
+	}
+
 	return verts.size() - 1;
 }
 
 void Mesh::add_vertex(float3 P)
 {
 	verts.push_back_reserved(P);
+
+	if(subd_faces.size()) {
+		vert_patch_uv.push_back_reserved(make_float2(0.0f, 0.0f));
+	}
 }
 
 void Mesh::add_vertex_slow(float3 P)
 {
 	verts.push_back_slow(P);
+
+	if(subd_faces.size()) {
+		vert_patch_uv.push_back_slow(make_float2(0.0f, 0.0f));
+	}
 }
 
-void Mesh::add_triangle(int v0, int v1, int v2, int shader_, bool smooth_, bool forms_quad_)
+void Mesh::add_triangle(int v0, int v1, int v2, int shader_, bool smooth_)
 {
 	triangles.push_back_reserved(v0);
 	triangles.push_back_reserved(v1);
 	triangles.push_back_reserved(v2);
 	shader.push_back_reserved(shader_);
 	smooth.push_back_reserved(smooth_);
-	forms_quad.push_back_reserved(forms_quad_);
+
+	if(subd_faces.size()) {
+		triangle_patch.push_back_reserved(-1);
+	}
 }
 
 void Mesh::add_curve_key(float3 co, float radius)
@@ -282,6 +355,25 @@ void Mesh::add_curve(int first_key, int shader)
 	curve_shader.push_back_reserved(shader);
 }
 
+void Mesh::add_subd_face(int* corners, int num_corners, int shader_, bool smooth_)
+{
+	int start_corner = subd_face_corners.size();
+
+	for(int i = 0; i < num_corners; i++) {
+		subd_face_corners.push_back_reserved(corners[i]);
+	}
+
+	int ptex_offset = 0;
+
+	if(subd_faces.size()) {
+		SubdFace& s = subd_faces[subd_faces.size()-1];
+		ptex_offset = s.ptex_offset + s.num_ptex_faces();
+	}
+
+	SubdFace face = {start_corner, num_corners, shader_, smooth_, ptex_offset};
+	subd_faces.push_back_reserved(face);
+}
+
 void Mesh::compute_bounds()
 {
 	BoundBox bnds = BoundBox::empty;
@@ -505,10 +597,23 @@ void Mesh::pack_normals(Scene *scene, uint *tri_shader, float4 *vnormal)
 
 void Mesh::pack_verts(const vector<uint>& tri_prim_index,
                       uint4 *tri_vindex,
+                      uint *tri_patch,
+                      float2 *tri_patch_uv,
                       size_t vert_offset,
                       size_t tri_offset)
 {
-	const size_t triangles_size = num_triangles();
+	size_t verts_size = verts.size();
+
+	if(verts_size && subd_faces.size()) {
+		float2 *vert_patch_uv_ptr = &vert_patch_uv[0];
+
+		for(size_t i = 0; i < verts_size; i++) {
+			tri_patch_uv[i] = vert_patch_uv_ptr[i];
+		}
+	}
+
+	size_t triangles_size = num_triangles();
+
 	if(triangles_size) {
 		for(size_t i = 0; i < triangles_size; i++) {
 			Triangle t = get_triangle(i);
@@ -516,6 +621,8 @@ void Mesh::pack_verts(const vector<uint>& tri_prim_index,
 			                           t.v[1] + vert_offset,
 			                           t.v[2] + vert_offset,
 			                           tri_prim_index[i + tri_offset]);
+
+			tri_patch[i] = (!subd_faces.size()) ? -1 : (triangle_patch[i]*8 + patch_offset);
 		}
 	}
 }
@@ -553,6 +660,54 @@ void Mesh::pack_curves(Scene *scene, float4 *curve_key_co, float4 *curve_data, s
 	}
 }
 
+void Mesh::pack_patches(uint *patch_data, uint vert_offset, uint face_offset, uint corner_offset)
+{
+	size_t num_faces = subd_faces.size();
+	int ngons = 0;
+
+	if(num_faces) {
+		for(size_t f = 0; f < num_faces; f++) {
+			SubdFace face = subd_faces[f];
+
+			if(face.is_quad()) {
+				int c[4];
+				memcpy(c, &subd_face_corners[face.start_corner], sizeof(int)*4);
+
+				*(patch_data++) = c[0] + vert_offset;
+				*(patch_data++) = c[1] + vert_offset;
+				*(patch_data++) = c[2] + vert_offset;
+				*(patch_data++) = c[3] + vert_offset;
+
+				*(patch_data++) = f+face_offset;
+				*(patch_data++) = face.num_corners;
+				*(patch_data++) = face.start_corner + corner_offset;
+				*(patch_data++) = 0;
+			}
+			else {
+				for(int i = 0; i < face.num_corners; i++) {
+					int c[4];
+					c[0] = subd_face_corners[face.start_corner + mod(i + 0, face.num_corners)];
+					c[1] = subd_face_corners[face.start_corner + mod(i + 1, face.num_corners)];
+					c[2] = verts.size() - num_subd_verts + ngons;
+					c[3] = subd_face_corners[face.start_corner + mod(i - 1, face.num_corners)];
+
+					*(patch_data++) = c[0] + vert_offset;
+					*(patch_data++) = c[1] + vert_offset;
+					*(patch_data++) = c[2] + vert_offset;
+					*(patch_data++) = c[3] + vert_offset;
+
+					*(patch_data++) = f+face_offset;
+					*(patch_data++) = face.num_corners | (i << 16);
+					*(patch_data++) = face.start_corner + corner_offset;
+					*(patch_data++) = subd_face_corners.size() + ngons + corner_offset;
+				}
+
+				ngons++;
+			}
+		}
+	}
+}
+
 void Mesh::compute_bvh(DeviceScene *dscene,
                        SceneParams *params,
                        Progress *progress,
@@ -626,6 +781,17 @@ bool Mesh::has_motion_blur() const
 	         curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)));
 }
 
+bool Mesh::has_true_displacement() const
+{
+	foreach(Shader *shader, used_shaders) {
+		if(shader->has_displacement && shader->displacement_method != DISPLACE_BUMP) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
 bool Mesh::need_build_bvh() const
 {
 	return !transform_applied || has_surface_bssrdf;
@@ -678,12 +844,14 @@ void MeshManager::update_osl_attributes(Device *device, Scene *scene, vector<Att
 			OSLGlobals::Attribute osl_attr;
 
 			osl_attr.type = attr.type();
-			osl_attr.elem = ATTR_ELEMENT_OBJECT;
+			osl_attr.desc.element = ATTR_ELEMENT_OBJECT;
 			osl_attr.value = attr;
-			osl_attr.offset = 0;
+			osl_attr.desc.offset = 0;
+			osl_attr.desc.flags = 0;
 
-			og->attribute_map[i*ATTR_PRIM_TYPES][attr.name()] = osl_attr;
+			og->attribute_map[i*ATTR_PRIM_TYPES + ATTR_PRIM_TRIANGLE][attr.name()] = osl_attr;
 			og->attribute_map[i*ATTR_PRIM_TYPES + ATTR_PRIM_CURVE][attr.name()] = osl_attr;
+			og->attribute_map[i*ATTR_PRIM_TYPES + ATTR_PRIM_SUBD][attr.name()] = osl_attr;
 		}
 
 		/* find mesh attributes */
@@ -699,9 +867,8 @@ void MeshManager::update_osl_attributes(Device *device, Scene *scene, vector<Att
 		foreach(AttributeRequest& req, attributes.requests) {
 			OSLGlobals::Attribute osl_attr;
 
-			if(req.triangle_element != ATTR_ELEMENT_NONE) {
-				osl_attr.elem = req.triangle_element;
-				osl_attr.offset = req.triangle_offset;
+			if(req.triangle_desc.element != ATTR_ELEMENT_NONE) {
+				osl_attr.desc = req.triangle_desc;
 
 				if(req.triangle_type == TypeDesc::TypeFloat)
 					osl_attr.type = TypeDesc::TypeFloat;
@@ -713,17 +880,16 @@ void MeshManager::update_osl_attributes(Device *device, Scene *scene, vector<Att
 				if(req.std != ATTR_STD_NONE) {
 					/* if standard attribute, add lookup by geom: name convention */
 					ustring stdname(string("geom:") + string(Attribute::standard_name(req.std)));
-					og->attribute_map[i*ATTR_PRIM_TYPES][stdname] = osl_attr;
+					og->attribute_map[i*ATTR_PRIM_TYPES + ATTR_PRIM_TRIANGLE][stdname] = osl_attr;
 				}
 				else if(req.name != ustring()) {
 					/* add lookup by mesh attribute name */
-					og->attribute_map[i*ATTR_PRIM_TYPES][req.name] = osl_attr;
+					og->attribute_map[i*ATTR_PRIM_TYPES + ATTR_PRIM_TRIANGLE][req.name] = osl_attr;
 				}
 			}
 
-			if(req.curve_element != ATTR_ELEMENT_NONE) {
-				osl_attr.elem = req.curve_element;
-				osl_attr.offset = req.curve_offset;
+			if(req.curve_desc.element != ATTR_ELEMENT_NONE) {
+				osl_attr.desc = req.curve_desc;
 
 				if(req.curve_type == TypeDesc::TypeFloat)
 					osl_attr.type = TypeDesc::TypeFloat;
@@ -742,6 +908,27 @@ void MeshManager::update_osl_attributes(Device *device, Scene *scene, vector<Att
 					og->attribute_map[i*ATTR_PRIM_TYPES + ATTR_PRIM_CURVE][req.name] = osl_attr;
 				}
 			}
+
+			if(req.subd_desc.element != ATTR_ELEMENT_NONE) {
+				osl_attr.desc = req.subd_desc;
+
+				if(req.subd_type == TypeDesc::TypeFloat)
+					osl_attr.type = TypeDesc::TypeFloat;
+				else if(req.subd_type == TypeDesc::TypeMatrix)
+					osl_attr.type = TypeDesc::TypeMatrix;
+				else
+					osl_attr.type = TypeDesc::TypeColor;
+
+				if(req.std != ATTR_STD_NONE) {
+					/* if standard attribute, add lookup by geom: name convention */
+					ustring stdname(string("geom:") + string(Attribute::standard_name(req.std)));
+					og->attribute_map[i*ATTR_PRIM_TYPES + ATTR_PRIM_SUBD][stdname] = osl_attr;
+				}
+				else if(req.name != ustring()) {
+					/* add lookup by mesh attribute name */
+					og->attribute_map[i*ATTR_PRIM_TYPES + ATTR_PRIM_SUBD][req.name] = osl_attr;
+				}
+			}
 		}
 	}
 #else
@@ -795,8 +982,8 @@ void MeshManager::update_svm_attributes(Device *device, DeviceScene *dscene, Sce
 
 			if(mesh->num_triangles()) {
 				attr_map[index].x = id;
-				attr_map[index].y = req.triangle_element;
-				attr_map[index].z = as_uint(req.triangle_offset);
+				attr_map[index].y = req.triangle_desc.element;
+				attr_map[index].z = as_uint(req.triangle_desc.offset);
 
 				if(req.triangle_type == TypeDesc::TypeFloat)
 					attr_map[index].w = NODE_ATTR_FLOAT;
@@ -804,14 +991,16 @@ void MeshManager::update_svm_attributes(Device *device, DeviceScene *dscene, Sce
 					attr_map[index].w = NODE_ATTR_MATRIX;
 				else
 					attr_map[index].w = NODE_ATTR_FLOAT3;
+
+				attr_map[index].w |= req.triangle_desc.flags << 8;
 			}
 
 			index++;
 
 			if(mesh->num_curves()) {
 				attr_map[index].x = id;
-				attr_map[index].y = req.curve_element;
-				attr_map[index].z = as_uint(req.curve_offset);
+				attr_map[index].y = req.curve_desc.element;
+				attr_map[index].z = as_uint(req.curve_desc.offset);
 
 				if(req.curve_type == TypeDesc::TypeFloat)
 					attr_map[index].w = NODE_ATTR_FLOAT;
@@ -819,25 +1008,39 @@ void MeshManager::update_svm_attributes(Device *device, DeviceScene *dscene, Sce
 					attr_map[index].w = NODE_ATTR_MATRIX;
 				else
 					attr_map[index].w = NODE_ATTR_FLOAT3;
+
+				attr_map[index].w |= req.curve_desc.flags << 8;
 			}
 
 			index++;
-		}
 
-		/* terminator */
-		attr_map[index].x = ATTR_STD_NONE;
-		attr_map[index].y = 0;
-		attr_map[index].z = 0;
-		attr_map[index].w = 0;
+			if(mesh->subd_faces.size()) {
+				attr_map[index].x = id;
+				attr_map[index].y = req.subd_desc.element;
+				attr_map[index].z = as_uint(req.subd_desc.offset);
+
+				if(req.subd_type == TypeDesc::TypeFloat)
+					attr_map[index].w = NODE_ATTR_FLOAT;
+				else if(req.subd_type == TypeDesc::TypeMatrix)
+					attr_map[index].w = NODE_ATTR_MATRIX;
+				else
+					attr_map[index].w = NODE_ATTR_FLOAT3;
 
-		index++;
+				attr_map[index].w |= req.subd_desc.flags << 8;
+			}
+
+			index++;
+		}
 
-		attr_map[index].x = ATTR_STD_NONE;
-		attr_map[index].y = 0;
-		attr_map[index].z = 0;
-		attr_map[index].w = 0;
+		/* terminator */
+		for(int i = 0; i < ATTR_PRIM_TYPES; i++) {
+			attr_map[index].x = ATTR_STD_NONE;
+			attr_map[index].y = 0;
+			attr_map[index].z = 0;
+			attr_map[index].w = 0;
 
-		index++;
+			index++;
+		}
 	}
 
 	/* copy to device */
@@ -847,17 +1050,13 @@ void MeshManager::update_svm_attributes(Device *device, DeviceScene *dscene, Sce
 
 static void update_attribute_element_size(Mesh *mesh,
                                           Attribute *mattr,
+                                          AttributePrimitive prim,
                                           size_t *attr_float_size,
                                           size_t *attr_float3_size,
                                           size_t *attr_uchar4_size)
 {
 	if(mattr) {
-		size_t size = mattr->element_size(
-			mesh->verts.size(),
-			mesh->num_triangles(),
-			mesh->motion_steps,
-			mesh->num_curves(),
-			mesh->curve_keys.size());
+		size_t size = mattr->element_size(mesh, prim);
 
 		if(mattr->element == ATTR_ELEMENT_VOXEL) {
 			/* pass */
@@ -885,22 +1084,21 @@ static void update_attribute_element_offset(Mesh *mesh,
                                             vector<uchar4>& attr_uchar4,
                                             size_t& attr_uchar4_offset,
                                             Attribute *mattr,
+                                            AttributePrimitive prim,
                                             TypeDesc& type,
-                                            int& offset,
-                                            AttributeElement& element)
+                                            AttributeDescriptor& desc)
 {
 	if(mattr) {
 		/* store element and type */
-		element = mattr->element;
+		desc.element = mattr->element;
+		desc.flags = mattr->flags;
 		type = mattr->type;
 
 		/* store attribute data in arrays */
-		size_t size = mattr->element_size(
-			mesh->verts.size(),
-			mesh->num_triangles(),
-			mesh->motion_steps,
-			mesh->num_curves(),
-			mesh->curve_keys.size());
+		size_t size = mattr->element_size(mesh, prim);
+
+		AttributeElement& element = desc.element;
+		int& offset = desc.offset;
 
 		if(mattr->element == ATTR_ELEMENT_VOXEL) {
 			/* store slot in offset value */
@@ -950,14 +1148,26 @@ static void update_attribute_element_offset(Mesh *mesh,
 
 		/* mesh vertex/curve index is global, not per object, so we sneak
 		 * a correction for that in here */
-		if(element == ATTR_ELEMENT_VERTEX)
+		if(mesh->subdivision_type == Mesh::SUBDIVISION_CATMULL_CLARK && desc.flags & ATTR_SUBDIVIDED) {
+			/* indices for subdivided attributes are retrieved from the
+			 * patch table, so no offset correction is needed here */
+		}
+		else if(element == ATTR_ELEMENT_VERTEX)
 			offset -= mesh->vert_offset;
 		else if(element == ATTR_ELEMENT_VERTEX_MOTION)
 			offset -= mesh->vert_offset;
-		else if(element == ATTR_ELEMENT_FACE)
-			offset -= mesh->tri_offset;
-		else if(element == ATTR_ELEMENT_CORNER || element == ATTR_ELEMENT_CORNER_BYTE)
-			offset -= 3*mesh->tri_offset;
+		else if(element == ATTR_ELEMENT_FACE) {
+			if(prim == ATTR_PRIM_TRIANGLE)
+				offset -= mesh->tri_offset;
+			else
+				offset -= mesh->face_offset;
+		}
+		else if(element == ATTR_ELEMENT_CORNER || element == ATTR_ELEMENT_CORNER_BYTE) {
+			if(prim == ATTR_PRIM_TRIANGLE)
+				offset -= 3*mesh->tri_offset;
+			else
+				offset -= mesh->corner_offset;
+		}
 		else if(element == ATTR_ELEMENT_CURVE)
 			offset -= mesh->curve_offset;
 		else if(element == ATTR_ELEMENT_CURVE_KEY)
@@ -967,8 +1177,8 @@ static void update_attribute_element_offset(Mesh *mesh,
 	}
 	else {
 		/* attribute not found */
-		element = ATTR_ELEMENT_NONE;
-		offset = 0;
+		desc.element = ATTR_ELEMENT_NONE;
+		desc.offset = 0;
 	}
 }
 
@@ -1007,23 +1217,23 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene,
 		foreach(AttributeRequest& req, attributes.requests) {
 			Attribute *triangle_mattr = mesh->attributes.find(req);
 			Attribute *curve_mattr = mesh->curve_attributes.find(req);
-
-			/* todo: get rid of this exception, it's only here for giving some
-			 * working texture coordinate for subdivision as we can't preserve
-			 * any attributes yet */
-			if(!triangle_mattr && req.std == ATTR_STD_GENERATED) {
-				triangle_mattr = mesh->attributes.add(ATTR_STD_GENERATED);
-				if(mesh->verts.size())
-					memcpy(triangle_mattr->data_float3(), &mesh->verts[0], sizeof(float3)*mesh->verts.size());
-			}
+			Attribute *subd_mattr = mesh->subd_attributes.find(req);
 
 			update_attribute_element_size(mesh,
 			                              triangle_mattr,
+			                              ATTR_PRIM_TRIANGLE,
 			                              &attr_float_size,
 			                              &attr_float3_size,
 			                              &attr_uchar4_size);
 			update_attribute_element_size(mesh,
 			                              curve_mattr,
+			                              ATTR_PRIM_CURVE,
+			                              &attr_float_size,
+			                              &attr_float3_size,
+			                              &attr_uchar4_size);
+			update_attribute_element_size(mesh,
+			                              subd_mattr,
+			                              ATTR_PRIM_SUBD,
 			                              &attr_float_size,
 			                              &attr_float3_size,
 			                              &attr_uchar4_size);
@@ -1048,24 +1258,34 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene,
 		foreach(AttributeRequest& req, attributes.requests) {
 			Attribute *triangle_mattr = mesh->attributes.find(req);
 			Attribute *curve_mattr = mesh->curve_attributes.find(req);
+			Attribute *subd_mattr = mesh->subd_attributes.find(req);
 
 			update_attribute_element_offset(mesh,
 			                                attr_float, attr_float_offset,
 			                                attr_float3, attr_float3_offset,
 			                                attr_uchar4, attr_uchar4_offset,
 			                                triangle_mattr,
+			                                ATTR_PRIM_TRIANGLE,
 			                                req.triangle_type,
-			                                req.triangle_offset,
-			                                req.triangle_element);
+			                                req.triangle_desc);
 
 			update_attribute_element_offset(mesh,
 			                                attr_float, attr_float_offset,
 			                                attr_float3, attr_float3_offset,
 			                                attr_uchar4, attr_uchar4_offset,
 			                                curve_mattr,
+			                                ATTR_PRIM_CURVE,
 			                                req.curve_type,
-			                                req.curve_offset,
-			                                req.curve_element);
+			                                req.curve_desc);
+
+			update_attribute_element_offset(mesh,
+			                                attr_float, attr_float_offset,
+			                                attr_float3, attr_float3_offset,
+			                                attr_uchar4, attr_uchar4_offset,
+			                                subd_mattr,
+			                                ATTR_PRIM_SUBD,
+			                                req.subd_type,
+			                                req.subd_desc);
 
 			if(progress.get_cancel()) return;
 		}
@@ -1100,19 +1320,43 @@ void MeshManager::mesh_calc_offset(Scene *scene)
 {
 	size_t vert_size = 0;
 	size_t tri_size = 0;
+
 	size_t curve_key_size = 0;
 	size_t curve_size = 0;
 
+	size_t patch_size = 0;
+	size_t face_size = 0;
+	size_t corner_size = 0;
+
 	foreach(Mesh *mesh, scene->meshes) {
 		mesh->vert_offset = vert_size;
 		mesh->tri_offset = tri_size;
+
 		mesh->curvekey_offset = curve_key_size;
 		mesh->curve_offset = curve_size;
 
+		mesh->patch_offset = patch_size;
+		mesh->face_offset = face_size;
+		mesh->corner_offset = corner_size;
+
 		vert_size += mesh->verts.size();
 		tri_size += mesh->num_triangles();
+
 		curve_key_size += mesh->curve_keys.size();
 		curve_size += mesh->num_curves();
+
+		if(mesh->subd_faces.size()) {
+			Mesh::SubdFace& last = mesh->subd_faces[mesh->subd_faces.size()-1];
+			patch_size += (last.ptex_offset + last.num_ptex_faces()) * 8;
+
+			/* patch tables are stored in same array so include them in patch_size */
+			if(mesh->patch_table) {
+				mesh->patch_table_offset = patch_size;
+				patch_size += mesh->patch_table->total_size();
+			}
+		}
+		face_size += mesh->subd_faces.size();
+		corner_size += mesh->subd_face_corners.size();
 	}
 }
 
@@ -1125,14 +1369,31 @@ void MeshManager::device_update_mesh(Device *device,
 	/* Count. */
 	size_t vert_size = 0;
 	size_t tri_size = 0;
+
 	size_t curve_key_size = 0;
 	size_t curve_size = 0;
+
+	size_t patch_size = 0;
+
 	foreach(Mesh *mesh, scene->meshes) {
 		vert_size += mesh->verts.size();
 		tri_size += mesh->num_triangles();
+
 		curve_key_size += mesh->curve_keys.size();
 		curve_size += mesh->num_curves();
+
+		if(mesh->subd_faces.size()) {
+			Mesh::SubdFace& last = mesh->subd_faces[mesh->subd_faces.size()-1];
+			patch_size += (last.ptex_offset + last.num_ptex_faces()) * 8;
+
+			/* patch tables are stored in same array so include them in patch_size */
+			if(mesh->patch_table) {
+				mesh->patch_table_offset = patch_size;
+				patch_size += mesh->patch_table->total_size();
+			}
+		}
 	}
+
 	/* Create mapping from triangle to primitive triangle array. */
 	vector<uint> tri_prim_index(tri_size);
 	if(for_displacement) {
@@ -1155,6 +1416,7 @@ void MeshManager::device_update_mesh(Device *device,
 			}
 		}
 	}
+
 	/* Fill in all the arrays. */
 	if(tri_size != 0) {
 		/* normals */
@@ -1163,6 +1425,8 @@ void MeshManager::device_update_mesh(Device *device,
 		uint *tri_shader = dscene->tri_shader.resize(tri_size);
 		float4 *vnormal = dscene->tri_vnormal.resize(vert_size);
 		uint4 *tri_vindex = dscene->tri_vindex.resize(tri_size);
+		uint *tri_patch = dscene->tri_patch.resize(tri_size);
+		float2 *tri_patch_uv = dscene->tri_patch_uv.resize(vert_size);
 
 		foreach(Mesh *mesh, scene->meshes) {
 			mesh->pack_normals(scene,
@@ -1170,6 +1434,8 @@ void MeshManager::device_update_mesh(Device *device,
 			                   &vnormal[mesh->vert_offset]);
 			mesh->pack_verts(tri_prim_index,
 			                 &tri_vindex[mesh->tri_offset],
+			                 &tri_patch[mesh->tri_offset],
+			                 &tri_patch_uv[mesh->vert_offset],
 			                 mesh->vert_offset,
 			                 mesh->tri_offset);
 			if(progress.get_cancel()) return;
@@ -1181,7 +1447,10 @@ void MeshManager::device_update_mesh(Device *device,
 		device->tex_alloc("__tri_shader", dscene->tri_shader);
 		device->tex_alloc("__tri_vnormal", dscene->tri_vnormal);
 		device->tex_alloc("__tri_vindex", dscene->tri_vindex);
+		device->tex_alloc("__tri_patch", dscene->tri_patch);
+		device->tex_alloc("__tri_patch_uv", dscene->tri_patch_uv);
 	}
+
 	if(curve_size != 0) {
 		progress.set_status("Updating Mesh", "Copying Strands to device");
 
@@ -1196,6 +1465,25 @@ void MeshManager::device_update_mesh(Device *device,
 		device->tex_alloc("__curve_keys", dscene->curve_keys);
 		device->tex_alloc("__curves", dscene->curves);
 	}
+
+	if(patch_size != 0) {
+		progress.set_status("Updating Mesh", "Copying Patches to device");
+
+		uint *patch_data = dscene->patches.resize(patch_size);
+
+		foreach(Mesh *mesh, scene->meshes) {
+			mesh->pack_patches(&patch_data[mesh->patch_offset], mesh->vert_offset, mesh->face_offset, mesh->corner_offset);
+
+			if(mesh->patch_table) {
+				mesh->patch_table->copy_adjusting_offsets(&patch_data[mesh->patch_table_offset], mesh->patch_table_offset);
+			}
+
+			if(progress.get_cancel()) return;
+		}
+
+		device->tex_alloc("__patches", dscene->patches);
+	}
+
 	if(for_displacement) {
 		float4 *prim_tri_verts = dscene->prim_tri_verts.resize(tri_size * 3);
 		foreach(Mesh *mesh, scene->meshes) {
@@ -1376,7 +1664,7 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 	bool old_need_object_flags_update = false;
 	foreach(Mesh *mesh, scene->meshes) {
 		if(mesh->need_update &&
-		   mesh->displacement_method != Mesh::DISPLACE_BUMP)
+		   mesh->has_true_displacement())
 		{
 			true_displacement_used = true;
 			break;
@@ -1402,6 +1690,10 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 	}
 	if(progress.get_cancel()) return;
 
+	/* after mesh data has been copied to device memory we need to update
+	 * offsets for patch tables, as these can't be known beforehand */
+	scene->object_manager->device_update_patch_map_offsets(device, dscene, scene);
+
 	device_update_attributes(device, dscene, scene, progress);
 	if(progress.get_cancel()) return;
 
@@ -1433,7 +1725,9 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 			num_bvh++;
 		}
 	}
+
 	TaskPool pool;
+
 	foreach(Mesh *mesh, scene->meshes) {
 		if(mesh->need_update) {
 			pool.push(function_bind(&Mesh::compute_bvh,
@@ -1448,6 +1742,7 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 			}
 		}
 	}
+
 	TaskPool::Summary summary;
 	pool.wait_work(&summary);
 	VLOG(2) << "Objects BVH build pool statistics:\n"
@@ -1505,8 +1800,11 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene)
 	device->tex_free(dscene->tri_shader);
 	device->tex_free(dscene->tri_vnormal);
 	device->tex_free(dscene->tri_vindex);
+	device->tex_free(dscene->tri_patch);
+	device->tex_free(dscene->tri_patch_uv);
 	device->tex_free(dscene->curves);
 	device->tex_free(dscene->curve_keys);
+	device->tex_free(dscene->patches);
 	device->tex_free(dscene->attributes_map);
 	device->tex_free(dscene->attributes_float);
 	device->tex_free(dscene->attributes_float3);
@@ -1523,8 +1821,11 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene)
 	dscene->tri_shader.clear();
 	dscene->tri_vnormal.clear();
 	dscene->tri_vindex.clear();
+	dscene->tri_patch.clear();
+	dscene->tri_patch_uv.clear();
 	dscene->curves.clear();
 	dscene->curve_keys.clear();
+	dscene->patches.clear();
 	dscene->attributes_map.clear();
 	dscene->attributes_float.clear();
 	dscene->attributes_float3.clear();
@@ -1574,77 +1875,5 @@ bool Mesh::need_attribute(Scene * /*scene*/, ustring name)
 	return false;
 }
 
-void Mesh::tessellate(DiagSplit *split)
-{
-	int num_faces = num_triangles();
-
-	add_face_normals();
-	add_vertex_normals();
-
-	Attribute *attr_fN = attributes.find(ATTR_STD_FACE_NORMAL);
-	float3 *fN = attr_fN->data_float3();
-
-	Attribute *attr_vN = attributes.find(ATTR_STD_VERTEX_NORMAL);
-	float3 *vN = attr_vN->data_float3();
-
-	for(int f = 0; f < num_faces; f++) {
-		if(!forms_quad[f]) {
-			/* triangle */
-			LinearTrianglePatch patch;
-			Triangle triangle = get_triangle(f);
-			float3 *hull = patch.hull;
-			float3 *normals = patch.normals;
-
-			for(int i = 0; i < 3; i++) {
-				hull[i] = verts[triangle.v[i]];
-			}
-
-			if(smooth[f]) {
-				for(int i = 0; i < 3; i++) {
-					normals[i] = vN[triangle.v[i]];
-				}
-			}
-			else {
-				for(int i = 0; i < 3; i++) {
-					normals[i] = fN[f];
-				}
-			}
-
-			split->split_triangle(&patch);
-		}
-		else {
-			/* quad */
-			LinearQuadPatch patch;
-			Triangle triangle0 = get_triangle(f);
-			Triangle triangle1 = get_triangle(f+1);
-			float3 *hull = patch.hull;
-			float3 *normals = patch.normals;
-
-			hull[0] = verts[triangle0.v[0]];
-			hull[1] = verts[triangle0.v[1]];
-			hull[3] = verts[triangle0.v[2]];
-			hull[2] = verts[triangle1.v[2]];
-
-			if(smooth[f]) {
-				normals[0] = vN[triangle0.v[0]];
-				normals[1] = vN[triangle0.v[1]];
-				normals[3] = vN[triangle0.v[2]];
-				normals[2] = vN[triangle1.v[2]];
-			}
-			else {
-				for(int i = 0; i < 4; i++) {
-					normals[i] = fN[f];
-				}
-			}
-
-			split->split_quad(&patch);
-
-			// consume second triangle in quad
-			f++;
-		}
-
-	}
-}
-
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h
index 0aea55544f2..eff5c50e635 100644
--- a/intern/cycles/render/mesh.h
+++ b/intern/cycles/render/mesh.h
@@ -40,6 +40,7 @@ class Scene;
 class SceneParams;
 class AttributeRequest;
 class DiagSplit;
+struct PackedPatchTable;
 
 /* Mesh */
 
@@ -97,15 +98,32 @@ public:
 		return curve_first_key.size();
 	}
 
-	/* Displacement */
-	enum DisplacementMethod {
-		DISPLACE_BUMP = 0,
-		DISPLACE_TRUE = 1,
-		DISPLACE_BOTH = 2,
+	/* Mesh SubdFace */
+	struct SubdFace {
+		int start_corner; /* index of this face's first entry in Mesh::subd_face_corners */
+		int num_corners; /* number of corners (vertices) of this face */
+		int shader;
+		bool smooth;
+		int ptex_offset;
+
+		bool is_quad() const { return num_corners == 4; }
+		float3 normal(const Mesh *mesh) const;
+		int num_ptex_faces() const { return num_corners == 4 ? 1 : num_corners; } /* 1 for quads, else one per corner */
+	};
+
+	struct SubdEdgeCrease {
+		int v[2];
+		float crease;
+	};
 
-		DISPLACE_NUM_METHODS,
+	enum SubdivisionType {
+		SUBDIVISION_NONE,
+		SUBDIVISION_LINEAR,
+		SUBDIVISION_CATMULL_CLARK,
 	};
 
+	SubdivisionType subdivision_type;
+
 	/* Mesh Data */
 	enum GeometryFlags {
 		GEOMETRY_NONE      = 0,
@@ -119,7 +137,10 @@ public:
 	array<float3> verts;
 	array<int> shader;
 	array<bool> smooth;
-	array<bool> forms_quad; /* used to tell if triangle is part of a quad patch */
+
+	/* used for storing patch info for subd triangles, only allocated if there are patches */
+	array<int> triangle_patch; /* must be < 0 for non subd triangles */
+	array<float2> vert_patch_uv;
 
 	bool has_volume;  /* Set in the device_update_flags(). */
 	bool has_surface_bssrdf;  /* Set in the device_update_flags(). */
@@ -129,15 +150,23 @@ public:
 	array<int> curve_first_key;
 	array<int> curve_shader;
 
+	array<SubdFace> subd_faces;
+	array<int> subd_face_corners;
+	int num_ngons;
+
+	array<SubdEdgeCrease> subd_creases;
+
 	vector<Shader*> used_shaders;
 	AttributeSet attributes;
 	AttributeSet curve_attributes;
+	AttributeSet subd_attributes;
 
 	BoundBox bounds;
 	bool transform_applied;
 	bool transform_negative_scaled;
 	Transform transform_normal;
-	DisplacementMethod displacement_method;
+
+	PackedPatchTable *patch_table;
 
 	uint motion_steps;
 	bool use_motion_blur;
@@ -154,6 +183,13 @@ public:
 	size_t curve_offset;
 	size_t curvekey_offset;
 
+	size_t patch_offset;
+	size_t patch_table_offset;
+	size_t face_offset;
+	size_t corner_offset;
+
+	size_t num_subd_verts;
+
 	/* Functions */
 	Mesh();
 	~Mesh();
@@ -162,12 +198,15 @@ public:
 	void reserve_mesh(int numverts, int numfaces);
 	void resize_curves(int numcurves, int numkeys);
 	void reserve_curves(int numcurves, int numkeys);
+	void resize_subd_faces(int numfaces, int num_ngons, int numcorners);
+	void reserve_subd_faces(int numfaces, int num_ngons, int numcorners);
 	void clear();
 	void add_vertex(float3 P);
 	void add_vertex_slow(float3 P);
-	void add_triangle(int v0, int v1, int v2, int shader, bool smooth, bool forms_quad = false);
+	void add_triangle(int v0, int v1, int v2, int shader, bool smooth);
 	void add_curve_key(float3 loc, float radius);
 	void add_curve(int first_key, int shader);
+	void add_subd_face(int* corners, int num_corners, int shader_, bool smooth_);
 	int split_vertex(int vertex);
 
 	void compute_bounds();
@@ -177,9 +216,13 @@ public:
 	void pack_normals(Scene *scene, uint *shader, float4 *vnormal);
 	void pack_verts(const vector<uint>& tri_prim_index,
 	                uint4 *tri_vindex,
+	                uint *tri_patch,
+	                float2 *tri_patch_uv,
 	                size_t vert_offset,
 	                size_t tri_offset);
 	void pack_curves(Scene *scene, float4 *curve_key_co, float4 *curve_data, size_t curvekey_offset);
+	void pack_patches(uint *patch_data, uint vert_offset, uint face_offset, uint corner_offset);
+
 	void compute_bvh(DeviceScene *dscene,
 	                 SceneParams *params,
 	                 Progress *progress,
@@ -192,6 +235,7 @@ public:
 	void tag_update(Scene *scene, bool rebuild);
 
 	bool has_motion_blur() const;
+	bool has_true_displacement() const;
 
 	/* Check whether the mesh should have own BVH built separately. Briefly,
 	 * own BVH is needed for mesh, if:
diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp
index 95f46ff02a2..ef9cfedd412 100644
--- a/intern/cycles/render/mesh_displace.cpp
+++ b/intern/cycles/render/mesh_displace.cpp
@@ -26,19 +26,27 @@
 
 CCL_NAMESPACE_BEGIN
 
+/* Unit normal of triangle t, with its corners looked up in verts.
+ * A triangle whose cross product has zero length yields the +X axis. */
+static float3 compute_face_normal(const Mesh::Triangle& t, float3 *verts)
+{
+	const float3 origin = verts[t.v[0]];
+	const float3 edge_a = verts[t.v[1]] - origin;
+	const float3 edge_b = verts[t.v[2]] - origin;
+
+	const float3 n = cross(edge_a, edge_b);
+	const float n_len = len(n);
+
+	/* only an exactly zero length is treated as degenerate */
+	return (n_len == 0.0f) ? make_float3(1.0f, 0.0f, 0.0f) : n / n_len;
+}
+
 bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Mesh *mesh, Progress& progress)
 {
 	/* verify if we have a displacement shader */
-	bool has_displacement = false;
-
-	if(mesh->displacement_method != Mesh::DISPLACE_BUMP) {
-		foreach(Shader *shader, mesh->used_shaders)
-			if(shader->has_displacement)
-				has_displacement = true;
-	}
-	
-	if(!has_displacement)
+	if(!mesh->has_true_displacement()) {
 		return false;
+	}
 
 	string msg = string_printf("Computing Displacement %s", mesh->name.c_str());
 	progress.set_status("Updating Mesh", msg);
@@ -67,8 +75,9 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
 		Shader *shader = (shader_index < mesh->used_shaders.size()) ?
 			mesh->used_shaders[shader_index] : scene->default_surface;
 
-		if(!shader->has_displacement)
+		if(!shader->has_displacement || shader->displacement_method == DISPLACE_BUMP) {
 			continue;
+		}
 
 		for(int j = 0; j < 3; j++) {
 			if(done[t.v[j]])
@@ -153,8 +162,9 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
 		Shader *shader = (shader_index < mesh->used_shaders.size()) ?
 			mesh->used_shaders[shader_index] : scene->default_surface;
 
-		if(!shader->has_displacement)
+		if(!shader->has_displacement || shader->displacement_method == DISPLACE_BUMP) {
 			continue;
+		}
 
 		for(int j = 0; j < 3; j++) {
 			if(!done[t.v[j]]) {
@@ -178,9 +188,131 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
 	mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
 	mesh->add_face_normals();
 
-	if(mesh->displacement_method == Mesh::DISPLACE_TRUE) {
-		mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
-		mesh->add_vertex_normals();
+	bool need_recompute_vertex_normals = false;
+
+	foreach(Shader *shader, mesh->used_shaders) {
+		if(shader->has_displacement && shader->displacement_method == DISPLACE_TRUE) {
+			need_recompute_vertex_normals = true;
+			break;
+		}
+	}
+
+	if(need_recompute_vertex_normals) {
+		bool flip = mesh->transform_negative_scaled;
+		vector<bool> tri_has_true_disp(num_triangles, false);
+
+		for(size_t i = 0; i < num_triangles; i++) {
+			int shader_index = mesh->shader[i];
+			Shader *shader = (shader_index < mesh->used_shaders.size()) ?
+				mesh->used_shaders[shader_index] : scene->default_surface;
+
+			tri_has_true_disp[i] = shader->has_displacement && shader->displacement_method == DISPLACE_TRUE;
+		}
+
+		/* static vertex normals */
+
+		/* get attributes */
+		Attribute *attr_fN = mesh->attributes.find(ATTR_STD_FACE_NORMAL);
+		Attribute *attr_vN = mesh->attributes.find(ATTR_STD_VERTEX_NORMAL);
+
+		float3 *fN = attr_fN->data_float3();
+		float3 *vN = attr_vN->data_float3();
+
+		/* compute vertex normals */
+
+		/* zero vertex normals on triangles with true displacement */
+		for(size_t i = 0; i < num_triangles; i++) {
+			if(tri_has_true_disp[i]) {
+				for(size_t j = 0; j < 3; j++) {
+					vN[mesh->get_triangle(i).v[j]] = make_float3(0.0f, 0.0f, 0.0f);
+				}
+			}
+		}
+
+		/* add face normals to vertex normals */
+		for(size_t i = 0; i < num_triangles; i++) {
+			if(tri_has_true_disp[i]) {
+				for(size_t j = 0; j < 3; j++) {
+					vN[mesh->get_triangle(i).v[j]] += fN[i];
+				}
+			}
+		}
+
+		/* normalize vertex normals */
+		done.clear();
+		done.resize(num_verts, false);
+
+		for(size_t i = 0; i < num_triangles; i++) {
+			if(tri_has_true_disp[i]) {
+				for(size_t j = 0; j < 3; j++) {
+					int vert = mesh->get_triangle(i).v[j];
+
+					if(done[vert]) {
+						continue;
+					}
+
+					vN[vert] = normalize(vN[vert]);
+					if(flip)
+						vN[vert] = -vN[vert];
+
+					done[vert] = true;
+				}
+			}
+		}
+
+		/* motion vertex normals */
+		Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+		Attribute *attr_mN = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_NORMAL);
+
+		if(mesh->has_motion_blur() && attr_mP && attr_mN) {
+			for(int step = 0; step < mesh->motion_steps - 1; step++) {
+				float3 *mP = attr_mP->data_float3() + step*mesh->verts.size();
+				float3 *mN = attr_mN->data_float3() + step*mesh->verts.size();
+
+				/* compute */
+
+				/* zero vertex normals on triangles with true displacement */
+				for(size_t i = 0; i < num_triangles; i++) {
+					if(tri_has_true_disp[i]) {
+						for(size_t j = 0; j < 3; j++) {
+							mN[mesh->get_triangle(i).v[j]] = make_float3(0.0f, 0.0f, 0.0f);
+						}
+					}
+				}
+
+				/* add face normals to vertex normals */
+				for(size_t i = 0; i < num_triangles; i++) {
+					if(tri_has_true_disp[i]) {
+						float3 face_N = compute_face_normal(mesh->get_triangle(i), mP); /* same for all 3 corners */
+						for(size_t j = 0; j < 3; j++) {
+							mN[mesh->get_triangle(i).v[j]] += face_N;
+						}
+					}
+				}
+
+				/* normalize vertex normals */
+				done.clear();
+				done.resize(num_verts, false);
+
+				for(size_t i = 0; i < num_triangles; i++) {
+					if(tri_has_true_disp[i]) {
+						for(size_t j = 0; j < 3; j++) {
+							int vert = mesh->get_triangle(i).v[j];
+
+							if(done[vert]) {
+								continue;
+							}
+
+							mN[vert] = normalize(mN[vert]);
+							if(flip)
+								mN[vert] = -mN[vert];
+
+							done[vert] = true;
+						}
+					}
+				}
+			}
+		}
 	}
 
 	return true;
diff --git a/intern/cycles/render/mesh_subdivision.cpp b/intern/cycles/render/mesh_subdivision.cpp
new file mode 100644
index 00000000000..efb40efbb79
--- /dev/null
+++ b/intern/cycles/render/mesh_subdivision.cpp
@@ -0,0 +1,567 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "mesh.h"
+#include "attribute.h"
+
+#include "subd_split.h"
+#include "subd_patch.h"
+#include "subd_patch_table.h"
+
+#include "util_foreach.h"
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef WITH_OPENSUBDIV
+
+CCL_NAMESPACE_END
+
+#include <opensubdiv/far/topologyRefinerFactory.h>
+#include <opensubdiv/far/primvarRefiner.h>
+#include <opensubdiv/far/patchTableFactory.h>
+#include <opensubdiv/far/patchMap.h>
+
+/* specializations of TopologyRefinerFactory for ccl::Mesh */
+
+namespace OpenSubdiv {
+namespace OPENSUBDIV_VERSION {
+namespace Far {
+	/* Declare the base level sizes: vertex count, face count and corners per face. */
+	template<>
+	bool TopologyRefinerFactory<ccl::Mesh>::resizeComponentTopology(TopologyRefiner& refiner, ccl::Mesh const& mesh)
+	{
+		const int num_faces = (int)mesh.subd_faces.size();
+		setNumBaseVertices(refiner, mesh.verts.size());
+		setNumBaseFaces(refiner, num_faces);
+
+		/* index per face instead of taking &subd_faces[0], which is invalid for an empty array */
+		for(int i = 0; i < num_faces; i++) {
+			setNumBaseFaceVertices(refiner, i, mesh.subd_faces[i].num_corners);
+		}
+		return true;
+	}
+
+	/* Fill in the vertex indices of every base face from the mesh corner array. */
+	template<>
+	bool TopologyRefinerFactory<ccl::Mesh>::assignComponentTopology(TopologyRefiner& refiner, ccl::Mesh const& mesh)
+	{
+		const int num_faces = (int)mesh.subd_faces.size();
+
+		/* index per face instead of taking &subd_faces[0], which is invalid for an empty array */
+		for(int i = 0; i < num_faces; i++) {
+			const ccl::Mesh::SubdFace& face = mesh.subd_faces[i];
+			IndexArray face_verts = getBaseFaceVertices(refiner, i);
+
+			for(int j = 0; j < face.num_corners; j++) {
+				face_verts[j] = mesh.subd_face_corners[face.start_corner + j];
+			}
+		}
+		return true;
+	}
+
+	/* Tag creased base edges, and sharpen vertices that sit on exactly two edges. */
+	template<>
+	bool TopologyRefinerFactory<ccl::Mesh>::assignComponentTags(TopologyRefiner& refiner, ccl::Mesh const& mesh)
+	{
+		const ccl::Mesh::SubdEdgeCrease* creases = mesh.subd_creases.data();
+
+		for(int c = 0; c < mesh.subd_creases.size(); c++) {
+			Index edge = findBaseEdge(refiner, creases[c].v[0], creases[c].v[1]);
+			/* creases whose end points do not form a base edge are skipped */
+			if(edge == INDEX_INVALID)
+				continue;
+			setBaseEdgeSharpness(refiner, edge, creases[c].crease * 10.0f);
+		}
+
+		for(int v = 0; v < mesh.verts.size(); v++) {
+			ConstIndexArray edges = getBaseVertexEdges(refiner, v);
+			if(edges.size() != 2)
+				continue;
+
+			/* vertex sharpness is the smaller of its two edge sharpness values */
+			float sharp0 = refiner.getLevel(0).getEdgeSharpness(edges[0]);
+			float sharp1 = refiner.getLevel(0).getEdgeSharpness(edges[1]);
+			setBaseVertexSharpness(refiner, v, std::min(sharp0, sharp1));
+		}
+		return true;
+	}
+
+	template<>
+	bool TopologyRefinerFactory<ccl::Mesh>::assignFaceVaryingTopology(TopologyRefiner& /*refiner*/, ccl::Mesh const& /*mesh*/)
+	{
+		return true; /* stub: both arguments are ignored and success is always reported */
+	}
+
+	template<>
+	void TopologyRefinerFactory<ccl::Mesh>::reportInvalidTopology(TopologyError /*err_code*/,
+		char const */*msg*/, ccl::Mesh const& /*mesh*/)
+	{ /* intentionally empty: the error code, message and mesh are all discarded */
+	}
+} /* namespace Far */
+} /* namespace OPENSUBDIV_VERSION */
+} /* namespace OpenSubdiv */
+
+CCL_NAMESPACE_BEGIN
+
+using namespace OpenSubdiv;
+
+/* struct that implements OpenSubdiv's vertex interface */
+
+template<typename T>
+struct OsdValue {
+	T value;
+
+	OsdValue() {}
+
+	/* reset the stored value to all-zero bytes */
+	void Clear(void* = 0) { memset(&value, 0, sizeof(T)); }
+
+	/* accumulate a weighted contribution: value += src.value * weight */
+	void AddWithWeight(OsdValue<T> const& src, float weight) {
+		value += src.value * weight;
+	}
+};
+
+/* Byte colors: each channel adds its weighted source value, truncated to uchar. */
+template<>
+void OsdValue<uchar4>::AddWithWeight(OsdValue<uchar4> const& src, float weight)
+{
+	for(int c = 0; c < 4; c++)
+		value[c] += (uchar)(src.value[c] * weight);
+}
+
+/* class for holding OpenSubdiv data used during tessellation */
+
+class OsdData {
+	Mesh* mesh; /* source mesh, set by build_from_mesh(); not deleted by this class */
+	vector<OsdValue<float3> > verts; /* base verts, then refined verts, then patch local points */
+	Far::TopologyRefiner* refiner; /* deleted in the destructor */
+	Far::PatchTable* patch_table; /* deleted in the destructor */
+	Far::PatchMap* patch_map; /* deleted in the destructor */
+
+public:
+	OsdData() : mesh(NULL), refiner(NULL), patch_table(NULL), patch_map(NULL) {}
+	/* release the OpenSubdiv objects created by build_from_mesh() */
+	~OsdData()
+	{
+		delete refiner;
+		delete patch_table;
+		delete patch_map;
+	}
+	/* Build the refiner, patch table, refined vertex positions and patch map for mesh_. */
+	void build_from_mesh(Mesh* mesh_)
+	{
+		mesh = mesh_;
+
+		/* type and options: Catmull-Clark scheme with edge-only vertex boundary interpolation */
+		Sdc::SchemeType type = Sdc::SCHEME_CATMARK;
+
+		Sdc::Options options;
+		options.SetVtxBoundaryInterpolation(Sdc::Options::VTX_BOUNDARY_EDGE_ONLY);
+
+		/* create refiner (uses the TopologyRefinerFactory<Mesh> specializations in this file) */
+		refiner = Far::TopologyRefinerFactory<Mesh>::Create(*mesh,
+				Far::TopologyRefinerFactory<Mesh>::Options(type, options));
+
+		/* adaptive refinement, with the isolation level hard-coded to 10 */
+		int max_isolation = 10;
+		refiner->RefineAdaptive(Far::TopologyRefiner::AdaptiveOptions(max_isolation));
+
+		/* create patch table, using the Gregory basis end cap type */
+		Far::PatchTableFactory::Options patch_options;
+		patch_options.endCapType = Far::PatchTableFactory::Options::ENDCAP_GREGORY_BASIS;
+
+		patch_table = Far::PatchTableFactory::Create(*refiner, patch_options);
+
+		/* interpolate verts: copy the base level first, then refine each level in turn */
+		int num_refiner_verts = refiner->GetNumVerticesTotal();
+		int num_local_points = patch_table->GetNumLocalPoints();
+
+		verts.resize(num_refiner_verts + num_local_points);
+		for(int i = 0; i < mesh->verts.size(); i++) {
+			verts[i].value = mesh->verts[i];
+		}
+		/* src/dest step through consecutive per-level vertex ranges of the verts array */
+		OsdValue<float3>* src = &verts[0];
+		for(int i = 0; i < refiner->GetMaxLevel(); i++) {
+			OsdValue<float3>* dest = src + refiner->GetLevel(i).GetNumVertices();
+			Far::PrimvarRefiner(*refiner).Interpolate(i+1, src, dest);
+			src = dest;
+		}
+		/* patch local points are written after all refiner vertices */
+		patch_table->ComputeLocalPointValues(&verts[0], &verts[num_refiner_verts]);
+
+		/* create patch map */
+		patch_map = new Far::PatchMap(*patch_table);
+	}
+	/* Refine a per-vertex attribute in place; corner attributes are not handled yet. */
+	void subdivide_attribute(Attribute& attr)
+	{
+		Far::PrimvarRefiner primvar_refiner(*refiner);
+
+		if(attr.element == ATTR_ELEMENT_VERTEX) {
+			int num_refiner_verts = refiner->GetNumVerticesTotal();
+			int num_local_points = patch_table->GetNumLocalPoints();
+
+			attr.resize(num_refiner_verts + num_local_points);
+			attr.flags |= ATTR_FINAL_SIZE;
+			/* NOTE(review): any non-float storage is interpolated as float4 below -- confirm for all vertex attribute types */
+			char* src = &attr.buffer[0];
+
+			for(int i = 0; i < refiner->GetMaxLevel(); i++) {
+				char* dest = src + refiner->GetLevel(i).GetNumVertices() * attr.data_sizeof();
+
+				if(attr.same_storage(attr.type, TypeDesc::TypeFloat)) {
+					primvar_refiner.Interpolate(i+1, (OsdValue<float>*)src, (OsdValue<float>*&)dest);
+				}
+				else {
+					primvar_refiner.Interpolate(i+1, (OsdValue<float4>*)src, (OsdValue<float4>*&)dest);
+				}
+
+				src = dest;
+			}
+			/* local point values are written after the refiner vertices, as in build_from_mesh() */
+			if(attr.same_storage(attr.type, TypeDesc::TypeFloat)) {
+				patch_table->ComputeLocalPointValues((OsdValue<float>*)&attr.buffer[0],
+					                                 (OsdValue<float>*)&attr.buffer[num_refiner_verts * attr.data_sizeof()]);
+			}
+			else {
+				patch_table->ComputeLocalPointValues((OsdValue<float4>*)&attr.buffer[0],
+					                                 (OsdValue<float4>*)&attr.buffer[num_refiner_verts * attr.data_sizeof()]);
+			}
+		}
+		else if(attr.element == ATTR_ELEMENT_CORNER || attr.element == ATTR_ELEMENT_CORNER_BYTE) {
+			// TODO(mai): fvar interpolation
+		}
+	}
+
+	friend struct OsdPatch;
+	friend class Mesh;
+};
+
+/* ccl::Patch implementation that uses OpenSubdiv for eval */
+
+struct OsdPatch : Patch {
+	OsdData* osd_data; /* not owned; supplies the patch map, patch table and control points */
+
+	OsdPatch(OsdData* data) : osd_data(data) {}
+
+	/* Evaluate position, derivatives and normal at (u, v); NULL outputs are skipped. */
+	void eval(float3 *P, float3 *dPdu, float3 *dPdv, float3 *N, float u, float v)
+	{
+		const Far::PatchTable::PatchHandle* handle = osd_data->patch_map->FindPatch(patch_index, u, v);
+		assert(handle);
+		float p_weights[20], du_weights[20], dv_weights[20];
+		osd_data->patch_table->EvaluateBasis(*handle, u, v, p_weights, du_weights, dv_weights);
+		Far::ConstIndexArray cv = osd_data->patch_table->GetPatchVertices(*handle);
+
+		/* weighted sums of the patch control points */
+		float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+		float3 du = pos, dv = pos;
+		for(int i = 0; i < cv.size(); i++) {
+			float3 p = osd_data->verts[cv[i]].value;
+			pos += p * p_weights[i];
+			du += p * du_weights[i];
+			dv += p * dv_weights[i];
+		}
+		if(P) *P = pos;
+		if(dPdu) *dPdu = du;
+		if(dPdv) *dPdv = dv;
+		if(N) {
+			/* degenerate derivatives give a zero-length cross product: fall back to +Z, not NaN */
+			float3 n = cross(du, dv);
+			float t = len(n);
+			*N = (t != 0.0f) ? n/t : make_float3(0.0f, 0.0f, 1.0f);
+		}
+	}
+	BoundBox bound() { return BoundBox::empty; }
+};
+
+#endif
+
+/* Feed every subd face to `split` as quad patches, then fill ngon center values of attributes. */
+void Mesh::tessellate(DiagSplit *split)
+{
+#ifdef WITH_OPENSUBDIV
+	OsdData osd_data;
+	bool need_packed_patch_table = false;
+
+	if(subdivision_type == SUBDIVISION_CATMULL_CLARK) {
+		osd_data.build_from_mesh(this);
+	}
+	else
+#endif
+	{
+		/* force linear subdivision if OpenSubdiv is unavailable to avoid
+		 * falling into catmull-clark code paths by accident */
+		subdivision_type = SUBDIVISION_LINEAR;
+
+		/* force disable attribute subdivision for same reason as above */
+		foreach(Attribute& attr, subd_attributes.attributes) {
+			attr.flags &= ~ATTR_SUBDIVIDED;
+		}
+	}
+
+	int num_faces = subd_faces.size();
+
+	Attribute *attr_vN = subd_attributes.find(ATTR_STD_VERTEX_NORMAL);
+	float3* vN = attr_vN ? attr_vN->data_float3() : NULL; /* may be missing; smooth paths then use face normals */
+
+	for(int f = 0; f < num_faces; f++) {
+		SubdFace& face = subd_faces[f];
+
+		if(face.is_quad()) {
+			/* quad */
+			QuadDice::SubPatch subpatch;
+
+			LinearQuadPatch quad_patch;
+#ifdef WITH_OPENSUBDIV
+			OsdPatch osd_patch(&osd_data);
+
+			if(subdivision_type == SUBDIVISION_CATMULL_CLARK) {
+				osd_patch.patch_index = face.ptex_offset;
+
+				subpatch.patch = &osd_patch;
+			}
+			else
+#endif
+			{
+				float3 *hull = quad_patch.hull;
+				float3 *normals = quad_patch.normals;
+
+				quad_patch.patch_index = face.ptex_offset;
+
+				for(int i = 0; i < 4; i++) {
+					hull[i] = verts[subd_face_corners[face.start_corner+i]];
+				}
+
+				if(face.smooth && vN) {
+					for(int i = 0; i < 4; i++) {
+						normals[i] = vN[subd_face_corners[face.start_corner+i]];
+					}
+				}
+				else {
+					float3 N = face.normal(this);
+					for(int i = 0; i < 4; i++) {
+						normals[i] = N;
+					}
+				}
+
+				swap(hull[2], hull[3]);
+				swap(normals[2], normals[3]);
+
+				subpatch.patch = &quad_patch;
+			}
+
+			subpatch.patch->shader = face.shader;
+
+			/* Quad faces need to be split at least once to line up with split ngons, we do this
+			 * here in this manner because if we do it later edge factors may end up slightly off.
+			 */
+			subpatch.P00 = make_float2(0.0f, 0.0f);
+			subpatch.P10 = make_float2(0.5f, 0.0f);
+			subpatch.P01 = make_float2(0.0f, 0.5f);
+			subpatch.P11 = make_float2(0.5f, 0.5f);
+			split->split_quad(subpatch.patch, &subpatch);
+
+			subpatch.P00 = make_float2(0.5f, 0.0f);
+			subpatch.P10 = make_float2(1.0f, 0.0f);
+			subpatch.P01 = make_float2(0.5f, 0.5f);
+			subpatch.P11 = make_float2(1.0f, 0.5f);
+			split->split_quad(subpatch.patch, &subpatch);
+
+			subpatch.P00 = make_float2(0.0f, 0.5f);
+			subpatch.P10 = make_float2(0.5f, 0.5f);
+			subpatch.P01 = make_float2(0.0f, 1.0f);
+			subpatch.P11 = make_float2(0.5f, 1.0f);
+			split->split_quad(subpatch.patch, &subpatch);
+
+			subpatch.P00 = make_float2(0.5f, 0.5f);
+			subpatch.P10 = make_float2(1.0f, 0.5f);
+			subpatch.P01 = make_float2(0.5f, 1.0f);
+			subpatch.P11 = make_float2(1.0f, 1.0f);
+			split->split_quad(subpatch.patch, &subpatch);
+		}
+		else {
+			/* ngon */
+#ifdef WITH_OPENSUBDIV
+			if(subdivision_type == SUBDIVISION_CATMULL_CLARK) {
+				OsdPatch patch(&osd_data);
+
+				patch.shader = face.shader;
+
+				for(int corner = 0; corner < face.num_corners; corner++) {
+					patch.patch_index = face.ptex_offset + corner;
+
+					split->split_quad(&patch);
+				}
+			}
+			else
+#endif
+			{
+				float3 center_vert = make_float3(0.0f, 0.0f, 0.0f);
+				float3 center_normal = make_float3(0.0f, 0.0f, 0.0f);
+
+				float inv_num_corners = 1.0f/float(face.num_corners);
+				for(int corner = 0; corner < face.num_corners; corner++) {
+					center_vert += verts[subd_face_corners[face.start_corner + corner]] * inv_num_corners;
+					if(vN) center_normal += vN[subd_face_corners[face.start_corner + corner]] * inv_num_corners;
+				}
+
+				for(int corner = 0; corner < face.num_corners; corner++) {
+					LinearQuadPatch patch;
+					float3 *hull = patch.hull;
+					float3 *normals = patch.normals;
+
+					patch.patch_index = face.ptex_offset + corner;
+
+					patch.shader = face.shader;
+
+					hull[0] = verts[subd_face_corners[face.start_corner + mod(corner + 0, face.num_corners)]];
+					hull[1] = verts[subd_face_corners[face.start_corner + mod(corner + 1, face.num_corners)]];
+					hull[2] = verts[subd_face_corners[face.start_corner + mod(corner - 1, face.num_corners)]];
+					hull[3] = center_vert;
+					/* hull[1] and hull[2] become the midpoints of the two edges adjacent to this corner */
+					hull[1] = (hull[1] + hull[0]) * 0.5;
+					hull[2] = (hull[2] + hull[0]) * 0.5;
+
+					if(face.smooth && vN) {
+						normals[0] = vN[subd_face_corners[face.start_corner + mod(corner + 0, face.num_corners)]];
+						normals[1] = vN[subd_face_corners[face.start_corner + mod(corner + 1, face.num_corners)]];
+						normals[2] = vN[subd_face_corners[face.start_corner + mod(corner - 1, face.num_corners)]];
+						normals[3] = center_normal;
+
+						normals[1] = (normals[1] + normals[0]) * 0.5;
+						normals[2] = (normals[2] + normals[0]) * 0.5;
+					}
+					else {
+						float3 N = face.normal(this);
+						for(int i = 0; i < 4; i++) {
+							normals[i] = N;
+						}
+					}
+
+					split->split_quad(&patch);
+				}
+			}
+		}
+	}
+
+	/* interpolate center points for attributes */
+	foreach(Attribute& attr, subd_attributes.attributes) {
+#ifdef WITH_OPENSUBDIV
+		if(subdivision_type == SUBDIVISION_CATMULL_CLARK && attr.flags & ATTR_SUBDIVIDED) {
+			if(attr.element == ATTR_ELEMENT_CORNER || attr.element == ATTR_ELEMENT_CORNER_BYTE) {
+				/* keep subdivision for corner attributes disabled for now */
+				attr.flags &= ~ATTR_SUBDIVIDED;
+			}
+			else {
+				osd_data.subdivide_attribute(attr);
+
+				need_packed_patch_table = true;
+				continue;
+			}
+		}
+#endif
+
+		char* data = attr.data();
+		size_t stride = attr.data_sizeof();
+		int ngons = 0;
+
+		switch(attr.element) {
+			case ATTR_ELEMENT_VERTEX: {
+				for(int f = 0; f < num_faces; f++) {
+					SubdFace& face = subd_faces[f];
+
+					if(!face.is_quad()) {
+						char* center = data + (verts.size() - num_subd_verts + ngons) * stride;
+						attr.zero_data(center);
+
+						float inv_num_corners = 1.0f / float(face.num_corners);
+
+						for(int corner = 0; corner < face.num_corners; corner++) {
+							attr.add_with_weight(center,
+							                     data + subd_face_corners[face.start_corner + corner] * stride,
+							                     inv_num_corners);
+						}
+
+						ngons++;
+					}
+				}
+			} break;
+			case ATTR_ELEMENT_VERTEX_MOTION: {
+				// TODO(mai): implement
+			} break;
+			case ATTR_ELEMENT_CORNER: {
+				for(int f = 0; f < num_faces; f++) {
+					SubdFace& face = subd_faces[f];
+
+					if(!face.is_quad()) {
+						char* center = data + (subd_face_corners.size() + ngons) * stride;
+						attr.zero_data(center);
+
+						float inv_num_corners = 1.0f / float(face.num_corners);
+
+						for(int corner = 0; corner < face.num_corners; corner++) {
+							attr.add_with_weight(center,
+							                     data + (face.start_corner + corner) * stride,
+							                     inv_num_corners);
+						}
+
+						ngons++;
+					}
+				}
+			} break;
+			case ATTR_ELEMENT_CORNER_BYTE: {
+				for(int f = 0; f < num_faces; f++) {
+					SubdFace& face = subd_faces[f];
+
+					if(!face.is_quad()) {
+						uchar* center = (uchar*)data + (subd_face_corners.size() + ngons) * stride;
+
+						float inv_num_corners = 1.0f / float(face.num_corners);
+						float4 val = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+						/* read channels as unsigned: plain char may be signed and turn values above 127 negative */
+						for(int corner = 0; corner < face.num_corners; corner++) {
+							for(int i = 0; i < 4; i++) {
+								val[i] += float(*((uchar*)data + (face.start_corner + corner) * stride + i)) * inv_num_corners;
+							}
+						}
+
+						for(int i = 0; i < 4; i++) {
+							center[i] = uchar(min(max(val[i], 0.0f), 255.0f));
+						}
+
+						ngons++;
+					}
+				}
+			} break;
+			default: break;
+		}
+	}
+
+#ifdef WITH_OPENSUBDIV
+	/* pack patch tables */
+	if(need_packed_patch_table) {
+		delete patch_table;
+		patch_table = new PackedPatchTable;
+		patch_table->pack(osd_data.patch_table);
+	}
+#endif
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index e26084c690b..4f54b86fe4a 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -20,6 +20,7 @@
 #include "scene.h"
 #include "svm.h"
 #include "svm_color_util.h"
+#include "svm_ramp_util.h"
 #include "svm_math_util.h"
 #include "osl.h"
 #include "constant_fold.h"
@@ -1688,6 +1689,19 @@ void ConvertNode::constant_fold(const ConstantFolder& folder)
 			}
 		}
 	}
+	else {
+		ShaderInput *in = inputs[0];
+		ShaderNode *prev = in->link->parent;
+
+		/* no-op conversion of A to B to A */
+		if(prev->type == node_types[to][from]) {
+			ShaderInput *prev_in = prev->inputs[0];
+
+			if(SocketType::is_float3(from) && (to == SocketType::FLOAT || SocketType::is_float3(to)) && prev_in->link) {
+				folder.bypass(prev_in->link);
+			}
+		}
+	}
 }
 
 void ConvertNode::compile(SVMCompiler& compiler)
@@ -2295,6 +2309,7 @@ NODE_DEFINE(SubsurfaceScatteringNode)
 SubsurfaceScatteringNode::SubsurfaceScatteringNode()
 : BsdfNode(node_type)
 {
+	closure = falloff;
 }
 
 void SubsurfaceScatteringNode::compile(SVMCompiler& compiler)
@@ -2305,6 +2320,7 @@ void SubsurfaceScatteringNode::compile(SVMCompiler& compiler)
 
 void SubsurfaceScatteringNode::compile(OSLCompiler& compiler)
 {
+	closure = falloff;
 	compiler.parameter(this, "falloff");
 	compiler.add(this, "node_subsurface_scattering");
 }
@@ -2323,7 +2339,7 @@ NODE_DEFINE(EmissionNode)
 	NodeType* type = NodeType::add("emission", create, NodeType::SHADER);
 
 	SOCKET_IN_COLOR(color, "Color", make_float3(0.8f, 0.8f, 0.8f));
-	SOCKET_IN_FLOAT(strength, "Strength", 1.0f);
+	SOCKET_IN_FLOAT(strength, "Strength", 10.0f);
 	SOCKET_IN_FLOAT(surface_mix_weight, "SurfaceMixWeight", 0.0f, SocketType::SVM_INTERNAL);
 
 	SOCKET_OUT_CLOSURE(emission, "Emission");
@@ -2344,7 +2360,7 @@ void EmissionNode::compile(SVMCompiler& compiler)
 	if(color_in->link || strength_in->link) {
 		compiler.add_node(NODE_EMISSION_WEIGHT,
 		                  compiler.stack_assign(color_in),
-						  compiler.stack_assign(strength_in));
+		                  compiler.stack_assign(strength_in));
 	}
 	else
 		compiler.add_node(NODE_CLOSURE_SET_WEIGHT, color * strength);
@@ -3698,44 +3714,11 @@ void MixNode::compile(OSLCompiler& compiler)
 
 void MixNode::constant_fold(const ConstantFolder& folder)
 {
-	ShaderInput *fac_in = input("Fac");
-	ShaderInput *color1_in = input("Color1");
-	ShaderInput *color2_in = input("Color2");
-
-	/* evaluate fully constant node */
 	if(folder.all_inputs_constant()) {
 		folder.make_constant_clamp(svm_mix(type, fac, color1, color2), use_clamp);
-		return;
-	}
-
-	/* remove no-op node when factor is 0.0 */
-	if(!fac_in->link && fac <= 0.0f) {
-		/* note that some of the modes will clamp out of bounds values even without use_clamp */
-		if(type == NODE_MIX_LIGHT || type == NODE_MIX_DODGE || type == NODE_MIX_BURN) {
-			if(!color1_in->link) {
-				folder.make_constant_clamp(svm_mix(type, 0.0f, color1, color1), use_clamp);
-				return;
-			}
-		}
-		else if(folder.try_bypass_or_make_constant(color1_in, color1, use_clamp)) {
-			return;
-		}
 	}
-
-	if(type == NODE_MIX_BLEND) {
-		/* remove useless mix colors nodes */
-		if(color1_in->link ? (color1_in->link == color2_in->link) : (!color2_in->link && color1 == color2)) {
-			if(folder.try_bypass_or_make_constant(color1_in, color1, use_clamp)) {
-				return;
-			}
-		}
-
-		/* remove no-op mix color node when factor is 1.0 */
-		if(!fac_in->link && fac >= 1.0f) {
-			if(folder.try_bypass_or_make_constant(color2_in, color2, use_clamp)) {
-				return;
-			}
-		}
+	else {
+		folder.fold_mix(type, use_clamp);
 	}
 }
 
@@ -4621,6 +4604,9 @@ void MathNode::constant_fold(const ConstantFolder& folder)
 	if(folder.all_inputs_constant()) {
 		folder.make_constant_clamp(svm_math(type, value1, value2), use_clamp);
 	}
+	else {
+		folder.fold_math(type, use_clamp);
+	}
 }
 
 void MathNode::compile(SVMCompiler& compiler)
@@ -4693,6 +4679,9 @@ void VectorMathNode::constant_fold(const ConstantFolder& folder)
 			folder.make_constant(vector);
 		}
 	}
+	else {
+		folder.fold_vector_math(type);
+	}
 }
 
 void VectorMathNode::compile(SVMCompiler& compiler)
@@ -4853,6 +4842,30 @@ CurvesNode::CurvesNode(const NodeType *node_type)
 {
 }
 
+/* Fold a curves node: constant inputs evaluate the curve; Fac == 0 passes the input through. */
+void CurvesNode::constant_fold(const ConstantFolder& folder, ShaderInput *value_in)
+{
+	ShaderInput *fac_in = input("Fac");
+
+	/* evaluate fully constant node; checked first so an unlinked value input never reaches bypass() */
+	if(folder.all_inputs_constant()) {
+		if (curves.size() == 0)
+			return;
+
+		float3 pos = (value - make_float3(min_x, min_x, min_x)) / (max_x - min_x);
+		float3 result;
+		result[0] = rgb_ramp_lookup(curves.data(), pos[0], true, true, curves.size()).x;
+		result[1] = rgb_ramp_lookup(curves.data(), pos[1], true, true, curves.size()).y;
+		result[2] = rgb_ramp_lookup(curves.data(), pos[2], true, true, curves.size()).z;
+
+		folder.make_constant(interp(value, result, fac));
+	}
+	/* remove no-op node, but only when there is an upstream output to bypass to */
+	else if(!fac_in->link && fac == 0.0f && value_in->link) {
+		folder.bypass(value_in->link);
+	}
+}
+
 void CurvesNode::compile(SVMCompiler& compiler, int type, ShaderInput *value_in, ShaderOutput *value_out)
 {
 	if(curves.size() == 0)
@@ -4916,6 +4929,11 @@ RGBCurvesNode::RGBCurvesNode()
 {
 }
 
+void RGBCurvesNode::constant_fold(const ConstantFolder& folder)
+{
+	CurvesNode::constant_fold(folder, input("Color")); /* shared curve folding, applied to the "Color" input */
+}
+
 void RGBCurvesNode::compile(SVMCompiler& compiler)
 {
 	CurvesNode::compile(compiler, NODE_RGB_CURVES, input("Color"), output("Color"));
@@ -4949,6 +4967,11 @@ VectorCurvesNode::VectorCurvesNode()
 {
 }
 
+void VectorCurvesNode::constant_fold(const ConstantFolder& folder)
+{
+	CurvesNode::constant_fold(folder, input("Vector")); /* shared curve folding, applied to the "Vector" input */
+}
+
 void VectorCurvesNode::compile(SVMCompiler& compiler)
 {
 	CurvesNode::compile(compiler, NODE_VECTOR_CURVES, input("Vector"), output("Vector"));
@@ -4982,6 +5005,31 @@ RGBRampNode::RGBRampNode()
 {
 }
 
+/* Fold the ramp to a constant color or alpha when every input is constant. */
+void RGBRampNode::constant_fold(const ConstantFolder& folder)
+{
+	if(ramp.size() == 0 || ramp.size() != ramp_alpha.size())
+		return;
+	if(!folder.all_inputs_constant())
+		return;
+
+	float f = clamp(fac, 0.0f, 1.0f) * (ramp.size() - 1);
+
+	/* clamp int as well in case of NaN */
+	int i = clamp((int)f, 0, ramp.size()-1);
+	float t = f - (float)i;
+
+	/* only interpolate when fac falls between two table entries */
+	bool use_lerp = interpolate && t > 0.0f;
+
+	if(folder.output == output("Color")) {
+		folder.make_constant(rgb_ramp_lookup(ramp.data(), fac, use_lerp, false, ramp.size()));
+	}
+	else if(folder.output == output("Alpha")) {
+		folder.make_constant(float_ramp_lookup(ramp_alpha.data(), fac, use_lerp, false, ramp_alpha.size()));
+	}
+}
+
 void RGBRampNode::compile(SVMCompiler& compiler)
 {
 	if(ramp.size() == 0 || ramp.size() != ramp_alpha.size())
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index caad11af0f8..b0eb2395adf 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -152,8 +152,8 @@ class OutputNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(OutputNode)
 
-	void* surface;
-	void* volume;
+	void *surface;
+	void *volume;
 	float displacement;
 	float3 normal;
 
@@ -350,6 +350,7 @@ public:
 	float roughness, anisotropy, rotation;
 	ClosureType distribution;
 
+	ClosureType get_closure_type() { return distribution; }
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
 };
 
@@ -385,6 +386,7 @@ public:
 
 	void simplify_settings(Scene *scene);
 	bool has_integrator_dependency();
+	ClosureType get_closure_type() { return distribution; }
 
 	float roughness;
 	ClosureType distribution, distribution_orig;
@@ -396,6 +398,7 @@ public:
 
 	void simplify_settings(Scene *scene);
 	bool has_integrator_dependency();
+	ClosureType get_closure_type() { return distribution; }
 
 	float roughness, IOR;
 	ClosureType distribution, distribution_orig;
@@ -407,6 +410,7 @@ public:
 
 	void simplify_settings(Scene *scene);
 	bool has_integrator_dependency();
+	ClosureType get_closure_type() { return distribution; }
 
 	float roughness, IOR;
 	ClosureType distribution, distribution_orig;
@@ -425,6 +429,7 @@ public:
 	SHADER_NODE_CLASS(SubsurfaceScatteringNode)
 	bool has_surface_bssrdf() { return true; }
 	bool has_bssrdf_bump();
+	ClosureType get_closure_type() { return falloff; }
 
 	float scale;
 	float3 radius;
@@ -519,6 +524,7 @@ public:
 class HairBsdfNode : public BsdfNode {
 public:
 	SHADER_NODE_CLASS(HairBsdfNode)
+	ClosureType get_closure_type() { return component; }
 
 	ClosureType component;
 	float offset;
@@ -883,28 +889,32 @@ public:
 
 	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 
-	bool has_spatial_varying() { return true; }
-	void compile(SVMCompiler& compiler, int type, ShaderInput *value_in, ShaderOutput *value_out);
-	void compile(OSLCompiler& compiler, const char *name);
-
 	array<float3> curves;
 	float min_x, max_x, fac;
 	float3 value;
+
+protected:
+	void constant_fold(const ConstantFolder& folder, ShaderInput *value_in);
+	void compile(SVMCompiler& compiler, int type, ShaderInput *value_in, ShaderOutput *value_out);
+	void compile(OSLCompiler& compiler, const char *name);
 };
 
 class RGBCurvesNode : public CurvesNode {
 public:
 	SHADER_NODE_CLASS(RGBCurvesNode)
+	void constant_fold(const ConstantFolder& folder);
 };
 
 class VectorCurvesNode : public CurvesNode {
 public:
 	SHADER_NODE_CLASS(VectorCurvesNode)
+	void constant_fold(const ConstantFolder& folder);
 };
 
 class RGBRampNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(RGBRampNode)
+	void constant_fold(const ConstantFolder& folder);
 	virtual int get_group() { return NODE_GROUP_LEVEL_1; }
 
 	array<float3> ramp;
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index 662d87e8b6b..62076f3a865 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -29,6 +29,8 @@
 #include "util_progress.h"
 #include "util_vector.h"
 
+#include "subd_patch_table.h"
+
 CCL_NAMESPACE_BEGIN
 
 /* Object */
@@ -55,9 +57,9 @@ Object::Object()
 	particle_system = NULL;
 	particle_index = 0;
 	bounds = BoundBox::empty;
-	motion.pre = transform_identity();
-	motion.mid = transform_identity();
-	motion.post = transform_identity();
+	motion.pre = transform_empty();
+	motion.mid = transform_empty();
+	motion.post = transform_empty();
 	use_motion = false;
 }
 
@@ -70,19 +72,28 @@ void Object::compute_bounds(bool motion_blur)
 	BoundBox mbounds = mesh->bounds;
 
 	if(motion_blur && use_motion) {
-		DecompMotionTransform decomp;
-		transform_motion_decompose(&decomp, &motion, &tfm);
+		if(motion.pre == transform_empty() ||
+		   motion.post == transform_empty()) {
+			/* Hide objects that have no valid previous or next transform, for
+			 * example particles that stop existing. TODO: add support for this
+			 * case in the kernel so we don't get render artifacts. */
+			bounds = BoundBox::empty;
+		}
+		else {
+			DecompMotionTransform decomp;
+			transform_motion_decompose(&decomp, &motion, &tfm);
 
-		bounds = BoundBox::empty;
+			bounds = BoundBox::empty;
 
-		/* todo: this is really terrible. according to pbrt there is a better
-		 * way to find this iteratively, but did not find implementation yet
-		 * or try to implement myself */
-		for(float t = 0.0f; t < 1.0f; t += (1.0f/128.0f)) {
-			Transform ttfm;
+			/* todo: this is really terrible. according to pbrt there is a better
+			 * way to find this iteratively, but did not find implementation yet
+			 * or try to implement myself */
+			for(float t = 0.0f; t < 1.0f; t += (1.0f/128.0f)) {
+				Transform ttfm;
 
-			transform_motion_interpolate(&ttfm, &decomp, t);
-			bounds.grow(mbounds.transformed(&ttfm));
+				transform_motion_interpolate(&ttfm, &decomp, t);
+				bounds.grow(mbounds.transformed(&ttfm));
+			}
 		}
 	}
 	else {
@@ -228,7 +239,7 @@ vector<float> Object::motion_times()
 bool Object::is_traceable()
 {
 	/* Mesh itself can be empty,can skip all such objects. */
-	if (bounds.size() == make_float3(0.0f, 0.0f, 0.0f)) {
+	if (!bounds.valid() || bounds.size() == make_float3(0.0f, 0.0f, 0.0f)) {
 		return false;
 	}
 	/* TODO(sergey): Check for mesh vertices/curves. visibility flags. */
@@ -337,6 +348,15 @@ void ObjectManager::device_update_object_transform(UpdateObejctTransformState *s
 		Transform mtfm_pre = ob->motion.pre;
 		Transform mtfm_post = ob->motion.post;
 
+		/* In case of missing motion information for previous/next frame,
+		 * assume there is no motion. */
+		if(!ob->use_motion || mtfm_pre == transform_empty()) {
+			mtfm_pre = ob->tfm;
+		}
+		if(!ob->use_motion || mtfm_post == transform_empty()) {
+			mtfm_post = ob->tfm;
+		}
+
 		if(!mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) {
 			mtfm_pre = mtfm_pre * itfm;
 			mtfm_post = mtfm_post * itfm;
@@ -589,6 +609,40 @@ void ObjectManager::device_update_flags(Device *device,
 	device->tex_alloc("__object_flag", dscene->object_flag);
 }
 
+/* Store each mesh's patch map offset in its object's packed data, re-uploading on change. */
+void ObjectManager::device_update_patch_map_offsets(Device *device, DeviceScene *dscene, Scene *scene)
+{
+	if (scene->objects.size() == 0)
+		return;
+
+	uint4* packed = (uint4*)dscene->objects.get_data();
+	bool changed = false;
+	int index = 0;
+
+	foreach(Object *object, scene->objects) {
+		Mesh* mesh = object->mesh;
+		int slot = index*OBJECT_SIZE + 11;
+		index++;
+
+		/* objects whose mesh has no patch table keep their current value */
+		if(!mesh->patch_table)
+			continue;
+
+		uint patch_map_offset = 2*(mesh->patch_table_offset + mesh->patch_table->total_size() -
+		                           mesh->patch_table->num_nodes * PATCH_NODE_SIZE) - mesh->patch_offset;
+		if(packed[slot].x != patch_map_offset) {
+			packed[slot].x = patch_map_offset;
+			changed = true;
+		}
+	}
+
+	/* re-upload the object array only when some offset changed */
+	if(changed) {
+		device->tex_free(dscene->objects);
+		device->tex_alloc("__objects", dscene->objects);
+	}
+}
+
 void ObjectManager::device_free(Device *device, DeviceScene *dscene)
 {
 	device->tex_free(dscene->objects);
@@ -638,7 +692,7 @@ void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, u
 		 * Could be solved by moving reference counter to Mesh.
 		 */
 		if((mesh_users[object->mesh] == 1 && !object->mesh->has_surface_bssrdf) &&
-		   object->mesh->displacement_method == Mesh::DISPLACE_BUMP)
+		   !object->mesh->has_true_displacement())
 		{
 			if(!(motion_blur && object->use_motion)) {
 				if(!object->mesh->transform_applied) {
diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h
index 7ab73f3c91a..2e5837f672f 100644
--- a/intern/cycles/render/object.h
+++ b/intern/cycles/render/object.h
@@ -97,6 +97,8 @@ public:
 	                         Scene *scene,
 	                         Progress& progress,
 	                         bool bounds_valid = true);
+	void device_update_patch_map_offsets(Device *device, DeviceScene *dscene, Scene *scene);
+
 	void device_free(Device *device, DeviceScene *dscene);
 
 	void tag_update(Scene *scene);
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index 676afad997e..1a6ae5f9277 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -549,7 +549,7 @@ string OSLCompiler::id(ShaderNode *node)
 {
 	/* assign layer unique name based on pointer address + bump mode */
 	stringstream stream;
-	stream << "node_" << node->name << "_" << node;
+	stream << "node_" << node->type->name << "_" << node;
 
 	return stream.str();
 }
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index 05e807ff60c..9e72f197cce 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -74,10 +74,14 @@ public:
 	device_vector<uint> tri_shader;
 	device_vector<float4> tri_vnormal;
 	device_vector<uint4> tri_vindex;
+	device_vector<uint> tri_patch;
+	device_vector<float2> tri_patch_uv;
 
 	device_vector<float4> curves;
 	device_vector<float4> curve_keys;
 
+	device_vector<uint> patches;
+
 	/* objects */
 	device_vector<float4> objects;
 	device_vector<float4> objects_vector;
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index 4cdb878df45..d000cca5a45 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -150,6 +150,12 @@ NODE_DEFINE(Shader)
 	volume_interpolation_method_enum.insert("cubic", VOLUME_INTERPOLATION_CUBIC);
 	SOCKET_ENUM(volume_interpolation_method, "Volume Interpolation Method", volume_interpolation_method_enum, VOLUME_INTERPOLATION_LINEAR);
 
+	static NodeEnum displacement_method_enum;
+	displacement_method_enum.insert("bump", DISPLACE_BUMP);
+	displacement_method_enum.insert("true", DISPLACE_TRUE);
+	displacement_method_enum.insert("both", DISPLACE_BOTH);
+	SOCKET_ENUM(displacement_method, "Displacement Method", displacement_method_enum, DISPLACE_BUMP);
+
 	return type;
 }
 
@@ -173,6 +179,8 @@ Shader::Shader()
 	has_object_dependency = false;
 	has_integrator_dependency = false;
 
+	displacement_method = DISPLACE_BUMP;
+
 	id = -1;
 	used = false;
 
@@ -310,7 +318,7 @@ int ShaderManager::get_shader_id(Shader *shader, Mesh *mesh, bool smooth)
 	int id = shader->id*2;
 	
 	/* index depends bump since this setting is not in the shader */
-	if(mesh && mesh->displacement_method != Mesh::DISPLACE_TRUE)
+	if(mesh && shader->displacement_method != DISPLACE_TRUE)
 		id += 1;
 	/* smooth flag */
 	if(smooth)
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index dc57ed4e4eb..060ad7056bc 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -66,6 +66,14 @@ enum VolumeInterpolation {
 	VOLUME_NUM_INTERPOLATION,
 };
 
+enum DisplacementMethod {
+	DISPLACE_BUMP = 0, /* registered as "bump"; the value Shader() starts with */
+	DISPLACE_TRUE = 1, /* registered as "true" */
+	DISPLACE_BOTH = 2, /* registered as "both" */
+
+	DISPLACE_NUM_METHODS, /* number of methods above, not a method itself */
+};
+
 /* Shader describing the appearance of a Mesh, Light or Background.
  *
  * While there is only a single shader graph, it has three outputs: surface,
@@ -110,6 +118,9 @@ public:
 	bool has_object_dependency;
 	bool has_integrator_dependency;
 
+	/* displacement */
+	DisplacementMethod displacement_method;
+
 	/* requested mesh attributes */
 	AttributeRequestSet attributes;
 
diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp
index f0e7ee2bd49..1a166885e2b 100644
--- a/intern/cycles/render/svm.cpp
+++ b/intern/cycles/render/svm.cpp
@@ -65,20 +65,21 @@ void SVMShaderManager::device_update(Device *device, DeviceScene *dscene, Scene
 		svm_nodes.push_back(make_int4(NODE_SHADER_JUMP, 0, 0, 0));
 		svm_nodes.push_back(make_int4(NODE_SHADER_JUMP, 0, 0, 0));
 	}
-	
+
 	foreach(Shader *shader, scene->shaders) {
 		if(progress.get_cancel()) return;
 
 		assert(shader->graph);
 
-		if(shader->use_mis && shader->has_surface_emission)
-			scene->light_manager->need_update = true;
-
 		SVMCompiler::Summary summary;
 		SVMCompiler compiler(scene->shader_manager, scene->image_manager);
 		compiler.background = (shader == scene->default_background);
 		compiler.compile(scene, shader, svm_nodes, shader->id, &summary);
 
+		if(shader->use_mis && shader->has_surface_emission) {
+			scene->light_manager->need_update = true;
+		}
+
 		VLOG(2) << "Compilation summary:\n"
 		        << "Shader name: " << shader->name << "\n"
 		        << summary.full_report();
diff --git a/intern/cycles/subd/CMakeLists.txt b/intern/cycles/subd/CMakeLists.txt
index d1708868fd0..9265299e82b 100644
--- a/intern/cycles/subd/CMakeLists.txt
+++ b/intern/cycles/subd/CMakeLists.txt
@@ -14,22 +14,17 @@ set(INC_SYS
 
 set(SRC
 	subd_dice.cpp
-	subd_mesh.cpp
 	subd_patch.cpp
 	subd_split.cpp
+	subd_patch_table.cpp
 )
 
 set(SRC_HEADERS
 	subd_dice.h
-	subd_mesh.h
 	subd_patch.h
 	subd_split.h
 )
 
-if(WITH_CYCLES_OPENSUBDIV)
-	add_definitions(-DWITH_OPENSUBDIV)
-endif()
-
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
diff --git a/intern/cycles/subd/subd_dice.cpp b/intern/cycles/subd/subd_dice.cpp
index 7c74f21950e..36981a20f3c 100644
--- a/intern/cycles/subd/subd_dice.cpp
+++ b/intern/cycles/subd/subd_dice.cpp
@@ -48,6 +48,11 @@ void EdgeDice::reserve(int num_verts)
 	vert_offset = mesh->verts.size();
 	tri_offset = mesh->num_triangles();
 
+	/* todo: optimize so we can reserve in advance, this is like push_back_slow() */
+	if(vert_offset + num_verts > mesh->verts.capacity()) {
+		mesh->reserve_mesh(size_t((vert_offset + num_verts) * 1.2), mesh->num_triangles());
+	}
+
 	mesh->resize_mesh(vert_offset + num_verts, tri_offset);
 
 	Attribute *attr_vN = mesh->attributes.add(ATTR_STD_VERTEX_NORMAL);
@@ -66,6 +71,7 @@ int EdgeDice::add_vert(Patch *patch, float2 uv)
 
 	mesh_P[vert_offset] = P;
 	mesh_N[vert_offset] = N;
+	params.mesh->vert_patch_uv[vert_offset] = make_float2(uv.x, uv.y);
 
 	if(params.ptex) {
 		Attribute *attr_ptex_uv = params.mesh->attributes.add(ATTR_STD_PTEX_UV);
@@ -75,6 +81,8 @@ int EdgeDice::add_vert(Patch *patch, float2 uv)
 		ptex_uv[vert_offset] = make_float3(uv.x, uv.y, 0.0f);
 	}
 
+	params.mesh->num_subd_verts++;
+
 	return vert_offset++;
 }
 
@@ -86,7 +94,8 @@ void EdgeDice::add_triangle(Patch *patch, int v0, int v1, int v2)
 	if(mesh->triangles.size() == mesh->triangles.capacity())
 		mesh->reserve_mesh(mesh->verts.size(), size_t(max(mesh->num_triangles() + 1, 1) * 1.2));
 
-	mesh->add_triangle(v0, v1, v2, params.shader, params.smooth, false);
+	mesh->add_triangle(v0, v1, v2, patch->shader, true);
+	params.mesh->triangle_patch[params.mesh->num_triangles()-1] = patch->patch_index;
 
 	if(params.ptex) {
 		Attribute *attr_ptex_face_id = params.mesh->attributes.add(ATTR_STD_PTEX_FACE_ID);
@@ -340,160 +349,5 @@ void QuadDice::dice(SubPatch& sub, EdgeFactors& ef)
 	assert(vert_offset == params.mesh->verts.size());
 }
 
-/* TriangleDice */
-
-TriangleDice::TriangleDice(const SubdParams& params_)
-: EdgeDice(params_)
-{
-}
-
-void TriangleDice::reserve(EdgeFactors& ef, int M)
-{
-	int num_verts = ef.tu + ef.tv + ef.tw;
-
-	for(int m = M-2; m > 0; m -= 2)
-		num_verts += 3 + (m-1)*3;
-	
-	if(!(M & 1))
-		num_verts++;
-	
-	EdgeDice::reserve(num_verts);
-}
-
-float2 TriangleDice::map_uv(SubPatch& sub, float2 uv)
-{
-	/* map UV from subpatch to patch parametric coordinates */
-	return uv.x*sub.Pu + uv.y*sub.Pv + (1.0f - uv.x - uv.y)*sub.Pw;
-}
-
-int TriangleDice::add_vert(SubPatch& sub, float2 uv)
-{
-	return EdgeDice::add_vert(sub.patch, map_uv(sub, uv));
-}
-
-void TriangleDice::add_grid(SubPatch& sub, EdgeFactors& ef, int M)
-{
-	// XXX normals are flipped, why?
-
-	/* grid is constructed starting from the outside edges, and adding
-	 * progressively smaller inner triangles that connected to the outer
-	 * one, until M = 1 or 2, the we fill up the last part. */
-	vector<int> outer_u, outer_v, outer_w;
-	int m;
-
-	/* add outer corners vertices */
-	{
-		float2 p_u = make_float2(1.0f, 0.0f);
-		float2 p_v = make_float2(0.0f, 1.0f);
-		float2 p_w = make_float2(0.0f, 0.0f);
-
-		int corner_u = add_vert(sub, p_u);
-		int corner_v = add_vert(sub, p_v);
-		int corner_w = add_vert(sub, p_w);
-
-		outer_u.push_back(corner_v);
-		outer_v.push_back(corner_w);
-		outer_w.push_back(corner_u);
-
-		for(int i = 1; i < ef.tu; i++)
-			outer_u.push_back(add_vert(sub, interp(p_v, p_w, i/(float)ef.tu)));
-		for(int i = 1; i < ef.tv; i++)
-			outer_v.push_back(add_vert(sub, interp(p_w, p_u, i/(float)ef.tv)));
-		for(int i = 1; i < ef.tw; i++)
-			outer_w.push_back(add_vert(sub, interp(p_u, p_v, i/(float)ef.tw)));
-
-		outer_u.push_back(corner_w);
-		outer_v.push_back(corner_u);
-		outer_w.push_back(corner_v);
-	}
-
-	for(m = M-2; m > 0; m -= 2) {
-		vector<int> inner_u, inner_v, inner_w;
-
-		const float t0 = m / (float)M;
-		float2 center = make_float2(1.0f/3.0f, 1.0f/3.0f);
-
-		/* 3 corner vertices */
-		float2 p_u = interp(center, make_float2(1.0f, 0.0f), t0);
-		float2 p_v = interp(center, make_float2(0.0f, 1.0f), t0);
-		float2 p_w = interp(center, make_float2(0.0f, 0.0f), t0);
-
-		int corner_u = add_vert(sub, p_u);
-		int corner_v = add_vert(sub, p_v);
-		int corner_w = add_vert(sub, p_w);
-
-		/* construct array of vertex indices for each side */
-		inner_u.push_back(corner_v);
-		inner_v.push_back(corner_w);
-		inner_w.push_back(corner_u);
-
-		for(int i = 1; i < m; i++) {
-			/* add vertices between corners */
-			const float t1 = i / (float)m;
-
-			inner_u.push_back(add_vert(sub, interp(p_v, p_w, t1)));
-			inner_v.push_back(add_vert(sub, interp(p_w, p_u, t1)));
-			inner_w.push_back(add_vert(sub, interp(p_u, p_v, t1)));
-		}
-
-		inner_u.push_back(corner_w);
-		inner_v.push_back(corner_u);
-		inner_w.push_back(corner_v);
-
-		/* stitch together inner/outer with triangles */
-		stitch_triangles(sub.patch, outer_u, inner_u);
-		stitch_triangles(sub.patch, outer_v, inner_v);
-		stitch_triangles(sub.patch, outer_w, inner_w);
-
-		outer_u = inner_u;
-		outer_v = inner_v;
-		outer_w = inner_w;
-	}
-
-	/* fill up last part */
-	if(m == -1) {
-		/* single triangle */
-		add_triangle(sub.patch, outer_w[0], outer_u[0], outer_v[0]);
-	}
-	else {
-		/* center vertex + up to 6 triangles */
-		int center = add_vert(sub, make_float2(1.0f/3.0f, 1.0f/3.0f));
-
-		add_triangle(sub.patch, outer_w[0], outer_w[1], center);
-		/* if this is false then there is only one triangle on this side */
-		if(outer_w.size() > 2)
-			add_triangle(sub.patch, outer_w[1], outer_w[2], center);
-
-		add_triangle(sub.patch, outer_u[0], outer_u[1], center);
-		if(outer_u.size() > 2)
-			add_triangle(sub.patch, outer_u[1], outer_u[2], center);
-
-		add_triangle(sub.patch, outer_v[0], outer_v[1], center);
-		if(outer_v.size() > 2)
-			add_triangle(sub.patch, outer_v[1], outer_v[2], center);
-	}
-}
-
-void TriangleDice::dice(SubPatch& sub, EdgeFactors& ef)
-{
-	/* todo: handle 2 1 1 resolution */
-	int M = max(ef.tu, max(ef.tv, ef.tw));
-
-	/* Due to the "slant" of the edges of a triangle compared to a quad, the internal
-	 * triangles end up smaller, causing over-tessellation. This is to correct for this
-	 * difference in area. Technically its only correct for equilateral triangles, but
-	 * its better than how it was.
-	 *
-	 * (2*cos(radians(30))/3)**0.5
-	 */
-	float S = 0.7598356856515927f;
-	M = max((int)ceil(S*M), 1);
-
-	reserve(ef, M);
-	add_grid(sub, ef, M);
-
-	assert(vert_offset == params.mesh->verts.size());
-}
-
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/subd/subd_dice.h b/intern/cycles/subd/subd_dice.h
index 85bd0ea28f0..3002ec780e8 100644
--- a/intern/cycles/subd/subd_dice.h
+++ b/intern/cycles/subd/subd_dice.h
@@ -33,8 +33,6 @@ class Patch;
 
 struct SubdParams {
 	Mesh *mesh;
-	int shader;
-	bool smooth;
 	bool ptex;
 
 	int test_steps;
@@ -44,11 +42,9 @@ struct SubdParams {
 	Camera *camera;
 	Transform objecttoworld;
 
-	SubdParams(Mesh *mesh_, int shader_, bool smooth_ = true, bool ptex_ = false)
+	SubdParams(Mesh *mesh_, bool ptex_ = false)
 	{
 		mesh = mesh_;
-		shader = shader_;
-		smooth = smooth_;
 		ptex = ptex_;
 
 		test_steps = 3;
@@ -136,46 +132,6 @@ public:
 	void dice(SubPatch& sub, EdgeFactors& ef);
 };
 
-/* Triangle EdgeDice
- *
- * Edge tessellation factors and subpatch coordinates are as follows:
- *
- *        Pw
- *        /\
- *    tv /  \ tu
- *      /    \
- *     /      \
- *  Pu -------- Pv
- *        tw     
- */
-
-class TriangleDice : public EdgeDice {
-public:
-	struct SubPatch {
-		Patch *patch;
-
-		float2 Pu;
-		float2 Pv;
-		float2 Pw;
-	};
-
-	struct EdgeFactors {
-		int tu;
-		int tv;
-		int tw;
-	};
-
-	explicit TriangleDice(const SubdParams& params);
-
-	void reserve(EdgeFactors& ef, int M);
-
-	float2 map_uv(SubPatch& sub, float2 uv);
-	int add_vert(SubPatch& sub, float2 uv);
-
-	void add_grid(SubPatch& sub, EdgeFactors& ef, int M);
-	void dice(SubPatch& sub, EdgeFactors& ef);
-};
-
 CCL_NAMESPACE_END
 
 #endif /* __SUBD_DICE_H__ */
diff --git a/intern/cycles/subd/subd_mesh.cpp b/intern/cycles/subd/subd_mesh.cpp
deleted file mode 100644
index 56d7d2b2303..00000000000
--- a/intern/cycles/subd/subd_mesh.cpp
+++ /dev/null
@@ -1,419 +0,0 @@
-/*
- * Original code in the public domain -- castanyo@yahoo.es
- * 
- * Modifications copyright (c) 2011, Blender Foundation.
- * All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- * * Redistributions of source code must retain the above copyright
- *   notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- *   notice, this list of conditions and the following disclaimer in the
- *   documentation and/or other materials provided with the distribution.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <stdio.h>
-
-#include "subd_mesh.h"
-#include "subd_patch.h"
-#include "subd_split.h"
-
-#include "util_debug.h"
-#include "util_foreach.h"
-
-#ifdef WITH_OPENSUBDIV
-
-#include <osd/vertex.h>
-#include <osd/mesh.h>
-#include <osd/cpuComputeController.h>
-#include <osd/cpuVertexBuffer.h>
-#include <osd/cpuEvalLimitController.h>
-#include <osd/evalLimitContext.h>
-
-CCL_NAMESPACE_BEGIN
-
-/* typedefs */
-typedef OpenSubdiv::OsdVertex OsdVertex;
-typedef OpenSubdiv::FarMesh<OsdVertex> OsdFarMesh;
-typedef OpenSubdiv::FarMeshFactory<OsdVertex> OsdFarMeshFactory;
-typedef OpenSubdiv::HbrCatmarkSubdivision<OsdVertex> OsdHbrCatmarkSubdivision;
-typedef OpenSubdiv::HbrFace<OsdVertex> OsdHbrFace;
-typedef OpenSubdiv::HbrHalfedge<OsdVertex> OsdHbrHalfEdge;
-typedef OpenSubdiv::HbrMesh<OsdVertex> OsdHbrMesh;
-typedef OpenSubdiv::HbrVertex<OsdVertex> OsdHbrVertex;
-typedef OpenSubdiv::OsdCpuComputeContext OsdCpuComputeContext;
-typedef OpenSubdiv::OsdCpuComputeController OsdCpuComputeController;
-typedef OpenSubdiv::OsdCpuEvalLimitContext OsdCpuEvalLimitContext;
-typedef OpenSubdiv::OsdCpuEvalLimitController OsdCpuEvalLimitController;
-typedef OpenSubdiv::OsdCpuVertexBuffer OsdCpuVertexBuffer;
-typedef OpenSubdiv::OsdEvalCoords OsdEvalCoords;
-typedef OpenSubdiv::OsdVertexBufferDescriptor OsdVertexBufferDescriptor;
-
-/* OpenSubdiv Patch */
-
-class OpenSubdPatch : public Patch {
-public:
-	int face_id;
-
-	OpenSubdPatch(OsdFarMesh *farmesh, OsdCpuVertexBuffer *vbuf_base)
-	{
-		face_id = 0;
-
-		/* create buffers for evaluation */
-		vbuf_P = OsdCpuVertexBuffer::Create(3, 1);
-		vbuf_dPdu = OsdCpuVertexBuffer::Create(3, 1);
-		vbuf_dPdv = OsdCpuVertexBuffer::Create(3, 1);
-
-		P = vbuf_P->BindCpuBuffer();
-		dPdu = vbuf_dPdu->BindCpuBuffer();
-		dPdv = vbuf_dPdv->BindCpuBuffer();
-
-		/* setup evaluation context */
-		OsdVertexBufferDescriptor in_desc(0, 3, 3), out_desc(0, 3, 3); /* offset, length, stride */
-
-		evalctx = OsdCpuEvalLimitContext::Create(farmesh, false);
-		evalctx->GetVertexData().Bind(in_desc, vbuf_base, out_desc, vbuf_P, vbuf_dPdu, vbuf_dPdv);
-	}
-
-	~OpenSubdPatch()
-	{
-		evalctx->GetVertexData().Unbind();
-
-		delete evalctx;
-		delete vbuf_P;
-		delete vbuf_dPdu;
-		delete vbuf_dPdv;
-	}
-
-	void eval(float3 *P_, float3 *dPdu_, float3 *dPdv_, float u, float v)
-	{
-		OsdEvalCoords coords;
-		coords.u = u;
-		coords.v = v;
-		coords.face = face_id;
-
-		evalctrl.EvalLimitSample<OsdCpuVertexBuffer,OsdCpuVertexBuffer>(coords, evalctx, 0);
-
-		*P_ = make_float3(P[0], P[1], P[2]);
-		if(dPdu_) *dPdu_ = make_float3(dPdv[0], dPdv[1], dPdv[2]);
-		if(dPdv_) *dPdv_ = make_float3(dPdu[0], dPdu[1], dPdu[2]);
-
-		/* optimize: skip evaluating derivatives when not needed */
-		/* todo: swapped derivatives, different winding convention? */
-	}
-
-	BoundBox bound()
-	{
-		/* not implemented */
-		BoundBox bbox = BoundBox::empty;
-		return bbox;
-	}
-
-	int ptex_face_id()
-	{
-		return face_id;
-	}
-
-protected:
-	OsdCpuEvalLimitController evalctrl;
-	OsdCpuEvalLimitContext *evalctx;
-	OsdCpuVertexBuffer *vbuf_P;
-	OsdCpuVertexBuffer *vbuf_dPdu;
-	OsdCpuVertexBuffer *vbuf_dPdv;
-	float *P;
-	float *dPdu;
-	float *dPdv;
-};
-
-/* OpenSubdiv Mesh */
-
-OpenSubdMesh::OpenSubdMesh()
-{
-	/* create osd mesh */
-	static OsdHbrCatmarkSubdivision	catmark;
-	OsdHbrMesh *hbrmesh = new OsdHbrMesh(&catmark);
-
-	/* initialize class */
-	num_verts = 0;
-	num_ptex_faces = 0;
-	_hbrmesh = (void*)hbrmesh;
-}
-
-OpenSubdMesh::~OpenSubdMesh()
-{
-	OsdHbrMesh *hbrmesh = (OsdHbrMesh*)_hbrmesh;
-
-	if(hbrmesh)
-		delete hbrmesh;
-}
-
-void OpenSubdMesh::add_vert(const float3& co)
-{
-	OsdHbrMesh *hbrmesh = (OsdHbrMesh*)_hbrmesh;
-
-	OsdVertex v;
-	positions.push_back(co.x);
-	positions.push_back(co.y);
-	positions.push_back(co.z);
-	hbrmesh->NewVertex(num_verts++, v);
-}
-
-void OpenSubdMesh::add_face(int v0, int v1, int v2)
-{
-	int index[3] = {v0, v1, v2};
-	return add_face(index, 3);
-}
-
-void OpenSubdMesh::add_face(int v0, int v1, int v2, int v3)
-{
-	int index[4] = {v0, v1, v2, v3};
-	add_face(index, 4);
-}
-
-void OpenSubdMesh::add_face(int *index, int num)
-{
-	OsdHbrMesh *hbrmesh = (OsdHbrMesh*)_hbrmesh;
-
-#ifndef NDEBUG
-	/* sanity checks */
-	for(int j = 0; j < num; j++) {
-		OsdHbrVertex *origin = hbrmesh->GetVertex(index[j]);
-		OsdHbrVertex *destination = hbrmesh->GetVertex(index[(j+1)%num]);
-		OsdHbrHalfEdge *opposite = destination->GetEdge(origin);
-
-		if(origin==NULL || destination==NULL)
-			assert(!"An edge was specified that connected a nonexistent vertex\n");
-
-		if(origin == destination)
-			assert(!"An edge was specified that connected a vertex to itself\n");
-
-		if(opposite && opposite->GetOpposite())
-			assert(!"A non-manifold edge incident to more than 2 faces was found\n");
-
-		if(origin->GetEdge(destination)) {
-			assert(!"An edge connecting two vertices was specified more than once."
-		                "It's likely that an incident face was flipped\n");
-		}
-	}
-#endif
-
-	OsdHbrFace *face = hbrmesh->NewFace(num, index, 0);
-
-	/* this is required for limit eval patch table? */
-	face->SetPtexIndex(num_ptex_faces);
-
-	if(num == 4)
-		num_ptex_faces++;
-	else
-		num_ptex_faces += num;
-}
-
-bool OpenSubdMesh::finish()
-{
-	OsdHbrMesh *hbrmesh = (OsdHbrMesh*)_hbrmesh;
-
-	/* finish hbr mesh construction */
-	hbrmesh->SetInterpolateBoundaryMethod(OsdHbrMesh::k_InterpolateBoundaryEdgeOnly);
-	hbrmesh->Finish();
-
-	return true;
-}
-
-void OpenSubdMesh::tessellate(DiagSplit *split)
-{
-	if(num_ptex_faces == 0)
-		return;
-
-	const int level = 3;
-	const bool requirefvar = false;
-
-	/* convert HRB to FAR mesh */
-	OsdHbrMesh *hbrmesh = (OsdHbrMesh*)_hbrmesh;
-
-	OsdFarMeshFactory meshFactory(hbrmesh, level, true);
-	OsdFarMesh *farmesh = meshFactory.Create(requirefvar);
-	int num_hbr_verts = hbrmesh->GetNumVertices();
-
-	delete hbrmesh;
-	hbrmesh = NULL;
-	_hbrmesh = NULL;
-
-	/* refine HBR mesh with vertex coordinates */
-	OsdCpuComputeController *compute_controller = new OsdCpuComputeController();
-	OsdCpuComputeContext *compute_context = OsdCpuComputeContext::Create(farmesh);
-
-	OsdCpuVertexBuffer *vbuf_base = OsdCpuVertexBuffer::Create(3, num_hbr_verts);
-	vbuf_base->UpdateData(&positions[0], 0, num_verts);
-
-	compute_controller->Refine(compute_context, farmesh->GetKernelBatches(), vbuf_base);
-	compute_controller->Synchronize();
-
-	/* split & dice patches */
-	OpenSubdPatch patch(farmesh, vbuf_base);
-
-	for(int f = 0; f < num_ptex_faces; f++) {
-		patch.face_id = f;
-		split->split_quad(&patch);
-	}
-
-	/* clean up */
-	delete farmesh;
-	delete compute_controller;
-	delete compute_context;
-	delete vbuf_base;
-}
-
-CCL_NAMESPACE_END
-
-#else /* WITH_OPENSUBDIV */
-
-CCL_NAMESPACE_BEGIN
-
-/* Subd Vertex */
-
-class SubdVert
-{
-public:
-	int id;
-	float3 co;
-	
-	explicit SubdVert(int id_)
-	{
-		id = id_;
-		co = make_float3(0.0f, 0.0f, 0.0f);
-	}
-};
-
-/* Subd Face */
-
-class SubdFace
-{
-public:
-	int id;
-	int numverts;
-	int verts[4];
-
-	explicit SubdFace(int id_)
-	{
-		id = id_;
-		numverts = 0;
-	}
-};
-
-/* Subd Mesh */
-
-SubdMesh::SubdMesh()
-{
-}
-
-SubdMesh::~SubdMesh()
-{
-	foreach(SubdVert *vertex, verts)
-		delete vertex;
-	foreach(SubdFace *face, faces)
-		delete face;
-
-	verts.clear();
-	faces.clear();
-}
-
-SubdVert *SubdMesh::add_vert(const float3& co)
-{
-	SubdVert *v = new SubdVert(verts.size());
-	v->co = co;
-	verts.push_back(v);
-
-	return v;
-}
-
-SubdFace *SubdMesh::add_face(int v0, int v1, int v2)
-{
-	int index[3] = {v0, v1, v2};
-	return add_face(index, 3);
-}
-
-SubdFace *SubdMesh::add_face(int v0, int v1, int v2, int v3)
-{
-	int index[4] = {v0, v1, v2, v3};
-	return add_face(index, 4);
-}
-
-SubdFace *SubdMesh::add_face(int *index, int num)
-{
-	/* skip ngons */
-	if(num < 3 || num > 4)
-		return NULL;
-
-	SubdFace *f = new SubdFace(faces.size());
-
-	for(int i = 0; i < num; i++)
-		f->verts[i] = index[i];
-
-	f->numverts = num;
-	faces.push_back(f);
-
-	return f;
-}
-
-bool SubdMesh::finish()
-{
-	return true;
-}
-
-void SubdMesh::tessellate(DiagSplit *split)
-{
-	int num_faces = faces.size();
-		        
-	for(int f = 0; f < num_faces; f++) {
-		SubdFace *face = faces[f];
-		Patch *patch;
-		float3 *hull;
-
-		if(face->numverts == 3) {
-			LinearTrianglePatch *lpatch = new LinearTrianglePatch();
-			hull = lpatch->hull;
-			patch = lpatch;
-		}
-		else if(face->numverts == 4) {
-			LinearQuadPatch *lpatch = new LinearQuadPatch();
-			hull = lpatch->hull;
-			patch = lpatch;
-		}
-		else {
-			assert(0); /* n-gons should have been split already */
-			continue;
-		}
-
-		for(int i = 0; i < face->numverts; i++)
-			hull[i] = verts[face->verts[i]]->co;
-
-		if(face->numverts == 4)
-			swap(hull[2], hull[3]);
-
-		if(patch->is_triangle())
-			split->split_triangle(patch);
-		else
-			split->split_quad(patch);
-
-		delete patch;
-	}
-}
-
-CCL_NAMESPACE_END
-
-#endif /* WITH_OPENSUBDIV */
-
diff --git a/intern/cycles/subd/subd_mesh.h b/intern/cycles/subd/subd_mesh.h
deleted file mode 100644
index f6aefc20318..00000000000
--- a/intern/cycles/subd/subd_mesh.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Original code in the public domain -- castanyo@yahoo.es
- *
- * Modifications copyright (c) 2011, Blender Foundation.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- * * Redistributions of source code must retain the above copyright
- *   notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- *   notice, this list of conditions and the following disclaimer in the
- *   documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __SUBD_MESH_H__
-#define __SUBD_MESH_H__
-
-#include "util_map.h"
-#include "util_types.h"
-#include "util_vector.h"
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef WITH_OPENSUBDIV
-class SubdVert;
-class SubdFace;
-#endif
-
-class DiagSplit;
-class Mesh;
-
-/* Subd Mesh with simple linear subdivision */
-
-class SubdMesh
-{
-public:
-	SubdMesh();
-	~SubdMesh();
-
-	SubdVert *add_vert(const float3& co);
-	
-	SubdFace *add_face(int v0, int v1, int v2);
-	SubdFace *add_face(int v0, int v1, int v2, int v3);
-	SubdFace *add_face(int *index, int num);
-
-	bool finish();
-	void tessellate(DiagSplit *split);
-
-protected:
-#ifdef WITH_OPENSUBDIV
-	void *_hbrmesh;
-	vector<float> positions;
-	int num_verts, num_ptex_faces;
-#else
-	vector<SubdVert*> verts;
-	vector<SubdFace*> faces;
-#endif
-
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __SUBD_MESH_H__ */
-
diff --git a/intern/cycles/subd/subd_patch.cpp b/intern/cycles/subd/subd_patch.cpp
index 60a78016054..d3319c5ccf5 100644
--- a/intern/cycles/subd/subd_patch.cpp
+++ b/intern/cycles/subd/subd_patch.cpp
@@ -84,32 +84,6 @@ BoundBox LinearQuadPatch::bound()
 	return bbox;
 }
 
-/* Linear Triangle Patch */
-
-void LinearTrianglePatch::eval(float3 *P, float3 *dPdu, float3 *dPdv, float3 *N, float u, float v)
-{
-	*P = u*hull[0] + v*hull[1] + (1.0f - u - v)*hull[2];
-
-	if(dPdu && dPdv) {
-		*dPdu = hull[0] - hull[2];
-		*dPdv = hull[1] - hull[2];
-	}
-
-	if(N) {
-		*N = normalize(u*normals[0] + v*normals[1] + (1.0f - u - v)*normals[2]);
-	}
-}
-
-BoundBox LinearTrianglePatch::bound()
-{
-	BoundBox bbox = BoundBox::empty;
-
-	for(int i = 0; i < 3; i++)
-		bbox.grow(hull[i]);
-	
-	return bbox;
-}
-
 /* Bicubic Patch */
 
 void BicubicPatch::eval(float3 *P, float3 *dPdu, float3 *dPdv, float3 *N, float u, float v)
diff --git a/intern/cycles/subd/subd_patch.h b/intern/cycles/subd/subd_patch.h
index bfa04412c66..360c1abf27b 100644
--- a/intern/cycles/subd/subd_patch.h
+++ b/intern/cycles/subd/subd_patch.h
@@ -26,9 +26,11 @@ class Patch {
 public:
 	virtual ~Patch() {}
 	virtual void eval(float3 *P, float3 *dPdu, float3 *dPdv, float3 *N, float u, float v) = 0;
-	virtual bool is_triangle() { return false; }
 	virtual BoundBox bound() = 0;
 	virtual int ptex_face_id() { return -1; }
+
+	int patch_index;
+	int shader;
 };
 
 /* Linear Quad Patch */
@@ -39,19 +41,6 @@ public:
 	float3 normals[4];
 
 	void eval(float3 *P, float3 *dPdu, float3 *dPdv, float3 *N, float u, float v);
-	bool is_triangle() { return false; }
-	BoundBox bound();
-};
-
-/* Linear Triangle Patch */
-
-class LinearTrianglePatch : public Patch {
-public:
-	float3 hull[3];
-	float3 normals[3];
-
-	void eval(float3 *P, float3 *dPdu, float3 *dPdv, float3 *N, float u, float v);
-	bool is_triangle() { return true; }
 	BoundBox bound();
 };
 
@@ -62,7 +51,6 @@ public:
 	float3 hull[16];
 
 	void eval(float3 *P, float3 *dPdu, float3 *dPdv, float3 *N, float u, float v);
-	bool is_triangle() { return false; }
 	BoundBox bound();
 };
 
diff --git a/intern/cycles/subd/subd_patch_table.cpp b/intern/cycles/subd/subd_patch_table.cpp
new file mode 100644
index 00000000000..68ec1b2c6a6
--- /dev/null
+++ b/intern/cycles/subd/subd_patch_table.cpp
@@ -0,0 +1,297 @@
+/*
+ * Based on code from OpenSubdiv released under this license:
+ *
+ * Copyright 2014 DreamWorks Animation LLC.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "Apache License")
+ * with the following modification; you may not use this file except in
+ * compliance with the Apache License and the following modification to it:
+ * Section 6. Trademarks. is deleted and replaced with:
+ *
+ * 6. Trademarks. This License does not grant permission to use the trade
+ *   names, trademarks, service marks, or product names of the Licensor
+ *   and its affiliates, except as required to comply with Section 4(c) of
+ *   the License and to reproduce the content of the NOTICE file.
+ *
+ * You may obtain a copy of the Apache License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Apache License with the above modification is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the Apache License for the specific
+ * language governing permissions and limitations under the Apache License.
+ *
+ */
+
+#include "subd_patch_table.h"
+#include "kernel_types.h"
+
+#include "util_math.h"
+
+#ifdef WITH_OPENSUBDIV
+#include <opensubdiv/far/patchTable.h>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef WITH_OPENSUBDIV
+
+using namespace OpenSubdiv;
+
+/* functions for building patch maps */
+
+struct PatchMapQuadNode {
+	/* points all four children at the patch of index, each flagged as a set leaf */
+	void set_child(int index)
+	{
+		for (int i = 0; i < 4; i++) {
+			children[i] = index | PATCH_MAP_NODE_IS_SET | PATCH_MAP_NODE_IS_LEAF;
+		}
+	}
+
+	/* points one quadrant at the patch (is_leaf) or child node (!is_leaf) of index, flagged as set */
+	void set_child(unsigned char quadrant, int index, bool is_leaf=true)
+	{
+		assert(quadrant < 4);
+		children[quadrant] = index | PATCH_MAP_NODE_IS_SET | (is_leaf ? PATCH_MAP_NODE_IS_LEAF : 0);
+	}
+	/* one packed word per quadrant: index OR'ed with the PATCH_MAP_NODE_IS_SET/IS_LEAF flags */
+	uint children[4];
+};
+
+template<class T>
+static int resolve_quadrant(T& median, T& u, T& v)
+{
+	int quadrant = -1;
+	/* quadrants by (u, v) half: 0 = (low, low), 1 = (low, high), 2 = (high, high), 3 = (high, low) */
+	if(u < median) {
+		if(v < median) {
+			quadrant = 0;
+		}
+		else {
+			quadrant = 1;
+			v -= median;
+		}
+	}
+	else {
+		if(v < median) {
+			quadrant = 3;
+		}
+		else {
+			quadrant = 2;
+			v -= median;
+		}
+		u -= median;
+	}
+	/* u and v have been rebased to the chosen quadrant's origin; median itself is only read */
+	return quadrant;
+}
+
+/* Build the patch map: one quadtree root per face whose children lead to the
+ * handle of the patch covering each quadrant. The nodes are appended to
+ * table.table, and offset is added to every index stored in them. */
+static void build_patch_map(PackedPatchTable& table, OpenSubdiv::Far::PatchTable* patch_table, int offset)
+{
+	int num_faces = 0;
+
+	for(int array = 0; array < table.num_arrays; array++) {
+		Far::ConstPatchParamArray params = patch_table->GetPatchParams(array);
+
+		for(int j = 0; j < patch_table->GetNumPatches(array); j++) {
+			num_faces = max(num_faces, (int)params[j].GetFaceId());
+		}
+	}
+	num_faces++; /* highest face id + 1 */
+
+	vector<PatchMapQuadNode> quadtree;
+	quadtree.reserve(num_faces + table.num_patches);
+	quadtree.resize(num_faces);
+
+	/* adjust offsets to make indices relative to the table */
+	int handle_index = -(int)(table.num_patches * PATCH_HANDLE_SIZE);
+	offset += table.total_size();
+
+	/* populate the quadtree from the FarPatchArrays sub-patches */
+	for(int array = 0; array < table.num_arrays; array++) {
+		Far::ConstPatchParamArray params = patch_table->GetPatchParams(array);
+
+		for(int i = 0; i < patch_table->GetNumPatches(array); i++, handle_index += PATCH_HANDLE_SIZE) {
+			const Far::PatchParam& param = params[i];
+			unsigned short depth = param.GetDepth();
+
+			PatchMapQuadNode* node = &quadtree[params[i].GetFaceId()];
+
+			if(depth == (param.NonQuadRoot() ? 1 : 0)) {
+				/* special case : regular BSpline face w/ no sub-patches */
+				node->set_child(handle_index + offset);
+				continue;
+			}
+
+			int u = param.GetU();
+			int v = param.GetV();
+			int pdepth = param.NonQuadRoot() ? depth-2 : depth-1;
+			int half = 1 << pdepth;
+
+			for(int j = 0; j < depth; j++) {
+				int quadrant = resolve_quadrant(half, u, v);
+				assert(quadrant >= 0);
+				half >>= 1;
+
+				if(j == pdepth) {
+					/* we have reached the depth of the sub-patch : add a leaf */
+					assert(!(node->children[quadrant] & PATCH_MAP_NODE_IS_SET));
+					node->set_child(quadrant, handle_index + offset, true);
+					break;
+				}
+				else {
+					/* travel down the child node of the corresponding quadrant */
+					if(!(node->children[quadrant] & PATCH_MAP_NODE_IS_SET)) {
+						/* create a new branch in the quadrant; link the parent first, since
+						 * push_back() may reallocate the quadtree and invalidate node */
+						int idx = (int)quadtree.size();
+						node->set_child(quadrant, idx*4 + offset, false);
+
+						quadtree.push_back(PatchMapQuadNode());
+						node = &quadtree[idx];
+					}
+					else {
+						/* travel down an existing branch */
+						uint idx = node->children[quadrant] & PATCH_MAP_NODE_INDEX_MASK;
+						node = &(quadtree[(idx - offset)/4]);
+					}
+				}
+			}
+		}
+	}
+
+	/* copy into table */
+	assert(table.table.size() == table.total_size());
+	uint map_offset = table.total_size();
+
+	table.num_nodes = quadtree.size() * 4;
+	table.table.resize(table.total_size());
+
+	uint* data = &table.table[map_offset];
+
+	for(size_t i = 0; i < quadtree.size(); i++) {
+		for(int j = 0; j < 4; j++) {
+			assert(quadtree[i].children[j] & PATCH_MAP_NODE_IS_SET);
+			*(data++) = quadtree[i].children[j];
+		}
+	}
+}
+
+#endif
+
+/* packed patch table functions */
+
+size_t PackedPatchTable::total_size()
+{
+	return num_arrays * PATCH_ARRAY_SIZE + /* sizes are in uints: patch array descriptors */
+		   num_indices + /* control vertex indices */
+		   num_patches * (PATCH_PARAM_SIZE + PATCH_HANDLE_SIZE) + /* one param and one handle per patch */
+		   num_nodes * PATCH_NODE_SIZE; /* patch map quadtree children */
+}
+
+void PackedPatchTable::pack(Far::PatchTable* patch_table, int offset)
+{
+	num_arrays = 0;
+	num_patches = 0;
+	num_indices = 0;
+	num_nodes = 0;
+	/* flatten patch_table into the uint array `table`; without WITH_OPENSUBDIV only the counts are reset */
+#ifdef WITH_OPENSUBDIV
+	num_arrays = patch_table->GetNumPatchArrays();
+	/* first pass: count patches and control vertex indices over all patch arrays */
+	for(int i = 0; i < num_arrays; i++) {
+		int patches = patch_table->GetNumPatches(i);
+		int num_control = patch_table->GetPatchArrayDescriptor(i).GetNumControlVertices();
+
+		num_patches += patches;
+		num_indices += patches * num_control;
+	}
+
+	table.resize(total_size());
+	uint* data = &table[0];
+	/* write cursors for each section, in storage order: arrays | indices | params | handles */
+	uint* array = data;
+	uint* index = array + num_arrays * PATCH_ARRAY_SIZE;
+	uint* param = index + num_indices;
+	uint* handle = param + num_patches * PATCH_PARAM_SIZE;
+
+	uint current_param = 0;
+
+	for(int i = 0; i < num_arrays; i++) {
+		*(array++) = patch_table->GetPatchArrayDescriptor(i).GetType(); /* patch type */
+		*(array++) = patch_table->GetNumPatches(i); /* patch count */
+		*(array++) = (index - data) + offset; /* where this array's indices start */
+		*(array++) = (param - data) + offset; /* where this array's params start */
+
+		Far::ConstIndexArray indices = patch_table->GetPatchArrayVertices(i);
+
+		for(int j = 0; j < indices.size(); j++) {
+			*(index++) = indices[j];
+		}
+
+		const Far::PatchParamTable& param_table = patch_table->GetPatchParamTable();
+
+		int num_control = patch_table->GetPatchArrayDescriptor(i).GetNumControlVertices();
+		int patches = patch_table->GetNumPatches(i);
+
+		for(int j = 0; j < patches; j++, current_param++) {
+			*(param++) = param_table[current_param].field0;
+			*(param++) = param_table[current_param].field1;
+			/* handle: offset of the patch's array descriptor, offset of its param, first index within the array */
+			*(handle++) = (array - data) - PATCH_ARRAY_SIZE + offset;
+			*(handle++) = (param - data) - PATCH_PARAM_SIZE + offset;
+			*(handle++) = j * num_control;
+		}
+	}
+	/* append the patch map quadtree nodes after the handles and set num_nodes */
+	build_patch_map(*this, patch_table, offset);
+#else
+	(void)patch_table;
+	(void)offset;
+#endif
+}
+
+void PackedPatchTable::copy_adjusting_offsets(uint* dest, int doffset)
+{
+	uint* src = &table[0];
+	/* copies table to dest section by section; presumably dest has room for total_size() uints - confirm at callers */
+	/* arrays: type and patch count copied as is, index and param offsets shifted by doffset */
+	for(int i = 0; i < num_arrays; i++) {
+		*(dest++) = *(src++);
+		*(dest++) = *(src++);
+		*(dest++) = *(src++) + doffset;
+		*(dest++) = *(src++) + doffset;
+	}
+
+	/* indices: copied unchanged */
+	for(int i = 0; i < num_indices; i++) {
+		*(dest++) = *(src++);
+	}
+
+	/* params: both fields copied unchanged */
+	for(int i = 0; i < num_patches; i++) {
+		*(dest++) = *(src++);
+		*(dest++) = *(src++);
+	}
+
+	/* handles: array and param offsets shifted by doffset, third entry copied unchanged */
+	for(int i = 0; i < num_patches; i++) {
+		*(dest++) = *(src++) + doffset;
+		*(dest++) = *(src++) + doffset;
+		*(dest++) = *(src++);
+	}
+
+	/* nodes: every child word shifted by doffset; NOTE(review): assumes the sum never reaches the flag bits */
+	for(int i = 0; i < num_nodes; i++) {
+		*(dest++) = *(src++) + doffset;
+	}
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/subd/subd_patch_table.h b/intern/cycles/subd/subd_patch_table.h
new file mode 100644
index 00000000000..c8c7ecf9e47
--- /dev/null
+++ b/intern/cycles/subd/subd_patch_table.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SUBD_PATCH_TABLE_H__
+#define __SUBD_PATCH_TABLE_H__
+
+#include "util_types.h"
+#include "util_vector.h"
+
+#ifdef WITH_OPENSUBDIV
+#ifdef _MSC_VER
+#  include "iso646.h"
+#endif
+
+#include <opensubdiv/far/patchTable.h>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef WITH_OPENSUBDIV
+using namespace OpenSubdiv;
+#else
+/* forward declare for when OpenSubdiv is unavailable */
+namespace Far { struct PatchTable; }
+#endif
+
+#define PATCH_ARRAY_SIZE 4
+#define PATCH_PARAM_SIZE 2
+#define PATCH_HANDLE_SIZE 3
+#define PATCH_NODE_SIZE 1
+
+struct PackedPatchTable {  /* patch table packed into a flat array of uints */
+	vector<uint> table;  /* arrays, indices, params, handles, nodes; in that order */
+
+	size_t num_arrays;   /* array entries, 4 uints each */
+	size_t num_indices;  /* index entries, 1 uint each */
+	size_t num_patches;  /* patches; each has a 2 uint param and a 3 uint handle */
+	size_t num_nodes;    /* node entries, 1 uint each */
+
+	/* calculated size from num_* members */
+	size_t total_size();
+
+	void pack(Far::PatchTable* patch_table, int offset = 0);  /* offset is added to stored offsets */
+	void copy_adjusting_offsets(uint* dest, int doffset);  /* copy to dest, shifting stored offsets by doffset */
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __SUBD_PATCH_TABLE_H__ */
+
diff --git a/intern/cycles/subd/subd_split.cpp b/intern/cycles/subd/subd_split.cpp
index c4af8cc8c43..3c91ad8ab0d 100644
--- a/intern/cycles/subd/subd_split.cpp
+++ b/intern/cycles/subd/subd_split.cpp
@@ -40,12 +40,6 @@ void DiagSplit::dispatch(QuadDice::SubPatch& sub, QuadDice::EdgeFactors& ef)
 	edgefactors_quad.push_back(ef);
 }
 
-void DiagSplit::dispatch(TriangleDice::SubPatch& sub, TriangleDice::EdgeFactors& ef)
-{
-	subpatches_triangle.push_back(sub);
-	edgefactors_triangle.push_back(ef);
-}
-
 float3 DiagSplit::to_world(Patch *patch, float2 uv)
 {
 	float3 P;
@@ -112,34 +106,6 @@ void DiagSplit::partition_edge(Patch *patch, float2 *P, int *t0, int *t1, float2
 	}
 }
 
-static float2 right_to_equilateral(float2 P)
-{
-	static const float2 A = make_float2(1.0f, 0.5f);
-	static const float2 B = make_float2(0.0f, sinf(M_PI_F/3.0f));
-	return make_float2(dot(P, A), dot(P, B));
-}
-
-static void limit_edge_factors(const TriangleDice::SubPatch& sub, TriangleDice::EdgeFactors& ef, int max_t)
-{
-	float2 Pu = sub.Pu;
-	float2 Pv = sub.Pv;
-	float2 Pw = sub.Pw;
-
-	if(sub.patch->is_triangle()) {
-		Pu = right_to_equilateral(Pu);
-		Pv = right_to_equilateral(Pv);
-		Pw = right_to_equilateral(Pw);
-	}
-
-	int tu = int(max_t * len(Pw - Pv));
-	int tv = int(max_t * len(Pw - Pu));
-	int tw = int(max_t * len(Pv - Pu));
-
-	ef.tu = tu <= 1 ? 1 : min(ef.tu, tu);
-	ef.tv = tv <= 1 ? 1 : min(ef.tv, tv);
-	ef.tw = tw <= 1 ? 1 : min(ef.tw, tw);
-}
-
 static void limit_edge_factors(const QuadDice::SubPatch& sub, QuadDice::EdgeFactors& ef, int max_t)
 {
 	float2 P00 = sub.P00;
@@ -147,13 +113,6 @@ static void limit_edge_factors(const QuadDice::SubPatch& sub, QuadDice::EdgeFact
 	float2 P10 = sub.P10;
 	float2 P11 = sub.P11;
 
-	if(sub.patch->is_triangle()) {
-		P00 = right_to_equilateral(P00);
-		P01 = right_to_equilateral(P01);
-		P10 = right_to_equilateral(P10);
-		P11 = right_to_equilateral(P11);
-	}
-
 	int tu0 = int(max_t * len(P10 - P00));
 	int tu1 = int(max_t * len(P11 - P01));
 	int tv0 = int(max_t * len(P01 - P00));
@@ -165,84 +124,6 @@ static void limit_edge_factors(const QuadDice::SubPatch& sub, QuadDice::EdgeFact
 	ef.tv1 = tv1 <= 1 ? 1 : min(ef.tv1, tv1);
 }
 
-void DiagSplit::split(TriangleDice::SubPatch& sub, TriangleDice::EdgeFactors& ef, int depth)
-{
-	if(depth > 32) {
-		/* We should never get here, but just in case end recursion safely. */
-		ef.tu = 1;
-		ef.tv = 1;
-		ef.tw = 1;
-
-		dispatch(sub, ef);
-		return;
-	}
-
-	assert(ef.tu == T(sub.patch, sub.Pv, sub.Pw));
-	assert(ef.tv == T(sub.patch, sub.Pw, sub.Pu));
-	assert(ef.tw == T(sub.patch, sub.Pu, sub.Pv));
-
-	int non_uniform_count = int(ef.tu == DSPLIT_NON_UNIFORM) +
-	                        int(ef.tv == DSPLIT_NON_UNIFORM) +
-                            int(ef.tw == DSPLIT_NON_UNIFORM);
-
-	switch(non_uniform_count) {
-		case 1: {
-			/* TODO(mai): one edge is non-uniform, split into two triangles */
-			// fallthru
-		}
-		case 2: {
-			/* TODO(mai): two edges are non-uniform, split into triangle and quad */
-			// fallthru
-		}
-		case 3: {
-			/* all three edges are non-uniform, split into three quads */
-
-			/* partition edges */
-			QuadDice::EdgeFactors ef0, ef1, ef2;
-			float2 Pu, Pv, Pw, Pcenter;
-
-			partition_edge(sub.patch, &Pu, &ef1.tv0, &ef2.tu0, sub.Pw, sub.Pv, ef.tu);
-			partition_edge(sub.patch, &Pv, &ef0.tv0, &ef1.tu0, sub.Pu, sub.Pw, ef.tv);
-			partition_edge(sub.patch, &Pw, &ef2.tv0, &ef0.tu0, sub.Pv, sub.Pu, ef.tw);
-			Pcenter = (Pu + Pv + Pw) * (1.0f / 3.0f);
-
-			/* split */
-			int tsplit01 = T(sub.patch, Pv, Pcenter);
-			int tsplit12 = T(sub.patch, Pu, Pcenter);
-			int tsplit20 = T(sub.patch, Pw, Pcenter);
-
-			ef0.tu1 = tsplit01;
-			ef0.tv1 = tsplit20;
-
-			ef1.tu1 = tsplit12;
-			ef1.tv1 = tsplit01;
-
-			ef2.tu1 = tsplit20;
-			ef2.tv1 = tsplit12;
-
-			/* create subpatches */
-			QuadDice::SubPatch sub0 = {sub.patch, sub.Pu, Pw, Pv, Pcenter};
-			QuadDice::SubPatch sub1 = {sub.patch, sub.Pw, Pv, Pu, Pcenter};
-			QuadDice::SubPatch sub2 = {sub.patch, sub.Pv, Pu, Pw, Pcenter};
-
-			limit_edge_factors(sub0, ef0, 1 << params.max_level);
-			limit_edge_factors(sub1, ef1, 1 << params.max_level);
-			limit_edge_factors(sub2, ef2, 1 << params.max_level);
-
-			split(sub0, ef0, depth+1);
-			split(sub1, ef1, depth+1);
-			split(sub2, ef2, depth+1);
-
-			break;
-		}
-		default: {
-			/* all edges uniform, no splitting needed */
-			dispatch(sub, ef);
-			break;
-		}
-	}
-}
-
 void DiagSplit::split(QuadDice::SubPatch& sub, QuadDice::EdgeFactors& ef, int depth)
 {
 	if(depth > 32) {
@@ -259,6 +140,16 @@ void DiagSplit::split(QuadDice::SubPatch& sub, QuadDice::EdgeFactors& ef, int de
 	bool split_u = (ef.tu0 == DSPLIT_NON_UNIFORM || ef.tu1 == DSPLIT_NON_UNIFORM);
 	bool split_v = (ef.tv0 == DSPLIT_NON_UNIFORM || ef.tv1 == DSPLIT_NON_UNIFORM);
 
+	/* Split subpatches such that the ratio of T for opposite edges doesn't
+	 * exceed 1.5, which reduces over-tessellation for some patches.
+	 */
+	bool tmp_split_v = split_v;
+	if(!split_u && min(ef.tu0, ef.tu1) > 8 && min(ef.tu0, ef.tu1)*1.5f < max(ef.tu0, ef.tu1))
+		split_v = true;
+	if(!tmp_split_v && min(ef.tu0, ef.tu1) > 8 && min(ef.tv0, ef.tv1)*1.5f < max(ef.tv0, ef.tv1))
+		split_u = true;
+
+	/* alternate axis */
 	if(split_u && split_v) {
 		split_u = depth % 2;
 	}
@@ -324,69 +215,21 @@ void DiagSplit::split(QuadDice::SubPatch& sub, QuadDice::EdgeFactors& ef, int de
 	}
 }
 
-void DiagSplit::split_triangle(Patch *patch)
-{
-	TriangleDice::SubPatch sub_split;
-	TriangleDice::EdgeFactors ef_split;
-
-	sub_split.patch = patch;
-	sub_split.Pu = make_float2(1.0f, 0.0f);
-	sub_split.Pv = make_float2(0.0f, 1.0f);
-	sub_split.Pw = make_float2(0.0f, 0.0f);
-
-	ef_split.tu = T(patch, sub_split.Pv, sub_split.Pw);
-	ef_split.tv = T(patch, sub_split.Pw, sub_split.Pu);
-	ef_split.tw = T(patch, sub_split.Pu, sub_split.Pv);
-
-	limit_edge_factors(sub_split, ef_split, 1 << params.max_level);
-
-	split(sub_split, ef_split);
-
-	TriangleDice dice(params);
-
-	for(size_t i = 0; i < subpatches_triangle.size(); i++) {
-		TriangleDice::SubPatch& sub = subpatches_triangle[i];
-		TriangleDice::EdgeFactors& ef = edgefactors_triangle[i];
-
-		ef.tu = max(ef.tu, 1);
-		ef.tv = max(ef.tv, 1);
-		ef.tw = max(ef.tw, 1);
-
-		dice.dice(sub, ef);
-	}
-
-	subpatches_triangle.clear();
-	edgefactors_triangle.clear();
-
-	/* triangle might be split into quads so dice quad subpatches as well */
-	QuadDice qdice(params);
-
-	for(size_t i = 0; i < subpatches_quad.size(); i++) {
-		QuadDice::SubPatch& sub = subpatches_quad[i];
-		QuadDice::EdgeFactors& ef = edgefactors_quad[i];
-
-		ef.tu0 = max(ef.tu0, 1);
-		ef.tu1 = max(ef.tu1, 1);
-		ef.tv0 = max(ef.tv0, 1);
-		ef.tv1 = max(ef.tv1, 1);
-
-		qdice.dice(sub, ef);
-	}
-
-	subpatches_quad.clear();
-	edgefactors_quad.clear();
-}
-
-void DiagSplit::split_quad(Patch *patch)
+void DiagSplit::split_quad(Patch *patch, QuadDice::SubPatch *subpatch)
 {
 	QuadDice::SubPatch sub_split;
 	QuadDice::EdgeFactors ef_split;
 
-	sub_split.patch = patch;
-	sub_split.P00 = make_float2(0.0f, 0.0f);
-	sub_split.P10 = make_float2(1.0f, 0.0f);
-	sub_split.P01 = make_float2(0.0f, 1.0f);
-	sub_split.P11 = make_float2(1.0f, 1.0f);
+	if(subpatch) {
+		sub_split = *subpatch;
+	}
+	else {
+		sub_split.patch = patch;
+		sub_split.P00 = make_float2(0.0f, 0.0f);
+		sub_split.P10 = make_float2(1.0f, 0.0f);
+		sub_split.P01 = make_float2(0.0f, 1.0f);
+		sub_split.P11 = make_float2(1.0f, 1.0f);
+	}
 
 	ef_split.tu0 = T(patch, sub_split.P00, sub_split.P10);
 	ef_split.tu1 = T(patch, sub_split.P01, sub_split.P11);
diff --git a/intern/cycles/subd/subd_split.h b/intern/cycles/subd/subd_split.h
index bbe921f739c..a2f76dd2e03 100644
--- a/intern/cycles/subd/subd_split.h
+++ b/intern/cycles/subd/subd_split.h
@@ -38,8 +38,6 @@ class DiagSplit {
 public:
 	vector<QuadDice::SubPatch> subpatches_quad;
 	vector<QuadDice::EdgeFactors> edgefactors_quad;
-	vector<TriangleDice::SubPatch> subpatches_triangle;
-	vector<TriangleDice::EdgeFactors> edgefactors_triangle;
 
 	SubdParams params;
 
@@ -53,11 +51,7 @@ public:
 	void dispatch(QuadDice::SubPatch& sub, QuadDice::EdgeFactors& ef);
 	void split(QuadDice::SubPatch& sub, QuadDice::EdgeFactors& ef, int depth=0);
 
-	void dispatch(TriangleDice::SubPatch& sub, TriangleDice::EdgeFactors& ef);
-	void split(TriangleDice::SubPatch& sub, TriangleDice::EdgeFactors& ef, int depth=0);
-
-	void split_triangle(Patch *patch);
-	void split_quad(Patch *patch);
+	void split_quad(Patch *patch, QuadDice::SubPatch *subpatch=NULL);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/test/CMakeLists.txt b/intern/cycles/test/CMakeLists.txt
index 2f3a4d0b1df..9af777fb9dd 100644
--- a/intern/cycles/test/CMakeLists.txt
+++ b/intern/cycles/test/CMakeLists.txt
@@ -14,9 +14,48 @@ endmacro()
 set(INC
 	.
 	..
+	../device
+	../graph
+	../kernel
+	../render
 	../util
 )
 
+set(ALL_CYCLES_LIBRARIES
+	cycles_render
+	cycles_device
+	cycles_bvh
+	cycles_graph
+	cycles_subd
+	cycles_util
+	${OPENIMAGEIO_LIBRARIES}
+)
+if(WITH_CYCLES_OSL)
+	list(APPEND ALL_CYCLES_LIBRARIES
+		cycles_kernel_osl
+		${OSL_LIBRARIES}
+		${LLVM_LIBRARIES}
+	)
+endif()
+if(WITH_IMAGE_OPENJPEG AND NOT WITH_SYSTEM_OPENJPEG)
+	list(APPEND ALL_CYCLES_LIBRARIES
+		extern_openjpeg
+	)
+endif()
+if(WITH_CYCLES_OPENSUBDIV)
+	add_definitions(-DWITH_OPENSUBDIV)
+	include_directories(
+		SYSTEM
+		${OPENSUBDIV_INCLUDE_DIR}
+	)
+	list(APPEND ALL_CYCLES_LIBRARIES
+		${OPENSUBDIV_LIBRARIES}
+	)
+endif()
+list(APPEND ALL_CYCLES_LIBRARIES
+	${BOOST_LIBRARIES}
+)
+
 include_directories(${INC})
 
 link_directories(${BOOST_LIBPATH})
@@ -25,6 +64,7 @@ link_directories(${OPENIMAGEIO_LIBPATH})
 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PLATFORM_LINKFLAGS}")
 set(CMAKE_EXE_LINKER_FLAGS_DEBUG "${CMAKE_EXE_LINKER_FLAGS_DEBUG} ${PLATFORM_LINKFLAGS_DEBUG}")
 
+CYCLES_TEST(render_graph_finalize "${ALL_CYCLES_LIBRARIES}")
 CYCLES_TEST(util_aligned_malloc "cycles_util")
 CYCLES_TEST(util_path "cycles_util;${BOOST_LIBRARIES};${OPENIMAGEIO_LIBRARIES}")
 CYCLES_TEST(util_string "cycles_util;${BOOST_LIBRARIES}")
diff --git a/intern/cycles/test/render_graph_finalize_test.cpp b/intern/cycles/test/render_graph_finalize_test.cpp
new file mode 100644
index 00000000000..633e517ce9f
--- /dev/null
+++ b/intern/cycles/test/render_graph_finalize_test.cpp
@@ -0,0 +1,1532 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "testing/testing.h"
+#include "testing/mock_log.h"
+
+#include "render/graph.h"
+#include "render/scene.h"
+#include "render/nodes.h"
+#include "util/util_logging.h"
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+using testing::AnyNumber;
+using testing::HasSubstr;
+using testing::ScopedMockLog;
+using testing::_;
+
+CCL_NAMESPACE_BEGIN
+
+namespace {
+
+/* Creates a node of type T and sets its inputs or members by chaining. */
+template<typename T>
+class ShaderNodeBuilder {
+public:
+	ShaderNodeBuilder(const string& name)
+	  : name_(name)
+	{
+		node_ = new T();
+		node_->name = name;
+	}
+
+	const string& name() const { return name_; }
+	ShaderNode *node() const { return node_; }
+
+	/* Set input socket input_name; an unknown name fails the test and is skipped. */
+	template<typename V>
+	ShaderNodeBuilder& set(const string& input_name, V value)
+	{
+		ShaderInput *input_socket = node_->input(input_name.c_str());
+		EXPECT_NE((void*)NULL, input_socket);
+		if(input_socket != NULL) {
+			input_socket->set(value);
+		}
+		return *this;
+	}
+
+	/* Assign value to the member of the node that pfield points to. */
+	template<typename T2, typename V>
+	ShaderNodeBuilder& set(V T2::*pfield, V value)
+	{
+		static_cast<T*>(node_)->*pfield = value;
+		return *this;
+	}
+
+protected:
+	string name_;
+	ShaderNode *node_;  /* allocated in the constructor, not freed by the builder */
+};
+
+class ShaderGraphBuilder {  /* builds a ShaderGraph from named nodes */
+public:
+	explicit ShaderGraphBuilder(ShaderGraph *graph)
+	  : graph_(graph)
+	{
+		node_map_["Output"] = graph->output();
+	}
+
+	ShaderNode *find_node(const string& name)  /* NULL when name is unknown */
+	{
+		map<string, ShaderNode *>::iterator it = node_map_.find(name);
+		return (it == node_map_.end()) ? NULL : it->second;
+	}
+
+	template<typename T>
+	ShaderGraphBuilder& add_node(const T& node)  /* node provides name() and node() */
+	{
+		EXPECT_EQ(NULL, find_node(node.name()));
+		graph_->add(node.node());
+		node_map_[node.name()] = node.node();
+		return *this;
+	}
+
+	/* Connect "Node::Output" to "Node::Input"; bad endpoints only fail the test. */
+	ShaderGraphBuilder& add_connection(const string& from, const string& to)
+	{
+		vector<string> tokens_from, tokens_to;
+		string_split(tokens_from, from, "::");
+		string_split(tokens_to, to, "::");
+		EXPECT_EQ(2, tokens_from.size());
+		EXPECT_EQ(2, tokens_to.size());
+		const bool valid = (tokens_from.size() == 2 && tokens_to.size() == 2);  /* EXPECT_* is non-fatal */
+		ShaderNode *node_from = valid ? find_node(tokens_from[0]) : NULL,
+		           *node_to = valid ? find_node(tokens_to[0]) : NULL;
+		EXPECT_NE((void*)NULL, node_from);
+		EXPECT_NE((void*)NULL, node_to);
+		EXPECT_NE(node_from, node_to);
+		ShaderOutput *socket_from = node_from ? node_from->output(tokens_from[1].c_str()) : NULL;
+		ShaderInput *socket_to = node_to ? node_to->input(tokens_to[1].c_str()) : NULL;
+		EXPECT_NE((void*)NULL, socket_from);
+		EXPECT_NE((void*)NULL, socket_to);
+		if(socket_from != NULL && socket_to != NULL) {
+			graph_->connect(socket_from, socket_to);
+		}
+		return *this;
+	}
+
+	/* Common input/output boilerplate. */
+	ShaderGraphBuilder& add_attribute(const string &name)
+	{
+		return (*this)
+			.add_node(ShaderNodeBuilder<AttributeNode>(name)
+			          .set(&AttributeNode::attribute, ustring(name)));
+	}
+
+	ShaderGraphBuilder& output_closure(const string& from)
+	{
+		return (*this).add_connection(from, "Output::Surface");
+	}
+
+	ShaderGraphBuilder& output_color(const string& from)  /* via an Emission node's Color */
+	{
+		return (*this)
+			.add_node(ShaderNodeBuilder<EmissionNode>("EmissionNode"))
+			.add_connection(from, "EmissionNode::Color")
+			.output_closure("EmissionNode::Emission");
+	}
+
+	ShaderGraphBuilder& output_value(const string& from)  /* via an Emission node's Strength */
+	{
+		return (*this)
+			.add_node(ShaderNodeBuilder<EmissionNode>("EmissionNode"))
+			.add_connection(from, "EmissionNode::Strength")
+			.output_closure("EmissionNode::Emission");
+	}
+
+protected:
+	ShaderGraph *graph_;
+	map<string, ShaderNode *> node_map_;
+};
+
+}  // namespace
+
+/* Declares the logging setup, mock log, scene, graph and graph builder of a test. */
+#define DEFINE_COMMON_VARIABLES(builder_name, mock_log_name) \
+	util_logging_start(); \
+	util_logging_verbosity_set(1); \
+	ScopedMockLog mock_log_name; \
+	DeviceInfo device_info; \
+	SceneParams scene_params; \
+	Scene scene(scene_params, device_info); \
+	ShaderGraph graph; \
+	ShaderGraphBuilder builder_name(&graph);
+#define EXPECT_ANY_MESSAGE(log) \
+	EXPECT_CALL(log, Log(_, _, _)).Times(AnyNumber()); \
+
+#define CORRECT_INFO_MESSAGE(log, message) \
+	EXPECT_CALL(log, Log(google::INFO, _, HasSubstr(message)));
+
+#define INVALID_INFO_MESSAGE(log, message) \
+	EXPECT_CALL(log, Log(google::INFO, _, HasSubstr(message))).Times(0);
+
+/*
+ * Test deduplication of nodes that have inputs, some of them folded.
+ */
+TEST(render_graph, deduplicate_deep)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding Value1::Value to constant (0.8).");
+	CORRECT_INFO_MESSAGE(log, "Folding Value2::Value to constant (0.8).");
+	CORRECT_INFO_MESSAGE(log, "Deduplicated 2 nodes.");
+
+	builder
+		.add_node(ShaderNodeBuilder<GeometryNode>("Geometry1"))
+		.add_node(ShaderNodeBuilder<GeometryNode>("Geometry2"))
+		.add_node(ShaderNodeBuilder<ValueNode>("Value1")
+		          .set(&ValueNode::value, 0.8f))
+		.add_node(ShaderNodeBuilder<ValueNode>("Value2")
+		          .set(&ValueNode::value, 0.8f))
+		.add_node(ShaderNodeBuilder<NoiseTextureNode>("Noise1"))
+		.add_node(ShaderNodeBuilder<NoiseTextureNode>("Noise2"))
+		.add_node(ShaderNodeBuilder<MixNode>("Mix")
+		          .set(&MixNode::type, NODE_MIX_BLEND)
+		          .set("Fac", 0.5f))
+		.add_connection("Geometry1::Parametric", "Noise1::Vector")
+		.add_connection("Value1::Value", "Noise1::Scale")
+		.add_connection("Noise1::Color", "Mix::Color1")
+		.add_connection("Geometry2::Parametric", "Noise2::Vector")
+		.add_connection("Value2::Value", "Noise2::Scale")
+		.add_connection("Noise2::Color", "Mix::Color2")
+		.output_color("Mix::Color");
+
+	graph.finalize(&scene);
+
+	EXPECT_EQ(graph.nodes.size(), 5);
+}
+
+/*
+ * Test RGB to BW node.
+ */
+TEST(render_graph, constant_fold_rgb_to_bw)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding RGBToBWNodeNode::Val to constant (0.8).");
+	CORRECT_INFO_MESSAGE(log, "Folding convert_float_to_color::value_color to constant (0.8, 0.8, 0.8).");
+
+	builder
+		.add_node(ShaderNodeBuilder<RGBToBWNode>("RGBToBWNodeNode")
+		          .set("Color", make_float3(0.8f, 0.8f, 0.8f)))
+		.output_color("RGBToBWNodeNode::Val");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - Folding of Emission nodes that emit nothing.
+ */
+TEST(render_graph, constant_fold_emission1)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Discarding closure Emission.");
+
+	builder
+		.add_node(ShaderNodeBuilder<EmissionNode>("Emission")
+		          .set("Color", make_float3(0.0f, 0.0f, 0.0f)))
+		.output_closure("Emission::Emission");
+
+	graph.finalize(&scene);
+}
+
+TEST(render_graph, constant_fold_emission2)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Discarding closure Emission.");
+
+	builder
+		.add_node(ShaderNodeBuilder<EmissionNode>("Emission")
+		          .set("Strength", 0.0f))  /* zero strength, expected to be discarded */
+		.output_closure("Emission::Emission");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - Folding of Background nodes that emit nothing.
+ */
+TEST(render_graph, constant_fold_background1)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Discarding closure Background.");
+
+	builder
+		.add_node(ShaderNodeBuilder<BackgroundNode>("Background")
+		          .set("Color", make_float3(0.0f, 0.0f, 0.0f)))
+		.output_closure("Background::Background");
+
+	graph.finalize(&scene);
+}
+
+TEST(render_graph, constant_fold_background2)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Discarding closure Background.");
+
+	builder
+		.add_node(ShaderNodeBuilder<BackgroundNode>("Background")
+		          .set("Strength", 0.0f))  /* zero strength, expected to be discarded */
+		.output_closure("Background::Background");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - Folding of Add Closure with only one input.
+ */
+TEST(render_graph, constant_fold_shader_add)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding AddClosure1::Closure to socket Diffuse::BSDF.");
+	CORRECT_INFO_MESSAGE(log, "Folding AddClosure2::Closure to socket Diffuse::BSDF.");
+	INVALID_INFO_MESSAGE(log, "Folding AddClosure3");
+
+	builder
+		.add_node(ShaderNodeBuilder<DiffuseBsdfNode>("Diffuse"))
+		.add_node(ShaderNodeBuilder<AddClosureNode>("AddClosure1"))
+		.add_node(ShaderNodeBuilder<AddClosureNode>("AddClosure2"))
+		.add_node(ShaderNodeBuilder<AddClosureNode>("AddClosure3"))
+		.add_connection("Diffuse::BSDF", "AddClosure1::Closure1")
+		.add_connection("Diffuse::BSDF", "AddClosure2::Closure2")
+		.add_connection("AddClosure1::Closure", "AddClosure3::Closure1")
+		.add_connection("AddClosure2::Closure", "AddClosure3::Closure2")
+		.output_closure("AddClosure3::Closure");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - Folding of Mix Closure with 0 or 1 fac.
+ *  - Folding of Mix Closure with both inputs folded to the same node.
+ */
+TEST(render_graph, constant_fold_shader_mix)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding MixClosure1::Closure to socket Diffuse::BSDF.");
+	CORRECT_INFO_MESSAGE(log, "Folding MixClosure2::Closure to socket Diffuse::BSDF.");
+	CORRECT_INFO_MESSAGE(log, "Folding MixClosure3::Closure to socket Diffuse::BSDF.");
+
+	builder
+		.add_attribute("Attribute")
+		.add_node(ShaderNodeBuilder<DiffuseBsdfNode>("Diffuse"))
+		/* choose left */
+		.add_node(ShaderNodeBuilder<MixClosureNode>("MixClosure1")
+		          .set("Fac", 0.0f))
+		.add_connection("Diffuse::BSDF", "MixClosure1::Closure1")
+		/* choose right */
+		.add_node(ShaderNodeBuilder<MixClosureNode>("MixClosure2")
+		          .set("Fac", 1.0f))
+		.add_connection("Diffuse::BSDF", "MixClosure2::Closure2")
+		/* both inputs folded the same */
+		.add_node(ShaderNodeBuilder<MixClosureNode>("MixClosure3"))
+		.add_connection("Attribute::Fac", "MixClosure3::Fac")
+		.add_connection("MixClosure1::Closure", "MixClosure3::Closure1")
+		.add_connection("MixClosure2::Closure", "MixClosure3::Closure2")
+		.output_closure("MixClosure3::Closure");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - Folding of Invert with all constant inputs.
+ */
+TEST(render_graph, constant_fold_invert)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding Invert::Color to constant (0.68, 0.5, 0.32).");
+
+	builder
+		.add_node(ShaderNodeBuilder<InvertNode>("Invert")
+		          .set("Fac", 0.8f)
+		          .set("Color", make_float3(0.2f, 0.5f, 0.8f)))
+		.output_color("Invert::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - Folding of Invert with zero Fac.
+ */
+TEST(render_graph, constant_fold_invert_fac_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding Invert::Color to socket Attribute::Color.");
+
+	builder
+		.add_attribute("Attribute")
+		.add_node(ShaderNodeBuilder<InvertNode>("Invert")
+		          .set("Fac", 0.0f))
+		.add_connection("Attribute::Color", "Invert::Color")
+		.output_color("Invert::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - Folding of MixRGB Add with all constant inputs (clamp false).
+ */
+TEST(render_graph, constant_fold_mix_add)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding MixAdd::Color to constant (0.62, 1.14, 1.42).");
+
+	builder
+		.add_node(ShaderNodeBuilder<MixNode>("MixAdd")
+		          .set(&MixNode::type, NODE_MIX_ADD)
+		          .set(&MixNode::use_clamp, false)
+		          .set("Fac", 0.8f)
+		          .set("Color1", make_float3(0.3, 0.5, 0.7))
+		          .set("Color2", make_float3(0.4, 0.8, 0.9)))
+		.output_color("MixAdd::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - Folding of MixRGB Add with all constant inputs (clamp true).
+ */
+TEST(render_graph, constant_fold_mix_add_clamp)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding MixAdd::Color to constant (0.62, 1, 1).");
+
+	builder
+		.add_node(ShaderNodeBuilder<MixNode>("MixAdd")
+		          .set(&MixNode::type, NODE_MIX_ADD)
+		          .set(&MixNode::use_clamp, true)
+		          .set("Fac", 0.8f)
+		          .set("Color1", make_float3(0.3, 0.5, 0.7))
+		          .set("Color2", make_float3(0.4, 0.8, 0.9)))
+		.output_color("MixAdd::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - No folding on fac 0 for dodge.
+ */
+TEST(render_graph, constant_fold_part_mix_dodge_no_fac_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	INVALID_INFO_MESSAGE(log, "Folding ");
+
+	builder
+		.add_attribute("Attribute1")
+		.add_attribute("Attribute2")
+		.add_node(ShaderNodeBuilder<MixNode>("Mix")
+		          .set(&MixNode::type, NODE_MIX_DODGE)
+		          .set(&MixNode::use_clamp, false)
+		          .set("Fac", 0.0f))
+		.add_connection("Attribute1::Color", "Mix::Color1")
+		.add_connection("Attribute2::Color", "Mix::Color2")
+		.output_color("Mix::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - No folding on fac 0 for light.
+ */
+TEST(render_graph, constant_fold_part_mix_light_no_fac_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	INVALID_INFO_MESSAGE(log, "Folding ");
+
+	builder
+		.add_attribute("Attribute1")
+		.add_attribute("Attribute2")
+		.add_node(ShaderNodeBuilder<MixNode>("Mix")
+		          .set(&MixNode::type, NODE_MIX_LIGHT)
+		          .set(&MixNode::use_clamp, false)
+		          .set("Fac", 0.0f))
+		.add_connection("Attribute1::Color", "Mix::Color1")
+		.add_connection("Attribute2::Color", "Mix::Color2")
+		.output_color("Mix::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - No folding on fac 0 for burn.
+ */
+TEST(render_graph, constant_fold_part_mix_burn_no_fac_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	INVALID_INFO_MESSAGE(log, "Folding ");
+
+	builder
+		.add_attribute("Attribute1")
+		.add_attribute("Attribute2")
+		.add_node(ShaderNodeBuilder<MixNode>("Mix")
+		          .set(&MixNode::type, NODE_MIX_BURN)
+		          .set(&MixNode::use_clamp, false)
+		          .set("Fac", 0.0f))
+		.add_connection("Attribute1::Color", "Mix::Color1")
+		.add_connection("Attribute2::Color", "Mix::Color2")
+		.output_color("Mix::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - No folding on fac 0 for clamped blend.
+ */
+TEST(render_graph, constant_fold_part_mix_blend_clamped_no_fac_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	INVALID_INFO_MESSAGE(log, "Folding ");
+
+	builder
+		.add_attribute("Attribute1")
+		.add_attribute("Attribute2")
+		.add_node(ShaderNodeBuilder<MixNode>("Mix")
+		          .set(&MixNode::type, NODE_MIX_BLEND)
+		          .set(&MixNode::use_clamp, true)
+		          .set("Fac", 0.0f))
+		.add_connection("Attribute1::Color", "Mix::Color1")
+		.add_connection("Attribute2::Color", "Mix::Color2")
+		.output_color("Mix::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - Folding of Mix with 0 or 1 Fac.
+ *  - Folding of Mix with both inputs folded to the same node.
+ */
+TEST(render_graph, constant_fold_part_mix_blend)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding MixBlend1::Color to socket Attribute1::Color.");
+	CORRECT_INFO_MESSAGE(log, "Folding MixBlend2::Color to socket Attribute1::Color.");
+	CORRECT_INFO_MESSAGE(log, "Folding MixBlend3::Color to socket Attribute1::Color.");
+
+	builder
+		.add_attribute("Attribute1")
+		.add_attribute("Attribute2")
+		/* choose left */
+		.add_node(ShaderNodeBuilder<MixNode>("MixBlend1")
+		          .set(&MixNode::type, NODE_MIX_BLEND)
+		          .set(&MixNode::use_clamp, false)
+		          .set("Fac", 0.0f))
+		.add_connection("Attribute1::Color", "MixBlend1::Color1")
+		.add_connection("Attribute2::Color", "MixBlend1::Color2")
+		/* choose right */
+		.add_node(ShaderNodeBuilder<MixNode>("MixBlend2")
+		          .set(&MixNode::type, NODE_MIX_BLEND)
+		          .set(&MixNode::use_clamp, false)
+		          .set("Fac", 1.0f))
+		.add_connection("Attribute1::Color", "MixBlend2::Color2")
+		.add_connection("Attribute2::Color", "MixBlend2::Color1")
+		/* both inputs folded to Attribute1 */
+		.add_node(ShaderNodeBuilder<MixNode>("MixBlend3")
+		          .set(&MixNode::type, NODE_MIX_BLEND)
+		          .set(&MixNode::use_clamp, false))
+		.add_connection("Attribute1::Fac", "MixBlend3::Fac")
+		.add_connection("MixBlend1::Color", "MixBlend3::Color1")
+		.add_connection("MixBlend2::Color", "MixBlend3::Color2")
+		.output_color("MixBlend3::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - NOT folding of MixRGB Sub with the same inputs and fac NOT 1.
+ */
+TEST(render_graph, constant_fold_part_mix_sub_same_fac_bad)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	INVALID_INFO_MESSAGE(log, "Folding Mix::");
+
+	builder
+		.add_attribute("Attribute")
+		.add_node(ShaderNodeBuilder<MixNode>("Mix")
+		          .set(&MixNode::type, NODE_MIX_SUB)
+		          .set(&MixNode::use_clamp, true)
+		          .set("Fac", 0.5f))
+		.add_connection("Attribute::Color", "Mix::Color1")
+		.add_connection("Attribute::Color", "Mix::Color2")
+		.output_color("Mix::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - Folding of MixRGB Sub with the same inputs and fac 1.
+ */
+TEST(render_graph, constant_fold_part_mix_sub_same_fac_1)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding Mix::Color to constant (0, 0, 0).");
+
+	builder
+		.add_attribute("Attribute")
+		.add_node(ShaderNodeBuilder<MixNode>("Mix")
+		          .set(&MixNode::type, NODE_MIX_SUB)
+		          .set(&MixNode::use_clamp, true)
+		          .set("Fac", 1.0f))
+		.add_connection("Attribute::Color", "Mix::Color1")
+		.add_connection("Attribute::Color", "Mix::Color2")
+		.output_color("Mix::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Graph for testing partial folds of MixRGB with one constant argument.
+ * Includes 4 tests: constant on each side with fac either unknown or 1.
+ */
+static void build_mix_partial_test_graph(ShaderGraphBuilder &builder, NodeMix type, float3 constval)
+{
+	builder
+		.add_attribute("Attribute")
+		/* constant on the left */
+		.add_node(ShaderNodeBuilder<MixNode>("Mix_Cx_Fx")  /* constant Color1, Fac from Attribute */
+		          .set(&MixNode::type, type)
+		          .set(&MixNode::use_clamp, false)
+		          .set("Color1", constval))
+		.add_node(ShaderNodeBuilder<MixNode>("Mix_Cx_F1")  /* constant Color1, Fac fixed to 1 */
+		          .set(&MixNode::type, type)
+		          .set(&MixNode::use_clamp, false)
+		          .set("Color1", constval)
+		          .set("Fac", 1.0f))
+		.add_connection("Attribute::Fac", "Mix_Cx_Fx::Fac")
+		.add_connection("Attribute::Color", "Mix_Cx_Fx::Color2")
+		.add_connection("Attribute::Color", "Mix_Cx_F1::Color2")
+		/* constant on the right */
+		.add_node(ShaderNodeBuilder<MixNode>("Mix_xC_Fx")  /* constant Color2, Fac from Attribute */
+		          .set(&MixNode::type, type)
+		          .set(&MixNode::use_clamp, false)
+		          .set("Color2", constval))
+		.add_node(ShaderNodeBuilder<MixNode>("Mix_xC_F1")  /* constant Color2, Fac fixed to 1 */
+		          .set(&MixNode::type, type)
+		          .set(&MixNode::use_clamp, false)
+		          .set("Color2", constval)
+		          .set("Fac", 1.0f))
+		.add_connection("Attribute::Fac", "Mix_xC_Fx::Fac")
+		.add_connection("Attribute::Color", "Mix_xC_Fx::Color1")
+		.add_connection("Attribute::Color", "Mix_xC_F1::Color1")
+		/* results of actual tests simply added up to connect to output */
+		.add_node(ShaderNodeBuilder<MixNode>("Out12")  /* sums the two constant-left mixes */
+		          .set(&MixNode::type, NODE_MIX_ADD)
+		          .set(&MixNode::use_clamp, true)
+		          .set("Fac", 1.0f))
+		.add_node(ShaderNodeBuilder<MixNode>("Out34")  /* sums the two constant-right mixes */
+		          .set(&MixNode::type, NODE_MIX_ADD)
+		          .set(&MixNode::use_clamp, true)
+		          .set("Fac", 1.0f))
+		.add_node(ShaderNodeBuilder<MixNode>("Out1234")  /* sums Out12 and Out34 */
+		          .set(&MixNode::type, NODE_MIX_ADD)
+		          .set(&MixNode::use_clamp, true)
+		          .set("Fac", 1.0f))
+		.add_connection("Mix_Cx_Fx::Color", "Out12::Color1")
+		.add_connection("Mix_Cx_F1::Color", "Out12::Color2")
+		.add_connection("Mix_xC_Fx::Color", "Out34::Color1")
+		.add_connection("Mix_xC_F1::Color", "Out34::Color2")
+		.add_connection("Out12::Color", "Out1234::Color1")
+		.add_connection("Out34::Color", "Out1234::Color2")
+		.output_color("Out1234::Color");
+}
+
+/*
+ * Tests: partial folding for RGB Add with known 0.
+ */
+TEST(render_graph, constant_fold_part_mix_add_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	/* 0 + X (fac 1) == X */
+	INVALID_INFO_MESSAGE(log, "Folding Mix_Cx_Fx::Color");
+	CORRECT_INFO_MESSAGE(log, "Folding Mix_Cx_F1::Color to socket Attribute::Color.");
+	/* X + 0 (fac ?) == X */
+	CORRECT_INFO_MESSAGE(log, "Folding Mix_xC_Fx::Color to socket Attribute::Color.");
+	CORRECT_INFO_MESSAGE(log, "Folding Mix_xC_F1::Color to socket Attribute::Color.");
+	INVALID_INFO_MESSAGE(log, "Folding Out");
+
+	build_mix_partial_test_graph(builder, NODE_MIX_ADD, make_float3(0, 0, 0));
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: partial folding for RGB Sub with known 0.
+ */
+TEST(render_graph, constant_fold_part_mix_sub_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	INVALID_INFO_MESSAGE(log, "Folding Mix_Cx_Fx::Color");
+	INVALID_INFO_MESSAGE(log, "Folding Mix_Cx_F1::Color");
+	/* X - 0 (fac ?) == X */
+	CORRECT_INFO_MESSAGE(log, "Folding Mix_xC_Fx::Color to socket Attribute::Color.");
+	CORRECT_INFO_MESSAGE(log, "Folding Mix_xC_F1::Color to socket Attribute::Color.");
+	INVALID_INFO_MESSAGE(log, "Folding Out");
+
+	build_mix_partial_test_graph(builder, NODE_MIX_SUB, make_float3(0, 0, 0));
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: partial folding for RGB Mul with known 1.
+ */
+TEST(render_graph, constant_fold_part_mix_mul_1)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	/* 1 * X (fac 1) == X */
+	INVALID_INFO_MESSAGE(log, "Folding Mix_Cx_Fx::Color");
+	CORRECT_INFO_MESSAGE(log, "Folding Mix_Cx_F1::Color to socket Attribute::Color.");
+	/* X * 1 (fac ?) == X */
+	CORRECT_INFO_MESSAGE(log, "Folding Mix_xC_Fx::Color to socket Attribute::Color.");
+	CORRECT_INFO_MESSAGE(log, "Folding Mix_xC_F1::Color to socket Attribute::Color.");
+	INVALID_INFO_MESSAGE(log, "Folding Out");
+
+	build_mix_partial_test_graph(builder, NODE_MIX_MUL, make_float3(1, 1, 1));
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: partial folding for RGB Div with known 1.
+ */
+TEST(render_graph, constant_fold_part_mix_div_1)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	INVALID_INFO_MESSAGE(log, "Folding Mix_Cx_Fx::Color");
+	INVALID_INFO_MESSAGE(log, "Folding Mix_Cx_F1::Color");
+	/* X / 1 (fac ?) == X */
+	CORRECT_INFO_MESSAGE(log, "Folding Mix_xC_Fx::Color to socket Attribute::Color.");
+	CORRECT_INFO_MESSAGE(log, "Folding Mix_xC_F1::Color to socket Attribute::Color.");
+	INVALID_INFO_MESSAGE(log, "Folding Out");
+
+	build_mix_partial_test_graph(builder, NODE_MIX_DIV, make_float3(1, 1, 1));
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: partial folding for RGB Mul with known 0.
+ */
+TEST(render_graph, constant_fold_part_mix_mul_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	/* 0 * ? (fac ?) == 0 */
+	CORRECT_INFO_MESSAGE(log, "Folding Mix_Cx_Fx::Color to constant (0, 0, 0).");
+	CORRECT_INFO_MESSAGE(log, "Folding Mix_Cx_F1::Color to constant (0, 0, 0).");
+	/* ? * 0 (fac 1) == 0 */
+	INVALID_INFO_MESSAGE(log, "Folding Mix_xC_Fx::Color");
+	CORRECT_INFO_MESSAGE(log, "Folding Mix_xC_F1::Color to constant (0, 0, 0).");
+
+	CORRECT_INFO_MESSAGE(log, "Folding Out12::Color to constant (0, 0, 0).");
+	INVALID_INFO_MESSAGE(log, "Folding Out1234");
+
+	build_mix_partial_test_graph(builder, NODE_MIX_MUL, make_float3(0, 0, 0));
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: partial folding for RGB Div with known 0.
+ */
+TEST(render_graph, constant_fold_part_mix_div_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	/* 0 / ? (fac ?) == 0 */
+	CORRECT_INFO_MESSAGE(log, "Folding Mix_Cx_Fx::Color to constant (0, 0, 0).");
+	CORRECT_INFO_MESSAGE(log, "Folding Mix_Cx_F1::Color to constant (0, 0, 0).");
+	INVALID_INFO_MESSAGE(log, "Folding Mix_xC_Fx::Color");
+	INVALID_INFO_MESSAGE(log, "Folding Mix_xC_F1::Color");
+
+	CORRECT_INFO_MESSAGE(log, "Folding Out12::Color to constant (0, 0, 0).");
+	INVALID_INFO_MESSAGE(log, "Folding Out1234");
+
+	build_mix_partial_test_graph(builder, NODE_MIX_DIV, make_float3(0, 0, 0));
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: Separate/Combine RGB with all constant inputs.
+ */
+TEST(render_graph, constant_fold_separate_combine_rgb)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding SeparateRGB::R to constant (0.3).");
+	CORRECT_INFO_MESSAGE(log, "Folding SeparateRGB::G to constant (0.5).");
+	CORRECT_INFO_MESSAGE(log, "Folding SeparateRGB::B to constant (0.7).");
+	CORRECT_INFO_MESSAGE(log, "Folding CombineRGB::Image to constant (0.3, 0.5, 0.7).");
+
+	builder
+		.add_node(ShaderNodeBuilder<SeparateRGBNode>("SeparateRGB")
+		          .set("Image", make_float3(0.3f, 0.5f, 0.7f)))
+		.add_node(ShaderNodeBuilder<CombineRGBNode>("CombineRGB"))
+		.add_connection("SeparateRGB::R", "CombineRGB::R")
+		.add_connection("SeparateRGB::G", "CombineRGB::G")
+		.add_connection("SeparateRGB::B", "CombineRGB::B")
+		.output_color("CombineRGB::Image");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: Separate/Combine XYZ with all constant inputs.
+ */
+TEST(render_graph, constant_fold_separate_combine_xyz)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding SeparateXYZ::X to constant (0.3).");
+	CORRECT_INFO_MESSAGE(log, "Folding SeparateXYZ::Y to constant (0.5).");
+	CORRECT_INFO_MESSAGE(log, "Folding SeparateXYZ::Z to constant (0.7).");
+	CORRECT_INFO_MESSAGE(log, "Folding CombineXYZ::Vector to constant (0.3, 0.5, 0.7).");
+	CORRECT_INFO_MESSAGE(log, "Folding convert_vector_to_color::value_color to constant (0.3, 0.5, 0.7).");
+
+	builder
+		.add_node(ShaderNodeBuilder<SeparateXYZNode>("SeparateXYZ")
+		          .set("Vector", make_float3(0.3f, 0.5f, 0.7f)))
+		.add_node(ShaderNodeBuilder<CombineXYZNode>("CombineXYZ"))
+		.add_connection("SeparateXYZ::X", "CombineXYZ::X")
+		.add_connection("SeparateXYZ::Y", "CombineXYZ::Y")
+		.add_connection("SeparateXYZ::Z", "CombineXYZ::Z")
+		.output_color("CombineXYZ::Vector");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: Separate/Combine HSV with all constant inputs.
+ */
+TEST(render_graph, constant_fold_separate_combine_hsv)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding SeparateHSV::H to constant (0.583333).");
+	CORRECT_INFO_MESSAGE(log, "Folding SeparateHSV::S to constant (0.571429).");
+	CORRECT_INFO_MESSAGE(log, "Folding SeparateHSV::V to constant (0.7).");
+	CORRECT_INFO_MESSAGE(log, "Folding CombineHSV::Color to constant (0.3, 0.5, 0.7).");
+
+	builder
+		.add_node(ShaderNodeBuilder<SeparateHSVNode>("SeparateHSV")
+		          .set("Color", make_float3(0.3f, 0.5f, 0.7f)))
+		.add_node(ShaderNodeBuilder<CombineHSVNode>("CombineHSV"))
+		.add_connection("SeparateHSV::H", "CombineHSV::H")
+		.add_connection("SeparateHSV::S", "CombineHSV::S")
+		.add_connection("SeparateHSV::V", "CombineHSV::V")
+		.output_color("CombineHSV::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: Gamma with all constant inputs.
+ */
+TEST(render_graph, constant_fold_gamma)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding Gamma::Color to constant (0.164317, 0.353553, 0.585662).");
+
+	builder
+		.add_node(ShaderNodeBuilder<GammaNode>("Gamma")
+		          .set("Color", make_float3(0.3f, 0.5f, 0.7f))
+		          .set("Gamma", 1.5f))
+		.output_color("Gamma::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: BrightnessContrast with all constant inputs.
+ */
+TEST(render_graph, constant_fold_bright_contrast)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding BrightContrast::Color to constant (0.16, 0.6, 1.04).");
+
+	builder
+		.add_node(ShaderNodeBuilder<BrightContrastNode>("BrightContrast")
+		          .set("Color", make_float3(0.3f, 0.5f, 0.7f))
+		          .set("Bright", 0.1f)
+		          .set("Contrast", 1.2f))
+		.output_color("BrightContrast::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: blackbody with all constant inputs.
+ */
+TEST(render_graph, constant_fold_blackbody)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding Blackbody::Color to constant (3.94163, 0.226523, 0).");
+
+	builder
+		.add_node(ShaderNodeBuilder<BlackbodyNode>("Blackbody")
+		          .set("Temperature", 1200.0f))
+		.output_color("Blackbody::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: Math with all constant inputs (clamp false).
+ */
+TEST(render_graph, constant_fold_math)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding Math::Value to constant (1.6).");
+
+	builder
+		.add_node(ShaderNodeBuilder<MathNode>("Math")
+		          .set(&MathNode::type, NODE_MATH_ADD)
+		          .set(&MathNode::use_clamp, false)
+		          .set("Value1", 0.7f)
+		          .set("Value2", 0.9f))
+		.output_value("Math::Value");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: Math with all constant inputs (clamp true).
+ */
+TEST(render_graph, constant_fold_math_clamp)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding Math::Value to constant (1).");
+
+	builder
+		.add_node(ShaderNodeBuilder<MathNode>("Math")
+		          .set(&MathNode::type, NODE_MATH_ADD)
+		          .set(&MathNode::use_clamp, true)
+		          .set("Value1", 0.7f)
+		          .set("Value2", 0.9f))
+		.output_value("Math::Value");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Graph for testing partial folds of Math with one constant argument.
+ * Includes 2 tests: constant on each side.
+ */
+static void build_math_partial_test_graph(ShaderGraphBuilder &builder, NodeMath type, float constval)
+{
+	builder
+		.add_attribute("Attribute")
+		/* constant on the left */
+		.add_node(ShaderNodeBuilder<MathNode>("Math_Cx")
+		          .set(&MathNode::type, type)
+		          .set(&MathNode::use_clamp, false)
+		          .set("Value1", constval))
+		.add_connection("Attribute::Fac", "Math_Cx::Value2")
+		/* constant on the right */
+		.add_node(ShaderNodeBuilder<MathNode>("Math_xC")
+		          .set(&MathNode::type, type)
+		          .set(&MathNode::use_clamp, false)
+		          .set("Value2", constval))
+		.add_connection("Attribute::Fac", "Math_xC::Value1")
+		/* output sum */
+		.add_node(ShaderNodeBuilder<MathNode>("Out")
+		          .set(&MathNode::type, NODE_MATH_ADD)
+		          .set(&MathNode::use_clamp, true))
+		.add_connection("Math_Cx::Value", "Out::Value1")
+		.add_connection("Math_xC::Value", "Out::Value2")
+		.output_value("Out::Value");
+}
+
+/*
+ * Tests: partial folding for Math Add with known 0.
+ */
+TEST(render_graph, constant_fold_part_math_add_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	/* X + 0 == 0 + X == X */
+	CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to socket Attribute::Fac.");
+	CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Value to socket Attribute::Fac.");
+	INVALID_INFO_MESSAGE(log, "Folding Out::");
+
+	build_math_partial_test_graph(builder, NODE_MATH_ADD, 0.0f);
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: partial folding for Math Sub with known 0.
+ */
+TEST(render_graph, constant_fold_part_math_sub_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	/* X - 0 == X */
+	INVALID_INFO_MESSAGE(log, "Folding Math_Cx::");
+	CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Value to socket Attribute::Fac.");
+	INVALID_INFO_MESSAGE(log, "Folding Out::");
+
+	build_math_partial_test_graph(builder, NODE_MATH_SUBTRACT, 0.0f);
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: partial folding for Math Mul with known 1.
+ */
+TEST(render_graph, constant_fold_part_math_mul_1)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	/* X * 1 == 1 * X == X */
+	CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to socket Attribute::Fac.");
+	CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Value to socket Attribute::Fac.");
+	INVALID_INFO_MESSAGE(log, "Folding Out::");
+
+	build_math_partial_test_graph(builder, NODE_MATH_MULTIPLY, 1.0f);
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: partial folding for Math Div with known 1.
+ */
+TEST(render_graph, constant_fold_part_math_div_1)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	/* X / 1 == X */
+	INVALID_INFO_MESSAGE(log, "Folding Math_Cx::");
+	CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Value to socket Attribute::Fac.");
+	INVALID_INFO_MESSAGE(log, "Folding Out::");
+
+	build_math_partial_test_graph(builder, NODE_MATH_DIVIDE, 1.0f);
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: partial folding for Math Mul with known 0.
+ */
+TEST(render_graph, constant_fold_part_math_mul_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	/* X * 0 == 0 * X == 0 */
+	CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to constant (0).");
+	CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Value to constant (0).");
+	CORRECT_INFO_MESSAGE(log, "Folding Out::Value to constant (0)");
+	CORRECT_INFO_MESSAGE(log, "Discarding closure EmissionNode.");
+
+	build_math_partial_test_graph(builder, NODE_MATH_MULTIPLY, 0.0f);
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: partial folding for Math Div with known 0.
+ */
+TEST(render_graph, constant_fold_part_math_div_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	/* 0 / X == 0 */
+	CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to constant (0).");
+	INVALID_INFO_MESSAGE(log, "Folding Math_xC::");
+	INVALID_INFO_MESSAGE(log, "Folding Out::");
+
+	build_math_partial_test_graph(builder, NODE_MATH_DIVIDE, 0.0f);
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: Vector Math with all constant inputs.
+ */
+TEST(render_graph, constant_fold_vector_math)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding VectorMath::Value to constant (1).");
+	CORRECT_INFO_MESSAGE(log, "Folding VectorMath::Vector to constant (3, 0, 0).");
+	CORRECT_INFO_MESSAGE(log, "Folding convert_vector_to_float::value_float to constant (1).");
+	CORRECT_INFO_MESSAGE(log, "Folding Math::Value to constant (2).");
+	CORRECT_INFO_MESSAGE(log, "Folding convert_float_to_color::value_color to constant (2, 2, 2).");
+
+	builder
+		.add_node(ShaderNodeBuilder<VectorMathNode>("VectorMath")
+		          .set(&VectorMathNode::type, NODE_VECTOR_MATH_SUBTRACT)
+		          .set("Vector1", make_float3(1.3f, 0.5f, 0.7f))
+		          .set("Vector2", make_float3(-1.7f, 0.5f, 0.7f)))
+		.add_node(ShaderNodeBuilder<MathNode>("Math")
+		          .set(&MathNode::type, NODE_MATH_ADD))
+		.add_connection("VectorMath::Vector", "Math::Value1")
+		.add_connection("VectorMath::Value", "Math::Value2")
+		.output_color("Math::Value");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Graph for testing partial folds of Vector Math with one constant argument.
+ * Includes 2 tests: constant on each side.
+ */
+static void build_vecmath_partial_test_graph(ShaderGraphBuilder &builder, NodeVectorMath type, float3 constval)
+{
+	builder
+		.add_attribute("Attribute")
+		/* constant on the left */
+		.add_node(ShaderNodeBuilder<VectorMathNode>("Math_Cx")
+		          .set(&VectorMathNode::type, type)
+		          .set("Vector1", constval))
+		.add_connection("Attribute::Vector", "Math_Cx::Vector2")
+		/* constant on the right */
+		.add_node(ShaderNodeBuilder<VectorMathNode>("Math_xC")
+		          .set(&VectorMathNode::type, type)
+		          .set("Vector2", constval))
+		.add_connection("Attribute::Vector", "Math_xC::Vector1")
+		/* output sum */
+		.add_node(ShaderNodeBuilder<VectorMathNode>("Out")
+		          .set(&VectorMathNode::type, NODE_VECTOR_MATH_ADD))
+		.add_connection("Math_Cx::Vector", "Out::Vector1")
+		.add_connection("Math_xC::Vector", "Out::Vector2")
+		.output_color("Out::Vector");
+}
+
+/*
+ * Tests: partial folding for Vector Math Add with known 0.
+ */
+TEST(render_graph, constant_fold_part_vecmath_add_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	/* X + 0 == 0 + X == X */
+	CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Vector to socket Attribute::Vector.");
+	CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Vector to socket Attribute::Vector.");
+	INVALID_INFO_MESSAGE(log, "Folding Out::");
+
+	build_vecmath_partial_test_graph(builder, NODE_VECTOR_MATH_ADD, make_float3(0,0,0));
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: partial folding for Vector Math Sub with known 0.
+ */
+TEST(render_graph, constant_fold_part_vecmath_sub_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	/* X - 0 == X */
+	INVALID_INFO_MESSAGE(log, "Folding Math_Cx::");
+	CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Vector to socket Attribute::Vector.");
+	INVALID_INFO_MESSAGE(log, "Folding Out::");
+
+	build_vecmath_partial_test_graph(builder, NODE_VECTOR_MATH_SUBTRACT, make_float3(0,0,0));
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: partial folding for Vector Math Dot Product with known 0.
+ */
+TEST(render_graph, constant_fold_part_vecmath_dot_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	/* dot(X, 0) == dot(0, X) == 0 */
+	CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Vector to constant (0, 0, 0).");
+	CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Vector to constant (0, 0, 0).");
+	CORRECT_INFO_MESSAGE(log, "Folding Out::Vector to constant (0, 0, 0).");
+	CORRECT_INFO_MESSAGE(log, "Discarding closure EmissionNode.");
+
+	build_vecmath_partial_test_graph(builder, NODE_VECTOR_MATH_DOT_PRODUCT, make_float3(0,0,0));
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: partial folding for Vector Math Cross Product with known 0.
+ */
+TEST(render_graph, constant_fold_part_vecmath_cross_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	/* cross(X, 0) == cross(0, X) == 0 */
+	CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Vector to constant (0, 0, 0).");
+	CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Vector to constant (0, 0, 0).");
+	CORRECT_INFO_MESSAGE(log, "Folding Out::Vector to constant (0, 0, 0).");
+	CORRECT_INFO_MESSAGE(log, "Discarding closure EmissionNode.");
+
+	build_vecmath_partial_test_graph(builder, NODE_VECTOR_MATH_CROSS_PRODUCT, make_float3(0,0,0));
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: Bump with no height input folded to Normal input.
+ */
+TEST(render_graph, constant_fold_bump)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding Bump::Normal to socket Geometry1::Normal.");
+
+	builder
+		.add_node(ShaderNodeBuilder<GeometryNode>("Geometry1"))
+		.add_node(ShaderNodeBuilder<BumpNode>("Bump"))
+		.add_connection("Geometry1::Normal", "Bump::Normal")
+		.output_color("Bump::Normal");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: Bump with no inputs folded to Geometry::Normal.
+ */
+TEST(render_graph, constant_fold_bump_no_input)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding Bump::Normal to socket geometry::Normal.");
+
+	builder
+		.add_node(ShaderNodeBuilder<BumpNode>("Bump"))
+		.output_color("Bump::Normal");
+
+	graph.finalize(&scene);
+}
+
+template<class T>
+void init_test_curve(array<T> &buffer, T start, T end, int steps)
+{
+	buffer.resize(steps);
+	/* Fill with `steps` evenly spaced lerp samples; a lone sample uses t = 0 instead of dividing 0 by 0. */
+	for (int i = 0; i < steps; i++)
+		buffer[i] = lerp(start, end, (steps > 1)? float(i)/(steps-1): 0.0f);
+}
+
+/*
+ * Tests:
+ *  - Folding of RGB Curves with all constant inputs.
+ */
+TEST(render_graph, constant_fold_rgb_curves)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding Curves::Color to constant (0.275, 0.5, 0.475).");
+
+	array<float3> curve;
+	init_test_curve(curve, make_float3(0.0f, 0.25f, 1.0f), make_float3(1.0f, 0.75f, 0.0f), 257);
+
+	builder
+		.add_node(ShaderNodeBuilder<RGBCurvesNode>("Curves")
+		          .set(&CurvesNode::curves, curve)
+		          .set(&CurvesNode::min_x, 0.1f)
+		          .set(&CurvesNode::max_x, 0.9f)
+		          .set("Fac", 0.5f)
+		          .set("Color", make_float3(0.3f, 0.5f, 0.7f)))
+		.output_color("Curves::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - Folding of RGB Curves with zero Fac.
+ */
+TEST(render_graph, constant_fold_rgb_curves_fac_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding Curves::Color to socket Attribute::Color.");
+
+	array<float3> curve;
+	init_test_curve(curve, make_float3(0.0f, 0.25f, 1.0f), make_float3(1.0f, 0.75f, 0.0f), 257);
+
+	builder
+		.add_attribute("Attribute")
+		.add_node(ShaderNodeBuilder<RGBCurvesNode>("Curves")
+		          .set(&CurvesNode::curves, curve)
+		          .set(&CurvesNode::min_x, 0.1f)
+		          .set(&CurvesNode::max_x, 0.9f)
+		          .set("Fac", 0.0f))
+		.add_connection("Attribute::Color", "Curves::Color")
+		.output_color("Curves::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - Folding of Vector Curves with all constant inputs.
+ */
+TEST(render_graph, constant_fold_vector_curves)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding Curves::Vector to constant (0.275, 0.5, 0.475).");
+
+	array<float3> curve;
+	init_test_curve(curve, make_float3(0.0f, 0.25f, 1.0f), make_float3(1.0f, 0.75f, 0.0f), 257);
+
+	builder
+		.add_node(ShaderNodeBuilder<VectorCurvesNode>("Curves")
+		          .set(&CurvesNode::curves, curve)
+		          .set(&CurvesNode::min_x, 0.1f)
+		          .set(&CurvesNode::max_x, 0.9f)
+		          .set("Fac", 0.5f)
+		          .set("Vector", make_float3(0.3f, 0.5f, 0.7f)))
+		.output_color("Curves::Vector");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - Folding of Vector Curves with zero Fac.
+ */
+TEST(render_graph, constant_fold_vector_curves_fac_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding Curves::Vector to socket Attribute::Vector.");
+
+	array<float3> curve;
+	init_test_curve(curve, make_float3(0.0f, 0.25f, 1.0f), make_float3(1.0f, 0.75f, 0.0f), 257);
+
+	builder
+		.add_attribute("Attribute")
+		.add_node(ShaderNodeBuilder<VectorCurvesNode>("Curves")
+		          .set(&CurvesNode::curves, curve)
+		          .set(&CurvesNode::min_x, 0.1f)
+		          .set(&CurvesNode::max_x, 0.9f)
+		          .set("Fac", 0.0f))
+		.add_connection("Attribute::Vector", "Curves::Vector")
+		.output_color("Curves::Vector");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - Folding of Color Ramp with all constant inputs.
+ */
+TEST(render_graph, constant_fold_rgb_ramp)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding Ramp::Color to constant (0.14, 0.39, 0.64).");
+	CORRECT_INFO_MESSAGE(log, "Folding Ramp::Alpha to constant (0.89).");
+
+	array<float3> curve;
+	array<float> alpha;
+	init_test_curve(curve, make_float3(0.0f, 0.25f, 0.5f), make_float3(0.25f, 0.5f, 0.75f), 9);
+	init_test_curve(alpha, 0.75f, 1.0f, 9);
+
+	builder
+		.add_node(ShaderNodeBuilder<RGBRampNode>("Ramp")
+		          .set(&RGBRampNode::ramp, curve)
+		          .set(&RGBRampNode::ramp_alpha, alpha)
+		          .set(&RGBRampNode::interpolate, true)
+		          .set("Fac", 0.56f))
+		.add_node(ShaderNodeBuilder<MixNode>("Mix")
+		          .set(&MixNode::type, NODE_MIX_ADD))
+		.add_connection("Ramp::Color", "Mix::Color1")
+		.add_connection("Ramp::Alpha", "Mix::Color2")
+		.output_color("Mix::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - Folding of Color Ramp with all constant inputs (interpolate false).
+ */
+TEST(render_graph, constant_fold_rgb_ramp_flat)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding Ramp::Color to constant (0.125, 0.375, 0.625).");
+	CORRECT_INFO_MESSAGE(log, "Folding Ramp::Alpha to constant (0.875).");
+
+	array<float3> curve;
+	array<float> alpha;
+	init_test_curve(curve, make_float3(0.0f, 0.25f, 0.5f), make_float3(0.25f, 0.5f, 0.75f), 9);
+	init_test_curve(alpha, 0.75f, 1.0f, 9);
+
+	builder
+		.add_node(ShaderNodeBuilder<RGBRampNode>("Ramp")
+		          .set(&RGBRampNode::ramp, curve)
+		          .set(&RGBRampNode::ramp_alpha, alpha)
+		          .set(&RGBRampNode::interpolate, false)
+		          .set("Fac", 0.56f))
+		.add_node(ShaderNodeBuilder<MixNode>("Mix")
+		          .set(&MixNode::type, NODE_MIX_ADD))
+		.add_connection("Ramp::Color", "Mix::Color1")
+		.add_connection("Ramp::Alpha", "Mix::Color2")
+		.output_color("Mix::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - Folding of redundant conversion of float to color to float.
+ */
+TEST(render_graph, constant_fold_convert_float_color_float)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding Invert::Color to socket convert_float_to_color::value_color.");
+	CORRECT_INFO_MESSAGE(log, "Folding convert_color_to_float::value_float to socket Attribute::Fac.");
+
+	builder
+		.add_attribute("Attribute")
+		.add_node(ShaderNodeBuilder<InvertNode>("Invert")
+		          .set("Fac", 0.0f))
+		.add_connection("Attribute::Fac", "Invert::Color")
+		.output_value("Invert::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - Folding of redundant conversion of color to vector to color.
+ */
+TEST(render_graph, constant_fold_convert_color_vector_color)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding VecAdd::Vector to socket convert_color_to_vector::value_vector.");
+	CORRECT_INFO_MESSAGE(log, "Folding convert_vector_to_color::value_color to socket Attribute::Color.");
+
+	builder
+		.add_attribute("Attribute")
+		.add_node(ShaderNodeBuilder<VectorMathNode>("VecAdd")
+		          .set(&VectorMathNode::type, NODE_VECTOR_MATH_ADD)
+		          .set("Vector2", make_float3(0,0,0)))
+		.add_connection("Attribute::Color", "VecAdd::Vector1")
+		.output_color("VecAdd::Vector");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests:
+ *  - NOT folding conversion of color to float to color.
+ */
+TEST(render_graph, constant_fold_convert_color_float_color)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding MathAdd::Value to socket convert_color_to_float::value_float.");
+	INVALID_INFO_MESSAGE(log, "Folding convert_float_to_color::");
+
+	builder
+		.add_attribute("Attribute")
+		.add_node(ShaderNodeBuilder<MathNode>("MathAdd")
+		          .set(&MathNode::type, NODE_MATH_ADD)
+		          .set("Value2", 0.0f))
+		.add_connection("Attribute::Color", "MathAdd::Value1")
+		.output_color("MathAdd::Value");
+
+	graph.finalize(&scene);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index cfe6fa65143..89a882d9b9d 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -174,6 +174,11 @@ ccl_device_inline float clamp(float a, float mn, float mx)
 	return min(max(a, mn), mx);
 }
 
+ccl_device_inline float mix(float a, float b, float t)
+{
+    return a + t*(b - a);
+}
+
 #endif
 
 #ifndef __KERNEL_CUDA__
@@ -219,6 +224,11 @@ ccl_device_inline float smoothstepf(float f)
 	return (3.0f*ff - 2.0f*ff*f);
 }
 
+ccl_device_inline int mod(int x, int m)
+{
+	return (x % m + m) % m;
+}
+
 /* Float2 Vector */
 
 #ifndef __KERNEL_OPENCL__
@@ -562,6 +572,12 @@ ccl_device_inline float3 safe_normalize(const float3 a)
 	return (t != 0.0f)? a/t: a;
 }
 
+ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
+{
+	*t = len(a);
+	return (*t != 0.0f)? a/(*t): a;
+}
+
 #ifndef __KERNEL_OPENCL__
 
 ccl_device_inline bool operator==(const float3 a, const float3 b)
@@ -652,6 +668,15 @@ ccl_device_inline float3 interp(float3 a, float3 b, float t)
 	return a + t*(b - a);
 }
 
+#ifndef __KERNEL_OPENCL__
+
+ccl_device_inline float3 mix(float3 a, float3 b, float t)
+{
+	return a + t*(b - a);
+}
+
+#endif
+
 ccl_device_inline bool is_zero(const float3 a)
 {
 #ifdef __KERNEL_SSE__
@@ -671,6 +696,15 @@ ccl_device_inline float average(const float3 a)
 	return reduce_add(a)*(1.0f/3.0f);
 }
 
+ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
+{
+#ifdef __KERNEL_OPENCL__
+	return all(a == b);
+#else
+	return a == b;
+#endif
+}
+
 /* Float4 Vector */
 
 #ifdef __KERNEL_SSE__
@@ -1449,10 +1483,10 @@ ccl_device bool ray_triangle_intersect(
 	return true;
 }
 
-ccl_device bool ray_triangle_intersect_uv(
-	float3 ray_P, float3 ray_D, float ray_t,
-	float3 v0, float3 v1, float3 v2,
-	float *isect_u, float *isect_v, float *isect_t)
+ccl_device_inline bool ray_triangle_intersect_uv(
+        float3 ray_P, float3 ray_D, float ray_t,
+        float3 v0, float3 v1, float3 v2,
+        float *isect_u, float *isect_v, float *isect_t)
 {
 	/* Calculate intersection */
 	float3 e1 = v1 - v0;
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index 0c848beaafd..f23f2cb0168 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -728,6 +728,17 @@ bool path_remove(const string& path)
 	return remove(path.c_str()) == 0;
 }
 
+/* Build a `#line` directive for `path`; backslashes are escaped first so the escapes added after are not doubled. */
+static string line_directive(const string& path, int line)
+{
+	string escaped_path = path;
+	string_replace(escaped_path, "\\", "\\\\");
+	string_replace(escaped_path, "\"", "\\\"");
+	string_replace(escaped_path, "\'", "\\\'");
+	string_replace(escaped_path, "\?", "\\\?");
+	return string_printf("#line %d \"%s\"", line, escaped_path.c_str());
+}
+
 string path_source_replace_includes(const string& source, const string& path)
 {
 	/* Our own little c preprocessor that replaces #includes with the file
@@ -737,7 +748,7 @@ string path_source_replace_includes(const string& source, const string& path)
 
 	string result = "";
 	vector<string> lines;
-	string_split(lines, source, "\n");
+	string_split(lines, source, "\n", false);
 
 	for(size_t i = 0; i < lines.size(); ++i) {
 		string line = lines[i];
@@ -759,7 +770,10 @@ string path_source_replace_includes(const string& source, const string& path)
 						text = path_source_replace_includes(
 						        text, path_dirname(filepath));
 						text = path_source_replace_includes(text, path);
-						line = token.replace(0, n_end + 1, "\n" + text + "\n");
+						/* Use line directives for better error messages. */
+						line = line_directive(filepath, 1)
+						     + token.replace(0, n_end + 1, "\n" + text + "\n")
+						     + line_directive(path, i);
 					}
 				}
 			}
diff --git a/intern/cycles/util/util_string.cpp b/intern/cycles/util/util_string.cpp
index c1c5a6b084b..5594aa8edb6 100644
--- a/intern/cycles/util/util_string.cpp
+++ b/intern/cycles/util/util_string.cpp
@@ -74,7 +74,10 @@ bool string_iequals(const string& a, const string& b)
 	return false;
 }
 
-void string_split(vector<string>& tokens, const string& str, const string& separators)
+void string_split(vector<string>& tokens,
+                  const string& str,
+                  const string& separators,
+                  bool skip_empty_tokens)
 {
 	size_t token_start = 0, token_length = 0;
 	for(size_t i = 0; i < str.size(); ++i) {
@@ -87,9 +90,9 @@ void string_split(vector<string>& tokens, const string& str, const string& separ
 		}
 		else {
 			/* Current character is a separator,
-			 * append current token to the list (if token is not empty).
+			 * append current token to the list (empty ones only when not skipped).
 			 */
-			if(token_length > 0) {
+			if(!skip_empty_tokens || token_length > 0) {
 				string token = str.substr(token_start, token_length);
 				tokens.push_back(token);
 			}
diff --git a/intern/cycles/util/util_string.h b/intern/cycles/util/util_string.h
index d3b5248c380..7aeed96f00b 100644
--- a/intern/cycles/util/util_string.h
+++ b/intern/cycles/util/util_string.h
@@ -39,7 +39,10 @@ using std::istringstream;
 string string_printf(const char *format, ...) PRINTF_ATTRIBUTE;
 
 bool string_iequals(const string& a, const string& b);
-void string_split(vector<string>& tokens, const string& str, const string& separators = "\t ");
+void string_split(vector<string>& tokens,
+                  const string& str,
+                  const string& separators = "\t ",
+                  bool skip_empty_tokens = true);  /* false: also keep empty tokens found before a separator */
 void string_replace(string& haystack, const string& needle, const string& other);
 bool string_startswith(const string& s, const char *start);
 bool string_endswith(const string& s, const char *end);
diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h
index 6fed18a3db8..bfc8f55feed 100644
--- a/intern/cycles/util/util_transform.h
+++ b/intern/cycles/util/util_transform.h
@@ -323,6 +323,15 @@ ccl_device_inline Transform transform_clear_scale(const Transform& tfm)
 	return ntfm;
 }
 
+/* Return the transform built by passing zero for all sixteen make_transform() arguments. */
+ccl_device_inline Transform transform_empty()
+{
+	const Transform zero_tfm = make_transform(
+		0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0);
+	return zero_tfm;
+}
+
 #endif
 
 /* Motion Transform */
diff --git a/intern/cycles/util/util_vector.h b/intern/cycles/util/util_vector.h
index 6f8c3f6f3de..546b17570bb 100644
--- a/intern/cycles/util/util_vector.h
+++ b/intern/cycles/util/util_vector.h
@@ -222,6 +222,11 @@ public:
 		return datasize_;
 	}
 
+	T* data()
+	{
+		return this->data_;  /* Non-const counterpart of the const overload below. */
+	}
+
 	const T* data() const
 	{
 		return data_;