41 files changed, 415 insertions, 687 deletions
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index 9b72efd90cf..25f91a0caea 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -45,6 +45,8 @@ set(ADDON_FILES
 	addon/ui.py
 )
 
+add_definitions(-DGLEW_STATIC)
+
 blender_add_lib(bf_intern_cycles "${SRC}" "${INC}" "${INC_SYS}")
 
 add_dependencies(bf_intern_cycles bf_rna)
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 5fa497776fe..7a03df4f35a 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -1085,6 +1085,8 @@ class CyclesTexture_PT_mapping(CyclesButtonsPanel, Panel):
 
         mapping = node.texture_mapping
 
+        layout.prop(mapping, "vector_type", expand=True)
+
         row = layout.row()
 
         row.column().prop(mapping, "translation")
@@ -1178,7 +1180,7 @@ class CyclesRender_PT_CurveRendering(CyclesButtonsPanel, Panel):
         scene = context.scene
         cscene = scene.cycles
         psys = context.particle_system
-        return CyclesButtonsPanel.poll(context) and psys
+        return CyclesButtonsPanel.poll(context) and psys and psys.settings.type == 'HAIR'
 
     def draw_header(self, context):
         ccscene = context.scene.cycles_curves
@@ -1218,8 +1220,9 @@ class CyclesParticle_PT_CurveSettings(CyclesButtonsPanel, Panel):
         scene = context.scene
         cscene = scene.cycles
         ccscene = scene.cycles_curves
-        use_curves = ccscene.use_curves and context.particle_system
-        return CyclesButtonsPanel.poll(context) and use_curves
+        psys = context.particle_system
+        use_curves = ccscene.use_curves and psys
+        return CyclesButtonsPanel.poll(context) and use_curves and psys.settings.type == 'HAIR'
 
     def draw(self, context):
         layout = self.layout
diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp
index 57ef9a1005c..1cddc25a22b 100644
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -217,8 +217,8 @@ bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par
 					float3 pcKey;
 					for(int step_no = 0; step_no <= ren_step; step_no++) {
 						float nco[3];
-						b_psys.co_hair(*b_ob, psmd, pa_no, step_no, nco);
-						float3 cKey = make_float3(nco[0],nco[1],nco[2]);
+						b_psys.co_hair(*b_ob, pa_no, step_no, nco);
+						float3 cKey = make_float3(nco[0], nco[1], nco[2]);
 						cKey = transform_point(&itfm, cKey);
 						if(step_no > 0)
 							curve_length += len(cKey - pcKey);
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index caa61e0c5d7..940a923e5af 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -480,7 +480,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri
 		BL::Mesh b_mesh = object_to_mesh(b_data, b_ob, b_scene, true, !preview, need_undeformed);
 
 		if(b_mesh) {
-			if(render_layer.use_surfaces && !(hide_tris && experimental)) {
+			if(render_layer.use_surfaces && !hide_tris) {
 				if(cmesh.data && experimental && RNA_boolean_get(&cmesh, "use_subdivision"))
 					create_subd_mesh(mesh, b_mesh, &cmesh, used_shaders);
 				else
diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp
index 2007171642f..b576181d890 100644
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -147,6 +147,7 @@ static void get_tex_mapping(TextureMapping *mapping, BL::TexMapping b_mapping)
 	mapping->translation = get_float3(b_mapping.translation());
 	mapping->rotation = get_float3(b_mapping.rotation());
 	mapping->scale = get_float3(b_mapping.scale());
+	mapping->type = (TextureMapping::Type)b_mapping.vector_type();
 
 	mapping->x_mapping = (TextureMapping::Mapping)b_mapping.mapping_x();
 	mapping->y_mapping = (TextureMapping::Mapping)b_mapping.mapping_y();
@@ -161,6 +162,7 @@ static void get_tex_mapping(TextureMapping *mapping, BL::ShaderNodeMapping b_map
 	mapping->translation = get_float3(b_mapping.translation());
 	mapping->rotation = get_float3(b_mapping.rotation());
 	mapping->scale = get_float3(b_mapping.scale());
+	mapping->type = (TextureMapping::Type)b_mapping.vector_type();
 
 	mapping->use_minmax = b_mapping.use_min() || b_mapping.use_max();
 
@@ -320,9 +322,6 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		SubsurfaceScatteringNode *subsurface = new SubsurfaceScatteringNode();
 
 		switch(b_subsurface_node.falloff()) {
-		case BL::ShaderNodeSubsurfaceScattering::falloff_COMPATIBLE:
-			subsurface->closure = CLOSURE_BSSRDF_COMPATIBLE_ID;
-			break;
 		case BL::ShaderNodeSubsurfaceScattering::falloff_CUBIC:
 			subsurface->closure = CLOSURE_BSSRDF_CUBIC_ID;
 			break;
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 337d075cce4..a349ddc5c3f 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -34,6 +34,7 @@
 
 #include "util_debug.h"
 #include "util_foreach.h"
+#include "util_opengl.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -494,7 +495,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine b_engine, BL::Use
 		params.shadingsystem = SessionParams::OSL;
 	
 	/* color managagement */
-	params.display_buffer_linear = b_engine.support_display_space_shader(b_scene);
+	params.display_buffer_linear = GLEW_ARB_half_float_pixel && b_engine.support_display_space_shader(b_scene);
 
 	return params;
 }
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 10d4112b57d..eb262a907a4 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -56,18 +56,6 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int w
 {
 	pixels_copy_from(rgba, y, w, h);
 
-	GLuint texid;
-	glGenTextures(1, &texid);
-	glBindTexture(GL_TEXTURE_2D, texid);
-	if(rgba.data_type == TYPE_HALF)
-		glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, w, h, 0, GL_RGBA, GL_HALF_FLOAT, (void*)rgba.data_pointer);
-	else
-		glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, (void*)rgba.data_pointer);
-	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-
-	glEnable(GL_TEXTURE_2D);
-	
 	if(transparent) {
 		glEnable(GL_BLEND);
 		glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
@@ -75,30 +63,57 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int w
 
 	glColor3f(1.0f, 1.0f, 1.0f);
 
-	glPushMatrix();
-	glTranslatef(0.0f, (float)dy, 0.0f);
+	if(rgba.data_type == TYPE_HALF) {
+		/* draw half float texture, GLSL shader for display transform assumed to be bound */
+		GLuint texid;
+		glGenTextures(1, &texid);
+		glBindTexture(GL_TEXTURE_2D, texid);
+		glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, w, h, 0, GL_RGBA, GL_HALF_FLOAT, (void*)rgba.data_pointer);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
 
-	glBegin(GL_QUADS);
-	
-	glTexCoord2f(0.0f, 0.0f);
-	glVertex2f(0.0f, 0.0f);
-	glTexCoord2f(1.0f, 0.0f);
-	glVertex2f((float)width, 0.0f);
-	glTexCoord2f(1.0f, 1.0f);
-	glVertex2f((float)width, (float)height);
-	glTexCoord2f(0.0f, 1.0f);
-	glVertex2f(0.0f, (float)height);
+		glEnable(GL_TEXTURE_2D);
+
+		glPushMatrix();
+		glTranslatef(0.0f, (float)dy, 0.0f);
+
+		glBegin(GL_QUADS);
+		
+		glTexCoord2f(0.0f, 0.0f);
+		glVertex2f(0.0f, 0.0f);
+		glTexCoord2f(1.0f, 0.0f);
+		glVertex2f((float)width, 0.0f);
+		glTexCoord2f(1.0f, 1.0f);
+		glVertex2f((float)width, (float)height);
+		glTexCoord2f(0.0f, 1.0f);
+		glVertex2f(0.0f, (float)height);
 
-	glEnd();
+		glEnd();
 
-	glPopMatrix();
+		glPopMatrix();
+
+		glBindTexture(GL_TEXTURE_2D, 0);
+		glDisable(GL_TEXTURE_2D);
+		glDeleteTextures(1, &texid);
+	}
+	else {
+		/* fallback for old graphics cards that don't support GLSL, half float,
+		 * and non-power-of-two textures */
+		glPixelZoom((float)width/(float)w, (float)height/(float)h);
+		glRasterPos2f(0, dy);
+
+		uint8_t *pixels = (uint8_t*)rgba.data_pointer;
+
+		pixels += 4*y*w;
+
+		glDrawPixels(w, h, GL_RGBA, GL_UNSIGNED_BYTE, pixels);
+
+		glRasterPos2f(0.0f, 0.0f);
+		glPixelZoom(1.0f, 1.0f);
+	}
 
 	if(transparent)
 		glDisable(GL_BLEND);
-
-	glBindTexture(GL_TEXTURE_2D, 0);
-	glDisable(GL_TEXTURE_2D);
-	glDeleteTextures(1, &texid);
 }
 
 Device *Device::create(DeviceInfo& info, Stats &stats, bool background)
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index b5eaa69bf0e..4ce7f6fd729 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -43,7 +43,9 @@ public:
 	CUmodule cuModule;
 	map<device_ptr, bool> tex_interp_map;
 	int cuDevId;
+	int cuDevArchitecture;
 	bool first_error;
+	bool use_texture_storage;
 
 	struct PixelMem {
 		GLuint cuPBO;
@@ -173,6 +175,7 @@ public:
 	{
 		first_error = true;
 		background = background_;
+		use_texture_storage = true;
 
 		cuDevId = info.num;
 		cuDevice = 0;
@@ -203,6 +206,15 @@ public:
 		if(cuda_error_(result, "cuCtxCreate"))
 			return;
 
+		int major, minor;
+		cuDeviceComputeCapability(&major, &minor, cuDevId);
+		cuDevArchitecture = major*100 + minor*10;
+
+		/* In order to use full 6GB of memory on Titan cards, use arrays instead
+		 * of textures. On earlier cards this seems slower, but on Titan it is
+		 * actually slightly faster in tests. */
+		use_texture_storage = (cuDevArchitecture < 350);
+
 		cuda_pop_context();
 	}
 
@@ -210,20 +222,17 @@ public:
 	{
 		task_pool.stop();
 
-		cuda_push_context();
-		cuda_assert(cuCtxDetach(cuContext))
+		cuda_assert(cuCtxDestroy(cuContext))
 	}
 
 	bool support_device(bool experimental)
 	{
-		if(!experimental) {
-			int major, minor;
-			cuDeviceComputeCapability(&major, &minor, cuDevId);
+		int major, minor;
+		cuDeviceComputeCapability(&major, &minor, cuDevId);
 
-			if(major < 2) {
-				cuda_error_message(string_printf("CUDA device supported only with compute capability 2.0 or up, found %d.%d.", major, minor));
-				return false;
-			}
+		if(major < 2) {
+			cuda_error_message(string_printf("CUDA device supported only with compute capability 2.0 or up, found %d.%d.", major, minor));
+			return false;
 		}
 
 		return true;
@@ -275,8 +284,12 @@ public:
 			cuda_error_message("CUDA nvcc compiler version could not be parsed.");
 			return "";
 		}
+		if(cuda_version < 50) {
+			printf("Unsupported CUDA version %d.%d detected, you need CUDA 5.0.\n", cuda_version/10, cuda_version%10);
+			return "";
+		}
 
-		if(cuda_version != 50)
+		else if(cuda_version > 50)
 			printf("CUDA version %d.%d detected, build may succeed but only CUDA 5.0 is officially supported.\n", cuda_version/10, cuda_version%10);
 
 		/* compile */
@@ -285,36 +298,14 @@ public:
 		const int machine = system_cpu_bits();
 		string arch_flags;
 
-		/* build flags depending on CUDA version and arch */
-		if(cuda_version < 50) {
-			/* CUDA 4.x */
-			if(major == 1) {
-				/* sm_1x */
-				arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0";
-			}
-			else if(major == 2) {
-				/* sm_2x */
-				arch_flags = "--maxrregcount=24";
-			}
-			else {
-				/* sm_3x */
-				arch_flags = "--maxrregcount=32";
-			}
+		/* CUDA 5.x build flags for different archs */
+		if(major == 2) {
+			/* sm_2x */
+			arch_flags = "--maxrregcount=32 --use_fast_math";
 		}
-		else {
-			/* CUDA 5.x */
-			if(major == 1) {
-				/* sm_1x */
-				arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0 --use_fast_math";
-			}
-			else if(major == 2) {
-				/* sm_2x */
-				arch_flags = "--maxrregcount=32 --use_fast_math";
-			}
-			else {
-				/* sm_3x */
-				arch_flags = "--maxrregcount=32 --use_fast_math";
-			}
+		else if(major == 3) {
+			/* sm_3x */
+			arch_flags = "--maxrregcount=32 --use_fast_math";
 		}
 
 		double starttime = time_dt();
@@ -448,90 +439,118 @@ public:
 		CUarray_format_enum format;
 		size_t dsize = datatype_size(mem.data_type);
 		size_t size = mem.memory_size();
+		bool use_texture = interpolation || use_texture_storage;
 
-		switch(mem.data_type) {
-			case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
-			case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
-			case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
-			case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
-			default: assert(0); return;
-		}
-
-		CUtexref texref = NULL;
-
-		cuda_push_context();
-		cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))
-
-		if(!texref) {
-			cuda_pop_context();
-			return;
-		}
+		if(use_texture) {
 
-		if(interpolation) {
-			CUarray handle = NULL;
-			CUDA_ARRAY_DESCRIPTOR desc;
+			switch(mem.data_type) {
+				case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
+				case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
+				case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
+				case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
+				default: assert(0); return;
+			}
 
-			desc.Width = mem.data_width;
-			desc.Height = mem.data_height;
-			desc.Format = format;
-			desc.NumChannels = mem.data_elements;
+			CUtexref texref = NULL;
 
-			cuda_assert(cuArrayCreate(&handle, &desc))
+			cuda_push_context();
+			cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))
 
-			if(!handle) {
+			if(!texref) {
 				cuda_pop_context();
 				return;
 			}
 
-			if(mem.data_height > 1) {
-				CUDA_MEMCPY2D param;
-				memset(&param, 0, sizeof(param));
-				param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-				param.dstArray = handle;
-				param.srcMemoryType = CU_MEMORYTYPE_HOST;
-				param.srcHost = (void*)mem.data_pointer;
-				param.srcPitch = mem.data_width*dsize*mem.data_elements;
-				param.WidthInBytes = param.srcPitch;
-				param.Height = mem.data_height;
-
-				cuda_assert(cuMemcpy2D(&param))
+			if(interpolation) {
+				CUarray handle = NULL;
+				CUDA_ARRAY_DESCRIPTOR desc;
+
+				desc.Width = mem.data_width;
+				desc.Height = mem.data_height;
+				desc.Format = format;
+				desc.NumChannels = mem.data_elements;
+
+				cuda_assert(cuArrayCreate(&handle, &desc))
+
+				if(!handle) {
+					cuda_pop_context();
+					return;
+				}
+
+				if(mem.data_height > 1) {
+					CUDA_MEMCPY2D param;
+					memset(&param, 0, sizeof(param));
+					param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+					param.dstArray = handle;
+					param.srcMemoryType = CU_MEMORYTYPE_HOST;
+					param.srcHost = (void*)mem.data_pointer;
+					param.srcPitch = mem.data_width*dsize*mem.data_elements;
+					param.WidthInBytes = param.srcPitch;
+					param.Height = mem.data_height;
+
+					cuda_assert(cuMemcpy2D(&param))
+				}
+				else
+					cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size))
+
+				cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT))
+
+				cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR))
+				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES))
+
+				mem.device_pointer = (device_ptr)handle;
+
+				stats.mem_alloc(size);
 			}
-			else
-				cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size))
+			else {
+				cuda_pop_context();
 
-			cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT))
+				mem_alloc(mem, MEM_READ_ONLY);
+				mem_copy_to(mem);
 
-			cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR))
-			cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES))
+				cuda_push_context();
+
+				cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size))
+				cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT))
+				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER))
+			}
 
-			mem.device_pointer = (device_ptr)handle;
+			if(periodic) {
+				cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP))
+				cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP))
+			}
+			else {
+				cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP))
+				cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP))
+			}
+			cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements))
 
-			stats.mem_alloc(size);
+			cuda_pop_context();
 		}
 		else {
-			cuda_pop_context();
-
 			mem_alloc(mem, MEM_READ_ONLY);
 			mem_copy_to(mem);
 
 			cuda_push_context();
 
-			cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size))
-			cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT))
-			cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER))
-		}
+			CUdeviceptr cumem;
+			size_t cubytes;
 
-		if(periodic) {
-			cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP))
-			cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP))
-		}
-		else {
-			cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP))
-			cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP))
-		}
-		cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements))
+			cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, name))
 
-		cuda_pop_context();
+			if(cubytes == 8) {
+				/* 64 bit device pointer */
+				uint64_t ptr = mem.device_pointer;
+				cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes))
+			}
+			else {
+				/* 32 bit device pointer */
+				uint32_t ptr = (uint32_t)mem.device_pointer;
+				cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes))
+			}
+
+			cuda_pop_context();
+		}
 
 		tex_interp_map[mem.device_pointer] = interpolation;
 	}
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index d723df70c89..11c7bc6f099 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -892,8 +892,16 @@ public:
 
 	void tex_free(device_memory& mem)
 	{
-		if(mem.data_pointer)
+		if(mem.device_pointer) {
+			foreach(const MemMap::value_type& value, mem_map) {
+				if(value.second == mem.device_pointer) {
+					mem_map.erase(value.first);
+					break;
+				}
+			}
+
 			mem_free(mem);
+		}
 	}
 
 	size_t global_size_round_up(int group_size, int global_size)
@@ -1086,7 +1094,7 @@ public:
 
 					tile.sample = sample + 1;
 
-					//task->update_progress(tile);
+					task->update_progress(tile);
 				}
 
 				task->release_tile(tile);
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index eaa4e304ebb..56ba0e08743 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -151,36 +151,16 @@ if(WITH_CYCLES_CUDA_BINARIES)
 
 		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}")
 
-		# build flags depending on CUDA version and arch
-		if(CUDA_VERSION LESS 50)
-			# CUDA 4.x
-			if(${arch} MATCHES "sm_1[0-9]")
-				# sm_1x
-				set(cuda_arch_flags "--maxrregcount=24 --opencc-options -OPT:Olimit=0")
-			elseif(${arch} MATCHES "sm_2[0-9]")
-				# sm_2x
-				set(cuda_arch_flags "--maxrregcount=24")
-			else()
-				# sm_3x
-				set(cuda_arch_flags "--maxrregcount=32")
-			endif()
-
-			set(cuda_math_flags "")
-		else()
-			# CUDA 5.x
-			if(${arch} MATCHES "sm_1[0-9]")
-				# sm_1x
-				set(cuda_arch_flags "--maxrregcount=24 --opencc-options -OPT:Olimit=0")
-			elseif(${arch} MATCHES "sm_2[0-9]")
-				# sm_2x
-				set(cuda_arch_flags "--maxrregcount=32")
-			else()
-				# sm_3x
-				set(cuda_arch_flags "--maxrregcount=32")
-			endif()
-
-			set(cuda_math_flags "--use_fast_math")
+		# CUDA 5.x build flags for different archs
+		if(${arch} MATCHES "sm_2[0-9]")
+			# sm_2x
+			set(cuda_arch_flags "--maxrregcount=32")
+		elseif(${arch} MATCHES "sm_3[0-9]")
+			# sm_3x
+			set(cuda_arch_flags "--maxrregcount=32")
 		endif()
+
+		set(cuda_math_flags "--use_fast_math")
 		
 		if(CUDA_VERSION LESS 50 AND ${arch} MATCHES "sm_35")
 			message(WARNING "Can't build kernel for CUDA sm_35 architecture, skipping")
diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript
index 6459c3ed183..a0522d9ba8e 100644
--- a/intern/cycles/kernel/SConscript
+++ b/intern/cycles/kernel/SConscript
@@ -86,33 +86,13 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     for arch in cuda_archs:
         cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch)
 
-		# build flags depending on CUDA version and arch
-        if cuda_version < 50:
-            if arch == "sm_35":
-                print("Can't build kernel for CUDA sm_35 architecture, skipping")
-                continue
-
-            # CUDA 4.x
-            if arch.startswith("sm_1"):
-                # sm_1x
-                cuda_arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0"
-            elif arch.startswith("sm_2"):
-                # sm_2x
-                cuda_arch_flags = "--maxrregcount=24"
-            else:
-                # sm_3x
-                cuda_arch_flags = "--maxrregcount=32"
-        else:
-            # CUDA 5.x
-            if arch.startswith("sm_1"):
-                # sm_1x
-                cuda_arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0 --use_fast_math"
-            elif arch.startswith("sm_2"):
-                # sm_2x
-                cuda_arch_flags = "--maxrregcount=32 --use_fast_math"
-            else:
-                # sm_3x
-                cuda_arch_flags = "--maxrregcount=32 --use_fast_math"
+        # CUDA 5.x build flags for different archs
+        if arch.startswith("sm_2"):
+            # sm_2x
+            cuda_arch_flags = "--maxrregcount=32 --use_fast_math"
+        elif arch.startswith("sm_3"):
+            # sm_3x
+            cuda_arch_flags = "--maxrregcount=32 --use_fast_math"
 
         command = "\"%s\" -arch=%s %s %s \"%s\" -o \"%s\"" % (nvcc, arch, nvcc_flags, cuda_arch_flags, kernel_file, cubin_file)
 
diff --git a/intern/cycles/kernel/kernel_bvh.h b/intern/cycles/kernel/kernel_bvh.h
index 4b01f2eebcd..44a9822c103 100644
--- a/intern/cycles/kernel/kernel_bvh.h
+++ b/intern/cycles/kernel/kernel_bvh.h
@@ -809,11 +809,16 @@ __device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Inters
 #include "kernel_bvh_subsurface.h"
 #endif
 
-
+/* Titan bug workaround: on CUDA builds that use arrays instead of textures, scene_intersect is declared noinline */
+#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
+__device_inline
+#else
+__device_noinline
+#endif
 #ifdef __HAIR__ 
-__device_inline bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, uint *lcg_state, float difl, float extmax)
+bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, uint *lcg_state, float difl, float extmax)
 #else
-__device_inline bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect)
+bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect)
 #endif
 {
 #ifdef __OBJECT_MOTION__
@@ -851,8 +856,14 @@ __device_inline bool scene_intersect(KernelGlobals *kg, const Ray *ray, const ui
 #endif /* __KERNEL_CPU__ */
 }
 
+/* Titan bug workaround: on CUDA builds that use arrays instead of textures, scene_intersect_subsurface is declared noinline */
 #ifdef __SUBSURFACE__
-__device_inline uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
+#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
+__device_inline
+#else
+__device_noinline
+#endif
+uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
 {
 #ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index cb86ce8c4ae..44c2b9effe9 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -57,7 +57,18 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
 
 /* Macros to handle different memory storage on different devices */
 
+/* In order to use full 6GB of memory on Titan cards, use arrays instead
+ * of textures. On earlier cards this seems slower, but on Titan it is
+ * actually slightly faster in tests. */
+#if __CUDA_ARCH__ < 350
+#define __KERNEL_CUDA_TEX_STORAGE__
+#endif
+
+#ifdef __KERNEL_CUDA_TEX_STORAGE__
 #define kernel_tex_fetch(t, index) tex1Dfetch(t, index)
+#else
+#define kernel_tex_fetch(t, index) t[(index)]
+#endif
 #define kernel_tex_image_interp(t, x, y) tex2D(t, x, y)
 
 #define kernel_data __data
diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h
index 8868a335037..721eceabc37 100644
--- a/intern/cycles/kernel/kernel_film.h
+++ b/intern/cycles/kernel/kernel_film.h
@@ -70,8 +70,8 @@ __device void kernel_film_convert_to_half_float(KernelGlobals *kg,
 	/* buffer offset */
 	int index = offset + x + y*stride;
 
-	float4 *in = (float4*)(buffer + index*kernel_data.film.pass_stride);
-	half *out = (half*)rgba + index*4;
+	__global float4 *in = (__global float4*)(buffer + index*kernel_data.film.pass_stride);
+	__global half *out = (__global half*)rgba + index*4;
 	float scale = kernel_data.film.exposure*sample_scale;
 
 	float4_store_half(out, in, scale);
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index ab0a717b592..b5e691eb615 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -66,7 +66,11 @@ typedef struct KernelGlobals {
 __constant__ KernelData __data;
 typedef struct KernelGlobals {} KernelGlobals;
 
+#ifdef __KERNEL_CUDA_TEX_STORAGE__
 #define KERNEL_TEX(type, ttype, name) ttype name;
+#else
+#define KERNEL_TEX(type, ttype, name) const __constant__ __device__ type *name;
+#endif
 #define KERNEL_IMAGE_TEX(type, ttype, name) ttype name;
 #include "kernel_textures.h"
 
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 1afe8480616..5354738d378 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -266,14 +266,9 @@ __device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, int sample, Ray
 			if(sc) {
 				uint lcg_state = lcg_init(*rng + rng_offset + sample*0x68bc21eb);
 
-				if(old_subsurface_scatter_use(&sd)) {
-					old_subsurface_scatter_step(kg, &sd, state.flag, sc, &lcg_state, false);
-				}
-				else {
-					float bssrdf_u, bssrdf_v;
-					path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-					subsurface_scatter_step(kg, &sd, state.flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
-				}
+				float bssrdf_u, bssrdf_v;
+				path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+				subsurface_scatter_step(kg, &sd, state.flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
 
 				state.flag |= PATH_RAY_BSSRDF_ANCESTOR;
 			}
@@ -468,7 +463,9 @@ __device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, R
 
 	float min_ray_pdf = FLT_MAX;
 	float ray_pdf = 0.0f;
+#ifdef __LAMP_MIS__
 	float ray_t = 0.0f;
+#endif
 	PathState state;
 	int rng_offset = PRNG_BASE_NUM;
 #ifdef __CMJ__
@@ -662,41 +659,35 @@ __device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, R
 			if(sc) {
 				uint lcg_state = lcg_init(*rng + rng_offset + sample*0x68bc21eb);
 
-				if(old_subsurface_scatter_use(&sd)) {
-					old_subsurface_scatter_step(kg, &sd, state.flag, sc, &lcg_state, false);
-				}
-				else {
-					ShaderData bssrdf_sd[BSSRDF_MAX_HITS];
-					float bssrdf_u, bssrdf_v;
-					path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-					int num_hits = subsurface_scatter_multi_step(kg, &sd, bssrdf_sd, state.flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
-
-					/* compute lighting with the BSDF closure */
-					for(int hit = 0; hit < num_hits; hit++) {
-						float3 tp = throughput;
-						PathState hit_state = state;
-						Ray hit_ray = ray;
-						float hit_ray_t = ray_t;
-						float hit_ray_pdf = ray_pdf;
-						float hit_min_ray_pdf = min_ray_pdf;
-
-						hit_state.flag |= PATH_RAY_BSSRDF_ANCESTOR;
-						
-						if(kernel_path_integrate_lighting(kg, rng, sample, num_samples, &bssrdf_sd[hit],
-							&tp, &hit_min_ray_pdf, &hit_ray_pdf, &hit_state, rng_offset+PRNG_BOUNCE_NUM, &L, &hit_ray, &hit_ray_t)) {
-							kernel_path_indirect(kg, rng, sample, hit_ray, buffer,
-								tp, num_samples, num_samples,
-								hit_min_ray_pdf, hit_ray_pdf, hit_state, rng_offset+PRNG_BOUNCE_NUM*2, &L);
-
-							/* for render passes, sum and reset indirect light pass variables
-							 * for the next samples */
-							path_radiance_sum_indirect(&L);
-							path_radiance_reset_indirect(&L);
-						}
+				ShaderData bssrdf_sd[BSSRDF_MAX_HITS];
+				float bssrdf_u, bssrdf_v;
+				path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+				int num_hits = subsurface_scatter_multi_step(kg, &sd, bssrdf_sd, state.flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
+
+				/* compute lighting with the BSDF closure */
+				for(int hit = 0; hit < num_hits; hit++) {
+					float3 tp = throughput;
+					PathState hit_state = state;
+					Ray hit_ray = ray;
+					float hit_ray_t = ray_t;
+					float hit_ray_pdf = ray_pdf;
+					float hit_min_ray_pdf = min_ray_pdf;
+
+					hit_state.flag |= PATH_RAY_BSSRDF_ANCESTOR;
+					
+					if(kernel_path_integrate_lighting(kg, rng, sample, num_samples, &bssrdf_sd[hit],
+						&tp, &hit_min_ray_pdf, &hit_ray_pdf, &hit_state, rng_offset+PRNG_BOUNCE_NUM, &L, &hit_ray, &hit_ray_t)) {
+						kernel_path_indirect(kg, rng, sample, hit_ray, buffer,
+							tp, num_samples, num_samples,
+							hit_min_ray_pdf, hit_ray_pdf, hit_state, rng_offset+PRNG_BOUNCE_NUM*2, &L);
+
+						/* for render passes, sum and reset indirect light pass variables
+						 * for the next samples */
+						path_radiance_sum_indirect(&L);
+						path_radiance_reset_indirect(&L);
 					}
-
-					break;
 				}
+				break;
 			}
 		}
 #endif
@@ -1129,17 +1120,6 @@ __device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int
 				/* do subsurface scatter step with copy of shader data, this will
 				 * replace the BSSRDF with a diffuse BSDF closure */
 				for(int j = 0; j < num_samples; j++) {
-					if(old_subsurface_scatter_use(&sd)) {
-						ShaderData bssrdf_sd = sd;
-						old_subsurface_scatter_step(kg, &bssrdf_sd, state.flag, sc, &lcg_state, true);
-
-						/* compute lighting with the BSDF closure */
-						kernel_branched_path_integrate_lighting(kg, rng, sample*num_samples + j,
-							aa_samples*num_samples,
-							&bssrdf_sd, throughput, num_samples_inv,
-							ray_pdf, ray_pdf, state, rng_offset, &L, buffer);
-					}
-					else {
 						ShaderData bssrdf_sd[BSSRDF_MAX_HITS];
 						float bssrdf_u, bssrdf_v;
 						path_rng_2D(kg, &bssrdf_rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
@@ -1151,7 +1131,6 @@ __device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int
 								aa_samples*num_samples,
 								&bssrdf_sd[hit], throughput, num_samples_inv,
 								ray_pdf, ray_pdf, state, rng_offset+PRNG_BOUNCE_NUM, &L, buffer);
-					}
 				}
 
 				state.flag &= ~PATH_RAY_BSSRDF_ANCESTOR;
diff --git a/intern/cycles/kernel/kernel_primitive.h b/intern/cycles/kernel/kernel_primitive.h
index 4a06dff84bf..636cfd06532 100644
--- a/intern/cycles/kernel/kernel_primitive.h
+++ b/intern/cycles/kernel/kernel_primitive.h
@@ -93,7 +93,11 @@ __device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __HAIR__
 	if(sd->segment != ~0)
+#ifdef __DPDU__
 		return normalize(sd->dPdu);
+#else
+		return make_float3(0.0f, 0.0f, 0.0f);
+#endif
 #endif
 
 	/* try to create spherical tangent from generated coordinates */
@@ -108,7 +112,11 @@ __device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 	}
 	else {
 		/* otherwise use surface derivatives */
+#ifdef __DPDU__
 		return normalize(sd->dPdu);
+#else
+		return make_float3(0.0f, 0.0f, 0.0f);
+#endif
 	}
 }
 
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index ee71a0cfcf4..81630caed9a 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -36,15 +36,8 @@ CCL_NAMESPACE_BEGIN
 /* ShaderData setup from incoming ray */
 
 #ifdef __OBJECT_MOTION__
-#if defined(__KERNEL_CUDA_VERSION__) && __KERNEL_CUDA_VERSION__ <= 42
-__device_noinline
-#else
-__device
-#endif
-void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
+__device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
 {
-	/* note that this is a separate non-inlined function to work around crash
-	 * on CUDA sm 2.0, otherwise kernel execution crashes (compiler bug?) */
 	if(sd->flag & SD_OBJECT_MOTION) {
 		sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time);
 		sd->ob_itfm= transform_quick_inverse(sd->ob_tfm);
@@ -56,12 +49,7 @@ void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float tim
 }
 #endif
 
-#if defined(__KERNEL_CUDA_VERSION__) && __KERNEL_CUDA_VERSION__ <= 42
-__device_noinline
-#else
-__device
-#endif
-void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
+__device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
 	const Intersection *isect, const Ray *ray, int bounce)
 {
 #ifdef __INSTANCING__
@@ -249,12 +237,7 @@ __device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderData
 
 /* ShaderData setup from position sampled on mesh */
 
-#if defined(__KERNEL_CUDA_VERSION__) && __KERNEL_CUDA_VERSION__ <= 42
-__device_noinline
-#else
-__device
-#endif
-void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
+__device void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
 	const float3 P, const float3 Ng, const float3 I,
 	int shader, int object, int prim, float u, float v, float t, float time, int bounce, int segment)
 {
diff --git a/intern/cycles/kernel/kernel_sse2.cpp b/intern/cycles/kernel/kernel_sse2.cpp
index 862626d6899..9c69e519dca 100644
--- a/intern/cycles/kernel/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernel_sse2.cpp
@@ -20,7 +20,10 @@
 
 #ifdef WITH_OPTIMIZED_KERNEL
 
+/* SSE optimization disabled for now on 32 bit x86 with GCC-compatible compilers, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
 #define __KERNEL_SSE2__
+#endif
 
 #include "kernel.h"
 #include "kernel_compat_cpu.h"
diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp
index c44098606a5..05877a41b4a 100644
--- a/intern/cycles/kernel/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernel_sse3.cpp
@@ -20,9 +20,12 @@
 
 #ifdef WITH_OPTIMIZED_KERNEL
 
+/* SSE optimization disabled for now on 32 bit x86 with GCC-compatible compilers, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
 #define __KERNEL_SSE2__
 #define __KERNEL_SSE3__
 #define __KERNEL_SSSE3__
+#endif
 
 #include "kernel.h"
 #include "kernel_compat_cpu.h"
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index 4567f2ff0ce..d16b9328bf2 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -403,164 +403,5 @@ __device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd,
 	subsurface_scatter_setup_diffuse_bsdf(sd, eval, (num_hits > 0), N);
 }
 
-
-/* OLD BSSRDF */
-
-__device float old_bssrdf_sample_distance(KernelGlobals *kg, float radius, float refl, float u)
-{
-	int table_offset = kernel_data.bssrdf.table_offset;
-	float r = lookup_table_read_2D(kg, u, refl, table_offset, BSSRDF_RADIUS_TABLE_SIZE, BSSRDF_REFL_TABLE_SIZE);
-
-	return r*radius;
-}
-
-#ifdef BSSRDF_MULTI_EVAL
-__device float old_bssrdf_pdf(KernelGlobals *kg, float radius, float refl, float r)
-{
-	if(r >= radius)
-		return 0.0f;
-
-	/* todo: when we use the real BSSRDF this will need to be divided by the maximum
-	 * radius instead of the average radius */
-	float t = r/radius;
-
-	int table_offset = kernel_data.bssrdf.table_offset + BSSRDF_PDF_TABLE_OFFSET;
-	float pdf = lookup_table_read_2D(kg, t, refl, table_offset, BSSRDF_RADIUS_TABLE_SIZE, BSSRDF_REFL_TABLE_SIZE);
-
-	pdf /= radius;
-
-	return pdf;
-}
-#endif
-
-#ifdef BSSRDF_MULTI_EVAL
-__device float3 old_subsurface_scatter_multi_eval(KernelGlobals *kg, ShaderData *sd, bool hit, float refl, float *r, int num_r, bool all)
-{
-	/* compute pdf */
-	float3 eval_sum = make_float3(0.0f, 0.0f, 0.0f);
-	float pdf_sum = 0.0f;
-	float sample_weight_sum = 0.0f;
-	int num_bssrdf = 0;
-
-	for(int i = 0; i < sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
-		
-		if(CLOSURE_IS_BSSRDF(sc->type)) {
-			float sample_weight = (all)? 1.0f: sc->sample_weight;
-
-			/* compute pdf */
-			float pdf = 1.0f;
-			for(int i = 0; i < num_r; i++)
-				pdf *= old_bssrdf_pdf(kg, sc->data0, refl, r[i]);
-
-			eval_sum += sc->weight*pdf;
-			pdf_sum += sample_weight*pdf;
-
-			sample_weight_sum += sample_weight;
-			num_bssrdf++;
-		}
-	}
-
-	float inv_pdf_sum;
-	
-	if(pdf_sum > 0.0f) {
-		/* in case of branched path integrate we sample all bssrdf's once,
-		 * for path trace we pick one, so adjust pdf for that */
-		if(all)
-			inv_pdf_sum = 1.0f/pdf_sum;
-		else
-			inv_pdf_sum = sample_weight_sum/pdf_sum;
-	}
-	else
-		inv_pdf_sum = 0.0f;
-
-	float3 weight = eval_sum * inv_pdf_sum;
-
-	return weight;
-}
-#endif
-
-/* subsurface scattering step, from a point on the surface to another nearby point on the same object */
-__device void old_subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, int state_flag, ShaderClosure *sc, uint *lcg_state, bool all)
-{
-	float radius = sc->data0;
-	float refl = max(average(sc->weight)*3.0f, 0.0f);
-	float r = 0.0f;
-	bool hit = false;
-	float3 weight = make_float3(1.0f, 1.0f, 1.0f);
-#ifdef BSSRDF_MULTI_EVAL
-	float r_attempts[BSSRDF_MAX_ATTEMPTS];
-#endif
-	int num_attempts;
-
-	/* attempt to find a hit a given number of times before giving up */
-	for(num_attempts = 0; num_attempts < kernel_data.bssrdf.num_attempts; num_attempts++) {
-		/* random numbers for sampling */
-		float u1 = lcg_step_float(lcg_state);
-		float u2 = lcg_step_float(lcg_state);
-		float u3 = lcg_step_float(lcg_state);
-		float u4 = lcg_step_float(lcg_state);
-		float u5 = lcg_step_float(lcg_state);
-
-		r = old_bssrdf_sample_distance(kg, radius, refl, u5);
-#ifdef BSSRDF_MULTI_EVAL
-		r_attempts[num_attempts] = r;
-#endif
-
-		float3 p1 = sd->P + sample_uniform_sphere(u1, u2)*r;
-		float3 p2 = sd->P + sample_uniform_sphere(u3, u4)*r;
-
-		/* create ray */
-		Ray ray;
-		ray.P = p1;
-		ray.D = normalize_len(p2 - p1, &ray.t);
-		ray.dP = sd->dP;
-		ray.dD = differential3_zero();
-		ray.time = sd->time;
-
-		/* intersect with the same object. if multiple intersections are
-		 * found it will randomly pick one of them */
-		Intersection isect;
-		if(scene_intersect_subsurface(kg, &ray, &isect, sd->object, lcg_state, 1) == 0)
-			continue;
-
-		/* setup new shading point */
-		shader_setup_from_subsurface(kg, sd, &isect, &ray);
-
-		hit = true;
-		num_attempts++;
-		break;
-	}
-
-	/* evaluate subsurface scattering closures */
-#ifdef BSSRDF_MULTI_EVAL
-	weight *= old_subsurface_scatter_multi_eval(kg, sd, hit, refl, r_attempts, num_attempts, all);
-#else
-	weight *= sc->weight;
-#endif
-
-	if(!hit)
-		weight = make_float3(0.0f, 0.0f, 0.0f);
-
-	/* optionally blur colors and bump mapping */
-	float3 N = sd->N;
-	subsurface_color_bump_blur(kg, sd, sd, state_flag, &weight, &N);
-
-	/* replace closures with a single diffuse BSDF */
-	subsurface_scatter_setup_diffuse_bsdf(sd, weight, hit, N);
-}
-
-__device bool old_subsurface_scatter_use(ShaderData *sd)
-{
-	for(int i = 0; i < sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
-		
-		if(sc->type == CLOSURE_BSSRDF_COMPATIBLE_ID)
-			return true;
-	}
-
-	return false;
-}
-
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index d039b708bd4..3371c580c71 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -35,12 +35,7 @@ CCL_NAMESPACE_BEGIN
 #define PARTICLE_SIZE 		5
 #define TIME_INVALID		FLT_MAX
 
-#define BSSRDF_RADIUS_TABLE_SIZE	1024
-#define BSSRDF_REFL_TABLE_SIZE		256
-#define BSSRDF_PDF_TABLE_OFFSET		(BSSRDF_RADIUS_TABLE_SIZE*BSSRDF_REFL_TABLE_SIZE)
-#define BSSRDF_LOOKUP_TABLE_SIZE	(BSSRDF_RADIUS_TABLE_SIZE*BSSRDF_REFL_TABLE_SIZE*2)
 #define BSSRDF_MIN_RADIUS			1e-8f
-#define BSSRDF_MAX_ATTEMPTS			8
 #define BSSRDF_MAX_HITS				4
 
 #define BB_DRAPPER				800.0f
@@ -815,12 +810,6 @@ typedef struct KernelCurves {
 	int pad2;
 } KernelCurves;
 
-typedef struct KernelBSSRDF {
-	int table_offset;
-	int num_attempts;
-	int pad1, pad2;
-} KernelBSSRDF;
-
 typedef struct KernelBlackbody {
 	int table_offset;
 	int pad1, pad2, pad3;
@@ -834,7 +823,6 @@ typedef struct KernelData {
 	KernelIntegrator integrator;
 	KernelBVH bvh;
 	KernelCurves curve;
-	KernelBSSRDF bssrdf;
 	KernelBlackbody blackbody;
 } KernelData;
 
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 625ad263f7f..18e8fee4348 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -253,7 +253,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
 						/* disable in case of diffuse ancestor, can't see it well then and
 						 * adds considerably noise due to probabilities of continuing path
 						 * getting lower and lower */
-						if(sc.type != CLOSURE_BSSRDF_COMPATIBLE_ID && (path_flag & PATH_RAY_DIFFUSE_ANCESTOR))
+						if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR)
 							bssrdf->radius = make_float3(0.0f, 0.0f, 0.0f);
 
 						/* create one closure for each color channel */
diff --git a/intern/cycles/kernel/shaders/SConscript b/intern/cycles/kernel/shaders/SConscript
index aad6e23e4d4..8bc1c2206e0 100644
--- a/intern/cycles/kernel/shaders/SConscript
+++ b/intern/cycles/kernel/shaders/SConscript
@@ -57,7 +57,7 @@ if env['WITH_BF_CYCLES_OSL']:
             osl_file = os.path.join(source_dir, f)
             oso_file = os.path.join(build_dir, f.replace('.osl', '.oso'))
 
-            command = "%s -q -O2 -I%s -o %s %s" % (osl_compiler, source_dir, oso_file, osl_file)
+            command = "\"%s\" -q -O2 -I\"%s\" -o \"%s\" \"%s\"" % (osl_compiler, source_dir, oso_file, osl_file)
 
             shaders.Command(oso_file, f, command)
             shaders.Depends(oso_file, [f] + dependencies)
diff --git a/intern/cycles/kernel/shaders/node_environment_texture.osl b/intern/cycles/kernel/shaders/node_environment_texture.osl
index 6b61c689066..136ccdf8b18 100644
--- a/intern/cycles/kernel/shaders/node_environment_texture.osl
+++ b/intern/cycles/kernel/shaders/node_environment_texture.osl
@@ -19,7 +19,7 @@
 
 vector environment_texture_direction_to_equirectangular(vector dir)
 {
-	float u = -atan2(dir[1], dir[0]) / (2.0 * M_PI) + 0.5;
+	float u = -atan2(dir[1], dir[0]) / (M_2PI) + 0.5;
 	float v = atan2(dir[2], hypot(dir[0], dir[1])) / M_PI + 0.5;
 
 	return vector(u, v, 0.0);
diff --git a/intern/cycles/kernel/shaders/node_hair_bsdf.osl b/intern/cycles/kernel/shaders/node_hair_bsdf.osl
index d1d7d0fb6a6..54d4cb67c3b 100644
--- a/intern/cycles/kernel/shaders/node_hair_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_hair_bsdf.osl
@@ -19,8 +19,8 @@
 #include "stdosl.h"
 
 shader node_hair_bsdf(
-    color Color = 0.8,
-    string component = "Reflection",
+	color Color = 0.8,
+	string component = "Reflection",
 	float Offset = 0.0,
 	float RoughnessU = 0.1,
 	float RoughnessV = 1.0,
@@ -28,29 +28,30 @@ shader node_hair_bsdf(
 	output closure color BSDF = 0)
 {
 	float IsStrand;
-	float roughnessh = clamp(RoughnessU, 0.001,1.0);
-	float roughnessv = clamp(RoughnessV, 0.001,1.0);
+	float roughnessh = clamp(RoughnessU, 0.001, 1.0);
+	float roughnessv = clamp(RoughnessV, 0.001, 1.0);
 	getattribute("geom:is_curve", IsStrand);
 
-    if (!IsStrand) {
-		if (backfacing())
-        	BSDF = transparent();
-    	else {
-    		if (component == "Reflection")
-    			BSDF = Color * hair_reflection(Normal, roughnessh, roughnessv, normalize(dPdv), 0.0);
-    		else
-    			BSDF = Color * hair_transmission(Normal, roughnessh, roughnessv, normalize(dPdv), 0.0);
+	if (!IsStrand) {
+		if (backfacing()) {
+			BSDF = transparent();
+		}
+		else {
+			if (component == "Reflection")
+				BSDF = Color * hair_reflection(Normal, roughnessh, roughnessv, normalize(dPdv), 0.0);
+			else
+				BSDF = Color * hair_transmission(Normal, roughnessh, roughnessv, normalize(dPdv), 0.0);
 		}
 	}
-    else {
-		if (backfacing())
-        	BSDF = transparent();
-    	else {
-    		if (component == "Reflection")
-    			BSDF = Color * hair_reflection(Normal, roughnessh, roughnessv, dPdu, -Offset);
-    		else
-    			BSDF = Color * hair_transmission(Normal, roughnessh, roughnessv, dPdu, -Offset);
+	else {
+		if (backfacing()) {
+			BSDF = transparent();
+		}
+		else {
+			if (component == "Reflection")
+				BSDF = Color * hair_reflection(Normal, roughnessh, roughnessv, dPdu, -Offset);
+			else
+				BSDF = Color * hair_transmission(Normal, roughnessh, roughnessv, dPdu, -Offset);
 		}
 	}
 }
-
diff --git a/intern/cycles/kernel/shaders/node_sky_texture.osl b/intern/cycles/kernel/shaders/node_sky_texture.osl
index 3ed791fdc92..85c2dbdb2c2 100644
--- a/intern/cycles/kernel/shaders/node_sky_texture.osl
+++ b/intern/cycles/kernel/shaders/node_sky_texture.osl
@@ -44,8 +44,8 @@ float sky_perez_function(float lam[9], float theta, float gamma)
 }
 
 color sky_radiance_old(normal dir,
-                   float sunphi, float suntheta, color radiance,
-                   float config_x[9], float config_y[9], float config_z[9])
+                       float sunphi, float suntheta, color radiance,
+                       float config_x[9], float config_y[9], float config_z[9])
 {
 	/* convert vector to spherical coordinates */
 	vector spherical = sky_spherical_coordinates(dir);
@@ -76,7 +76,7 @@ float sky_radiance_internal(float config[9], float theta, float gamma)
 	
 	float expM = exp(config[4] * gamma);
 	float rayM = cgamma * cgamma;
-	float mieM = (1.0 + rayM) / pow((1.0 + config[8]*config[8] - 2.0*config[8]*cgamma), 1.5);
+	float mieM = (1.0 + rayM) / pow((1.0 + config[8] * config[8] - 2.0 * config[8] * cgamma), 1.5);
 	float zenith = sqrt(ctheta);
 
 	return (1.0 + config[0] * exp(config[1] / (ctheta + 0.01))) *
@@ -84,8 +84,8 @@ float sky_radiance_internal(float config[9], float theta, float gamma)
 }
 
 color sky_radiance_new(normal dir,
-                   float sunphi, float suntheta, color radiance,
-                   float config_x[9], float config_y[9], float config_z[9])
+                       float sunphi, float suntheta, color radiance,
+                       float config_x[9], float config_y[9], float config_z[9])
 {
 	/* convert vector to spherical coordinates */
 	vector spherical = sky_spherical_coordinates(dir);
@@ -104,7 +104,7 @@ color sky_radiance_new(normal dir,
 	float z = sky_radiance_internal(config_z, theta, gamma) * radiance[2];
 
 	/* convert to RGB and adjust strength */
-	return xyz_to_rgb(x, y, z) * (M_2PI/683);
+	return xyz_to_rgb(x, y, z) * (M_2PI / 683);
 }
 
 shader node_sky_texture(
diff --git a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
index 5c1d1be0ee7..1c0cd74c0be 100644
--- a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
+++ b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
@@ -26,7 +26,7 @@ shader node_subsurface_scattering(
 	normal Normal = N,
 	output closure color BSSRDF = 0)
 {
-	if(Falloff == "Gaussian")
+	if (Falloff == "Gaussian")
 		BSSRDF = Color * bssrdf_gaussian(N, Scale * Radius, TextureBlur);
 	else
 		BSSRDF = Color * bssrdf_cubic(N, Scale * Radius, TextureBlur, Sharpness);
diff --git a/intern/cycles/kernel/shaders/node_ward_bsdf.osl b/intern/cycles/kernel/shaders/node_ward_bsdf.osl
index a21dd24059d..2d360d594f2 100644
--- a/intern/cycles/kernel/shaders/node_ward_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_ward_bsdf.osl
@@ -29,7 +29,7 @@ shader node_ward_bsdf(
 	vector T = Tangent;
 
 	if (Rotation != 0.0)
-		T = rotate(T, Rotation * 2.0 * M_PI, point(0.0, 0.0, 0.0), Normal);
+		T = rotate(T, Rotation * M_2PI, point(0.0, 0.0, 0.0), Normal);
 
 	/* compute roughness */
 	float RoughnessU, RoughnessV;
diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h
index 424ca335903..47c5dc27f2b 100644
--- a/intern/cycles/kernel/shaders/stdosl.h
+++ b/intern/cycles/kernel/shaders/stdosl.h
@@ -134,12 +134,16 @@ normal mod (normal a, float  b) { return a - b*floor(a/b); }
 color  mod (color  a, float  b) { return a - b*floor(a/b); }
 float  mod (float  a, float  b) { return a - b*floor(a/b); }
 PERCOMP2 (min)
+int min (int a, int b) BUILTIN;
 PERCOMP2 (max)
+int max (int a, int b) BUILTIN;
 normal clamp (normal x, normal minval, normal maxval) { return max(min(x,maxval),minval); }
 vector clamp (vector x, vector minval, vector maxval) { return max(min(x,maxval),minval); }
 point  clamp (point x, point minval, point maxval) { return max(min(x,maxval),minval); }
 color  clamp (color x, color minval, color maxval) { return max(min(x,maxval),minval); }
 float  clamp (float x, float minval, float maxval) { return max(min(x,maxval),minval); }
+int    clamp (int x, int minval, int maxval) { return max(min(x,maxval),minval); }
+#if 0
 normal mix (normal x, normal y, normal a) { return x*(1-a) + y*a; }
 normal mix (normal x, normal y, float  a) { return x*(1-a) + y*a; }
 vector mix (vector x, vector y, vector a) { return x*(1-a) + y*a; }
@@ -149,6 +153,17 @@ point  mix (point  x, point  y, float  a) { return x*(1-a) + y*a; }
 color  mix (color  x, color  y, color  a) { return x*(1-a) + y*a; }
 color  mix (color  x, color  y, float  a) { return x*(1-a) + y*a; }
 float  mix (float  x, float  y, float  a) { return x*(1-a) + y*a; }
+#else
+normal mix (normal x, normal y, normal a) BUILTIN;
+normal mix (normal x, normal y, float  a) BUILTIN;
+vector mix (vector x, vector y, vector a) BUILTIN;
+vector mix (vector x, vector y, float  a) BUILTIN;
+point  mix (point  x, point  y, point  a) BUILTIN;
+point  mix (point  x, point  y, float  a) BUILTIN;
+color  mix (color  x, color  y, color  a) BUILTIN;
+color  mix (color  x, color  y, float  a) BUILTIN;
+float  mix (float  x, float  y, float  a) BUILTIN;
+#endif
 int isnan (float x) BUILTIN;
 int isinf (float x) BUILTIN;
 int isfinite (float x) BUILTIN;
@@ -418,8 +433,8 @@ int startswith (string s, string prefix) BUILTIN;
 int endswith (string s, string suffix) BUILTIN;
 string substr (string s, int start, int len) BUILTIN;
 string substr (string s, int start) { return substr (s, start, strlen(s)); }
-float strtof (string str) BUILTIN;
-int strtoi (string str) BUILTIN;
+float stof (string str) BUILTIN;
+int stoi (string str) BUILTIN;
 
 // Define concat in terms of shorter concat
 string concat (string a, string b, string c) {
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 6d9c4e215e6..0d4716ab078 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -378,7 +378,6 @@ __device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *st
 #endif
 
 #ifdef __SUBSURFACE__
-		case CLOSURE_BSSRDF_COMPATIBLE_ID:
 		case CLOSURE_BSSRDF_CUBIC_ID:
 		case CLOSURE_BSSRDF_GAUSSIAN_ID: {
 			ShaderClosure *sc = &sd->closure[sd->num_closure];
@@ -388,7 +387,7 @@ __device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *st
 			/* disable in case of diffuse ancestor, can't see it well then and
 			 * adds considerably noise due to probabilities of continuing path
 			 * getting lower and lower */
-			if(type != CLOSURE_BSSRDF_COMPATIBLE_ID && (path_flag & PATH_RAY_DIFFUSE_ANCESTOR))
+			if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR)
 				param1 = 0.0f;
 
 			if(sample_weight > 1e-5f && sd->num_closure+2 < MAX_CLOSURE) {
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index 50daf159f26..abd63530d63 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -377,7 +377,6 @@ typedef enum ClosureType {
 	CLOSURE_BSDF_TRANSPARENT_ID,
 
 	/* BSSRDF */
-	CLOSURE_BSSRDF_COMPATIBLE_ID,
 	CLOSURE_BSSRDF_CUBIC_ID,
 	CLOSURE_BSSRDF_GAUSSIAN_ID,
 
@@ -402,7 +401,7 @@ typedef enum ClosureType {
 #define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
 #define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
 #define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID)
-#define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_COMPATIBLE_ID && type <= CLOSURE_BSSRDF_GAUSSIAN_ID)
+#define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_GAUSSIAN_ID)
 #define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_ISOTROPIC_ID)
 #define CLOSURE_IS_EMISSION(type) (type == CLOSURE_EMISSION_ID)
 #define CLOSURE_IS_HOLDOUT(type) (type == CLOSURE_HOLDOUT_ID)
diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt
index dbbfe2ee7c9..7d00ed92164 100644
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -18,7 +18,6 @@ set(SRC
 	background.cpp
 	blackbody.cpp
 	buffers.cpp
-	bssrdf.cpp
 	camera.cpp
 	film.cpp
 	graph.cpp
@@ -47,7 +46,6 @@ set(SRC_HEADERS
 	background.h
 	blackbody.h
 	buffers.h
-	bssrdf.h
 	camera.h
 	film.h
 	graph.h
diff --git a/intern/cycles/render/bssrdf.cpp b/intern/cycles/render/bssrdf.cpp
deleted file mode 100644
index 69cab5e1824..00000000000
--- a/intern/cycles/render/bssrdf.cpp
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License
- */
-
-#include "bssrdf.h"
-
-#include "util_algorithm.h"
-#include "util_math.h"
-#include "util_types.h"
-
-#include "kernel_types.h"
-#include "kernel_montecarlo.h"
-
-CCL_NAMESPACE_BEGIN
-
-static float bssrdf_cubic(float ld, float r)
-{
-	if(ld == 0.0f)
-		return (r == 0.0f)? 1.0f: 0.0f;
-
-	return powf(ld - min(r, ld), 3.0f) * 4.0f/powf(ld, 4.0f);
-}
-
-/* Cumulative density function utilities */
-
-static float cdf_lookup_inverse(const vector<float>& table, float2 range, float x)
-{
-	int index = upper_bound(table.begin(), table.end(), x) - table.begin();
-
-	if(index == 0)
-		return range[0];
-	else if(index == table.size())
-		return range[1];
-	else
-		index--;
-	
-	float t = (x - table[index])/(table[index+1] - table[index]);
-	float y = ((index + t)/(table.size() - 1));
-
-	return y*(range[1] - range[0]) + range[0];
-}
-
-static void cdf_invert(vector<float>& to, float2 to_range, const vector<float>& from, float2 from_range)
-{
-	float step = 1.0f/(float)(to.size() - 1);
-
-	for(int i = 0; i < to.size(); i++) {
-		float x = (i*step)*(from_range[1] - from_range[0]) + from_range[0];
-		to[i] = cdf_lookup_inverse(from, to_range, x);
-	}
-}
-
-/* BSSRDF */
-
-static void bssrdf_lookup_table_create(float ld, vector<float>& sample_table, vector<float>& pdf_table)
-{
-	const int size = BSSRDF_RADIUS_TABLE_SIZE;
-	vector<float> cdf(size);
-	vector<float> pdf(size);
-	float step = 1.0f/(float)(size - 1);
-	float max_radius = ld;
-	float pdf_sum = 0.0f;
-
-	/* compute the probability density function */
-	for(int i = 0; i < pdf.size(); i++) {
-		float x = (i*step)*max_radius;
-		pdf[i] = bssrdf_cubic(ld, x);
-		pdf_sum += pdf[i];
-	}
-
-	/* adjust for area covered by each distance */
-	for(int i = 0; i < pdf.size(); i++) {
-		float x = (i*step)*max_radius;
-		pdf[i] *= M_2PI_F*x;
-	}
-
-	/* normalize pdf, we multiply in reflectance later */
-	if(pdf_sum > 0.0f)
-		for(int i = 0; i < pdf.size(); i++)
-			pdf[i] /= pdf_sum;
-
-	/* sum to account for sampling which uses overlapping sphere */
-	for(int i = pdf.size() - 2; i >= 0; i--)
-		pdf[i] = pdf[i] + pdf[i+1];
-
-	/* compute the cumulative density function */
-	cdf[0] = 0.0f;
-
-	for(int i = 1; i < size; i++)
-		cdf[i] = cdf[i-1] + 0.5f*(pdf[i-1] + pdf[i])*step*max_radius;
-	
-	/* invert cumulative density function for importance sampling */
-	float2 cdf_range = make_float2(0.0f, cdf[size - 1]);
-	float2 table_range = make_float2(0.0f, max_radius);
-
-	cdf_invert(sample_table, table_range, cdf, cdf_range);
-
-	/* copy pdf table */
-	for(int i = 0; i < pdf.size(); i++)
-		pdf_table[i] = pdf[i];
-}
-
-void bssrdf_table_build(vector<float>& table)
-{
-	vector<float> sample_table(BSSRDF_RADIUS_TABLE_SIZE);
-	vector<float> pdf_table(BSSRDF_RADIUS_TABLE_SIZE);
-
-	table.resize(BSSRDF_LOOKUP_TABLE_SIZE);
-
-	/* create a 2D lookup table, for reflection x sample radius */
-	for(int i = 0; i < BSSRDF_REFL_TABLE_SIZE; i++) {
-		float radius = 1.0f;
-
-		bssrdf_lookup_table_create(radius, sample_table, pdf_table);
-
-		memcpy(&table[i*BSSRDF_RADIUS_TABLE_SIZE], &sample_table[0], BSSRDF_RADIUS_TABLE_SIZE*sizeof(float));
-		memcpy(&table[BSSRDF_PDF_TABLE_OFFSET + i*BSSRDF_RADIUS_TABLE_SIZE], &pdf_table[0], BSSRDF_RADIUS_TABLE_SIZE*sizeof(float));
-	}
-}
-
-CCL_NAMESPACE_END
-
diff --git a/intern/cycles/render/bssrdf.h b/intern/cycles/render/bssrdf.h
deleted file mode 100644
index b8f0d44bc34..00000000000
--- a/intern/cycles/render/bssrdf.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License
- */
-
-#ifndef __BSSRDF_H__
-#define __BSSRDF_H__
-
-#include "util_vector.h"
-
-CCL_NAMESPACE_BEGIN
-
-void bssrdf_table_build(vector<float>& table);
-
-CCL_NAMESPACE_END
-
-#endif /* __BSSRDF_H__ */
-
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index 70cb5613e61..621d52bbbbf 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -20,6 +20,7 @@
 #include "osl.h"
 #include "sky_model.h"
 
+#include "util_foreach.h"
 #include "util_transform.h"
 
 CCL_NAMESPACE_BEGIN
@@ -41,6 +42,8 @@ TextureMapping::TextureMapping()
 	y_mapping = Y;
 	z_mapping = Z;
 
+	type = TEXTURE;
+
 	projection = FLAT;
 }
 
@@ -54,12 +57,52 @@ Transform TextureMapping::compute_transform()
 		mmat[1][y_mapping-1] = 1.0f;
 	if(z_mapping != NONE)
 		mmat[2][z_mapping-1] = 1.0f;
+	
+	float3 scale_clamped = scale;
 
-	Transform smat = transform_scale(scale);
+	if(type == TEXTURE || type == NORMAL) {
+		/* keep matrix invertible */
+		if(fabsf(scale.x) < 1e-5f)
+			scale_clamped.x = signf(scale.x)*1e-5f;
+		if(fabsf(scale.y) < 1e-5f)
+			scale_clamped.y = signf(scale.y)*1e-5f;
+		if(fabsf(scale.z) < 1e-5f)
+			scale_clamped.z = signf(scale.z)*1e-5f;
+	}
+	
+	Transform smat = transform_scale(scale_clamped);
 	Transform rmat = transform_euler(rotation);
 	Transform tmat = transform_translate(translation);
 
-	return tmat*rmat*smat*mmat;
+	Transform mat;
+
+	switch(type) {
+		case TEXTURE:
+			/* inverse transform on texture coordinate gives
+			 * forward transform on texture */
+			mat = tmat*rmat*smat;
+			mat = transform_inverse(mat);
+			break;
+		case POINT:
+			/* full transform */
+			mat = tmat*rmat*smat;
+			break;
+		case VECTOR:
+			/* no translation for vectors */
+			mat = rmat*smat;
+			break;
+		case NORMAL:
+			/* no translation for normals, and inverse transpose */
+			mat = rmat*smat;
+			mat = transform_inverse(mat);
+			mat = transform_transpose(mat);
+			break;
+	}
+
+	/* projection last */
+	mat = mat*mmat;
+
+	return mat;
 }
 
 bool TextureMapping::skip()
@@ -97,6 +140,11 @@ void TextureMapping::compile(SVMCompiler& compiler, int offset_in, int offset_ou
 		compiler.add_node(float3_to_float4(min));
 		compiler.add_node(float3_to_float4(max));
 	}
+
+	if(type == NORMAL) {
+		compiler.add_node(NODE_VECTOR_MATH, NODE_VECTOR_MATH_NORMALIZE, offset_out, offset_out);
+		compiler.add_node(NODE_VECTOR_MATH, SVM_STACK_INVALID, offset_out);
+	}
 }
 
 void TextureMapping::compile(OSLCompiler &compiler)
@@ -472,6 +520,9 @@ static void sky_texture_precompute_new(SunSky *sunsky, float3 dir, float turbidi
 	float theta = spherical.x;
 	float phi = spherical.y;
 	
+	/* Clamp Turbidity */
+	turbidity = clamp(turbidity, 0.0f, 10.0f); 
+	
 	/* Clamp to Horizon */
 	theta = clamp(theta, 0.0f, M_PI_2_F); 
 
@@ -3677,6 +3728,26 @@ void OSLScriptNode::compile(SVMCompiler& compiler)
 
 void OSLScriptNode::compile(OSLCompiler& compiler)
 {
+	/* XXX fix for #36790:
+	 * point and normal parameters are reflected as generic SOCK_VECTOR sockets on
+	 * the node, so unlinked vector socket values are passed explicitly as both
+	 * point and normal here, otherwise OSL rejects them due to mismatching type.
+	 */
+	foreach(ShaderInput *input, this->inputs) {
+		if(!input->link) {
+			/* no need for compatible_name here, OSL parameter names are always unique */
+			string param_name(input->name);
+			switch(input->type) {
+				case SHADER_SOCKET_VECTOR:
+					compiler.parameter_point(param_name.c_str(), input->value);
+					compiler.parameter_normal(param_name.c_str(), input->value);
+					break;
+				default:
+					break;
+			}
+		}
+	}
+
 	if(!filepath.empty())
 		compiler.add(this, filepath.c_str(), true);
 	else
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index d58c6633a41..430c37158f4 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -43,6 +43,9 @@ public:
 	float3 min, max;
 	bool use_minmax;
 
+	enum Type { POINT = 0, TEXTURE = 1, VECTOR = 2, NORMAL = 3 };
+	Type type;
+
 	enum Mapping { NONE = 0, X = 1, Y = 2, Z = 3 };
 	Mapping x_mapping, y_mapping, z_mapping;
 
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index 385395e3752..26af60572f6 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -15,7 +15,6 @@
  */
 
 #include "background.h"
-#include "bssrdf.h"
 #include "blackbody.h"
 #include "device.h"
 #include "graph.h"
@@ -127,7 +126,6 @@ void Shader::tag_used(Scene *scene)
 ShaderManager::ShaderManager()
 {
 	need_update = true;
-	bssrdf_table_offset = TABLE_OFFSET_INVALID;
 	blackbody_table_offset = TABLE_OFFSET_INVALID;
 }
 
@@ -254,23 +252,6 @@ void ShaderManager::device_update_common(Device *device, DeviceScene *dscene, Sc
 
 	device->tex_alloc("__shader_flag", dscene->shader_flag);
 
-	/* bssrdf lookup table */
-	KernelBSSRDF *kbssrdf = &dscene->data.bssrdf;
-
-	if(has_surface_bssrdf && bssrdf_table_offset == TABLE_OFFSET_INVALID) {
-		vector<float> table;
-
-		bssrdf_table_build(table);
-		bssrdf_table_offset = scene->lookup_tables->add_table(dscene, table);
-
-		kbssrdf->table_offset = (int)bssrdf_table_offset;
-		kbssrdf->num_attempts = BSSRDF_MAX_ATTEMPTS;
-	}
-	else if(!has_surface_bssrdf && bssrdf_table_offset != TABLE_OFFSET_INVALID) {
-		scene->lookup_tables->remove_table(bssrdf_table_offset);
-		bssrdf_table_offset = TABLE_OFFSET_INVALID;
-	}
-
 	/* blackbody lookup table */
 	KernelBlackbody *kblackbody = &dscene->data.blackbody;
 	
@@ -289,11 +270,6 @@ void ShaderManager::device_update_common(Device *device, DeviceScene *dscene, Sc
 
 void ShaderManager::device_free_common(Device *device, DeviceScene *dscene, Scene *scene)
 {
-	if(bssrdf_table_offset != TABLE_OFFSET_INVALID) {
-		scene->lookup_tables->remove_table(bssrdf_table_offset);
-		bssrdf_table_offset = TABLE_OFFSET_INVALID;
-	}
-
 	if(blackbody_table_offset != TABLE_OFFSET_INVALID) {
 		scene->lookup_tables->remove_table(blackbody_table_offset);
 		blackbody_table_offset = TABLE_OFFSET_INVALID;
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index a58d6955da0..6869a651b46 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -141,7 +141,6 @@ protected:
 	typedef unordered_map<ustring, uint, ustringHash> AttributeIDMap;
 	AttributeIDMap unique_attribute_id;
 
-	size_t bssrdf_table_offset;
 	size_t blackbody_table_offset;
 };
 
diff --git a/intern/cycles/util/util_opengl.h b/intern/cycles/util/util_opengl.h
index 07a296ed1e6..04a3e039c9d 100644
--- a/intern/cycles/util/util_opengl.h
+++ b/intern/cycles/util/util_opengl.h
@@ -20,12 +20,7 @@
 /* OpenGL header includes, used everywhere we use OpenGL, to deal with
  * platform differences in one central place. */
 
-#ifdef __APPLE__
-#include <OpenGL/gl.h>
-#include <OpenGL/glu.h>
-#else
 #include <GL/glew.h>
-#endif
 
 #endif /* __UTIL_OPENGL_H__ */
 
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index f48fd1e124b..a8f514864db 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -545,10 +545,7 @@ template<size_t i0, size_t i1, size_t i2, size_t i3> __device_inline const __m12
 
 #ifdef __KERNEL_OPENCL__
 
-__device_inline void float4_store_half(half *h, const float4 *f, float scale)
-{
-	vstore_half4(*f * scale, 0, h);
-}
+#define float4_store_half(h, f, scale) vstore_half4(*(f) * (scale), 0, (h)) /* scales *f and stores it as four halfs at h; no trailing ';' so it acts as one call expression */
 
 #else